{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 23583, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012721027859051011, "ewc_loss": 0.0, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 0.0, "grad_norm": 4.835746765136719, "learning_rate": 0.0, "loss": 0.7982, "mean_token_accuracy": 0.7762961387634277, "num_tokens": 38493.0, "step": 1 }, { "epoch": 0.00025442055718102023, "ewc_loss": 0.0, "ewc_loss_diag": 0.0, "ewc_loss_parallel": 0.0, "grad_norm": 4.588861465454102, "learning_rate": 4.2390843577787196e-10, "loss": 0.8329, "mean_token_accuracy": 0.765798807144165, "num_tokens": 80419.0, "step": 2 }, { "epoch": 0.0003816308357715303, "ewc_loss": 1.6478563297084975e-15, "ewc_loss_diag": 6.437450399132683e-19, "ewc_loss_parallel": 1.002539180101513e-17, "grad_norm": 4.725566864013672, "learning_rate": 8.478168715557439e-10, "loss": 0.7225, "mean_token_accuracy": 0.7960126996040344, "num_tokens": 118717.0, "step": 3 }, { "epoch": 0.0005088411143620405, "ewc_loss": 1.0313229220279552e-13, "ewc_loss_diag": 6.245004513516506e-17, "ewc_loss_parallel": 4.051571668443112e-16, "grad_norm": 5.365752696990967, "learning_rate": 1.271725307333616e-09, "loss": 0.8139, "mean_token_accuracy": 0.7711055278778076, "num_tokens": 150155.0, "step": 4 }, { "epoch": 0.0006360513929525506, "ewc_loss": 2.0034015679043504e-12, "ewc_loss_diag": 1.6930901125533637e-15, "ewc_loss_parallel": 3.123098144436646e-15, "grad_norm": 4.307570934295654, "learning_rate": 1.6956337431114878e-09, "loss": 0.7919, "mean_token_accuracy": 0.7745044231414795, "num_tokens": 193616.0, "step": 5 }, { "epoch": 0.0007632616715430606, "ewc_loss": 7.619208215736695e-12, "ewc_loss_diag": 5.9396931817445875e-15, "ewc_loss_parallel": 1.6790712090565367e-14, "grad_norm": 5.211024284362793, "learning_rate": 2.1195421788893596e-09, "loss": 0.7896, "mean_token_accuracy": 0.7785345911979675, "num_tokens": 227640.0, "step": 6 }, { "epoch": 0.0008904719501335708, "ewc_loss": 1.5037488615421424e-11, "ewc_loss_diag": 1.176836406102666e-14, "ewc_loss_parallel": 3.270900057424954e-14, "grad_norm": 4.83239221572876, "learning_rate": 2.543450614667232e-09, "loss": 0.816, "mean_token_accuracy": 0.7750711441040039, "num_tokens": 265114.0, "step": 7 }, { "epoch": 0.001017682228724081, "ewc_loss": 5.359196419973955e-11, "ewc_loss_diag": 3.885780586188048e-14, "ewc_loss_parallel": 1.4711066295258063e-13, "grad_norm": 4.940354824066162, "learning_rate": 2.967359050445104e-09, "loss": 0.7584, "mean_token_accuracy": 0.7891727089881897, "num_tokens": 299865.0, "step": 8 }, { "epoch": 0.001144892507314591, "ewc_loss": 8.067761042562793e-11, "ewc_loss_diag": 5.773159728050814e-14, "ewc_loss_parallel": 2.292469599568525e-13, "grad_norm": 4.569688320159912, "learning_rate": 3.3912674862229757e-09, "loss": 0.814, "mean_token_accuracy": 0.7740728855133057, "num_tokens": 342063.0, "step": 9 }, { "epoch": 0.0012721027859051012, "ewc_loss": 1.963439688612567e-10, "ewc_loss_diag": 1.5276668818842154e-13, "ewc_loss_parallel": 4.3548858638144117e-13, "grad_norm": 5.443160533905029, "learning_rate": 3.815175922000847e-09, "loss": 0.8639, "mean_token_accuracy": 0.7636497020721436, "num_tokens": 374864.0, "step": 10 }, { "epoch": 0.0013993130644956112, "ewc_loss": 3.968429751477487e-10, "ewc_loss_diag": 2.682298827494378e-13, "ewc_loss_parallel": 1.2945153846435908e-12, "grad_norm": 4.302614688873291, "learning_rate": 4.239084357778719e-09, "loss": 0.7765, "mean_token_accuracy": 0.7755075097084045, "num_tokens": 416605.0, "step": 11 }, { "epoch": 0.0015265233430861213, "ewc_loss": 5.478692499671922e-10, "ewc_loss_diag": 3.7481129311345285e-13, "ewc_loss_parallel": 1.7315745191870402e-12, "grad_norm": 5.463967800140381, "learning_rate": 4.662992793556591e-09, "loss": 0.8345, "mean_token_accuracy": 0.7694711685180664, "num_tokens": 448798.0, "step": 12 }, { "epoch": 0.0016537336216766315, "ewc_loss": 6.765017435128584e-10, "ewc_loss_diag": 4.547473508864641e-13, "ewc_loss_parallel": 2.2175436400345694e-12, "grad_norm": 5.455956935882568, "learning_rate": 5.086901229334464e-09, "loss": 0.8738, "mean_token_accuracy": 0.7540663480758667, "num_tokens": 480084.0, "step": 13 }, { "epoch": 0.0017809439002671415, "ewc_loss": 2.2489878848119815e-09, "ewc_loss_diag": 1.5631940186722204e-12, "ewc_loss_parallel": 6.846569752111398e-12, "grad_norm": 4.402233600616455, "learning_rate": 5.510809665112336e-09, "loss": 0.8057, "mean_token_accuracy": 0.779091477394104, "num_tokens": 524543.0, "step": 14 }, { "epoch": 0.0019081541788576518, "ewc_loss": 3.1009907974777207e-09, "ewc_loss_diag": 2.0605739337042905e-12, "ewc_loss_parallel": 1.0346188315801808e-11, "grad_norm": 4.790429592132568, "learning_rate": 5.934718100890208e-09, "loss": 0.7519, "mean_token_accuracy": 0.7888119220733643, "num_tokens": 563314.0, "step": 15 }, { "epoch": 0.002035364457448162, "ewc_loss": 3.56263862855144e-09, "ewc_loss_diag": 2.3163693185779266e-12, "ewc_loss_parallel": 1.2488841558133679e-11, "grad_norm": 5.0517897605896, "learning_rate": 6.3586265366680796e-09, "loss": 0.8422, "mean_token_accuracy": 0.7628829479217529, "num_tokens": 598421.0, "step": 16 }, { "epoch": 0.002162574736038672, "ewc_loss": 4.419815624601142e-09, "ewc_loss_diag": 2.8563817977556027e-12, "ewc_loss_parallel": 1.567640461885844e-11, "grad_norm": 5.027663230895996, "learning_rate": 6.782534972445951e-09, "loss": 0.8318, "mean_token_accuracy": 0.7628101110458374, "num_tokens": 634690.0, "step": 17 }, { "epoch": 0.002289785014629182, "ewc_loss": 5.183095730387777e-09, "ewc_loss_diag": 3.382183422218077e-12, "ewc_loss_parallel": 1.8070513835288793e-11, "grad_norm": 5.131458759307861, "learning_rate": 7.206443408223823e-09, "loss": 0.8691, "mean_token_accuracy": 0.7594257593154907, "num_tokens": 674653.0, "step": 18 }, { "epoch": 0.0024169952932196924, "ewc_loss": 1.7246259531589203e-08, "ewc_loss_diag": 1.2505552149377763e-11, "ewc_loss_parallel": 4.731612324171408e-11, "grad_norm": 5.055959701538086, "learning_rate": 7.630351844001695e-09, "loss": 0.8003, "mean_token_accuracy": 0.7776939868927002, "num_tokens": 708238.0, "step": 19 }, { "epoch": 0.0025442055718102024, "ewc_loss": 2.171587709653977e-08, "ewc_loss_diag": 1.4551915228366852e-11, "ewc_loss_parallel": 7.163962478795582e-11, "grad_norm": 4.6268415451049805, "learning_rate": 8.054260279779567e-09, "loss": 0.7826, "mean_token_accuracy": 0.7786588668823242, "num_tokens": 749312.0, "step": 20 }, { "epoch": 0.0026714158504007124, "ewc_loss": 2.5578602347309243e-08, "ewc_loss_diag": 1.6825651982799172e-11, "ewc_loss_parallel": 8.6983809044483e-11, "grad_norm": 5.408430576324463, "learning_rate": 8.478168715557438e-09, "loss": 0.8526, "mean_token_accuracy": 0.7665163278579712, "num_tokens": 783532.0, "step": 21 }, { "epoch": 0.0027986261289912225, "ewc_loss": 2.799957776744577e-08, "ewc_loss_diag": 1.8189894035458565e-11, "ewc_loss_parallel": 9.838787423666062e-11, "grad_norm": 5.067831516265869, "learning_rate": 8.902077151335311e-09, "loss": 0.8301, "mean_token_accuracy": 0.7737022638320923, "num_tokens": 817429.0, "step": 22 }, { "epoch": 0.0029258364075817325, "ewc_loss": 3.3480525019058405e-08, "ewc_loss_diag": 2.1941559680271894e-11, "ewc_loss_parallel": 1.1594446236440348e-10, "grad_norm": 4.837804317474365, "learning_rate": 9.325985587113182e-09, "loss": 0.7639, "mean_token_accuracy": 0.7879300713539124, "num_tokens": 853964.0, "step": 23 }, { "epoch": 0.0030530466861722425, "ewc_loss": 3.685667593344988e-08, "ewc_loss_diag": 2.3874235921539366e-11, "ewc_loss_parallel": 1.2991535625062056e-10, "grad_norm": 5.457793712615967, "learning_rate": 9.749894022891054e-09, "loss": 0.8277, "mean_token_accuracy": 0.7672185301780701, "num_tokens": 885070.0, "step": 24 }, { "epoch": 0.003180256964762753, "ewc_loss": 4.065368841565942e-08, "ewc_loss_diag": 2.637534635141492e-11, "ewc_loss_parallel": 1.4227412303835507e-10, "grad_norm": 4.817604064941406, "learning_rate": 1.0173802458668929e-08, "loss": 0.8797, "mean_token_accuracy": 0.7513183355331421, "num_tokens": 926893.0, "step": 25 }, { "epoch": 0.003307467243353263, "ewc_loss": 1.1802295318830147e-07, "ewc_loss_diag": 8.776623872108757e-11, "ewc_loss_parallel": 3.0478636281472404e-10, "grad_norm": 4.870096206665039, "learning_rate": 1.05977108944468e-08, "loss": 0.8161, "mean_token_accuracy": 0.767782986164093, "num_tokens": 964773.0, "step": 26 }, { "epoch": 0.003434677521943773, "ewc_loss": 1.6085290610590164e-07, "ewc_loss_diag": 1.1459633242338896e-10, "ewc_loss_parallel": 4.630023309193376e-10, "grad_norm": 4.725923538208008, "learning_rate": 1.1021619330224672e-08, "loss": 0.7688, "mean_token_accuracy": 0.7854713201522827, "num_tokens": 1002725.0, "step": 27 }, { "epoch": 0.003561887800534283, "ewc_loss": 1.7862817003333475e-07, "ewc_loss_diag": 1.2187229003757238e-10, "ewc_loss_parallel": 5.662492141844666e-10, "grad_norm": 4.972189903259277, "learning_rate": 1.1445527766002543e-08, "loss": 0.8731, "mean_token_accuracy": 0.75742506980896, "num_tokens": 1040296.0, "step": 28 }, { "epoch": 0.003689098079124793, "ewc_loss": 1.985199418186312e-07, "ewc_loss_diag": 1.3369572116062045e-10, "ewc_loss_parallel": 6.440949440467136e-10, "grad_norm": 4.423267841339111, "learning_rate": 1.1869436201780416e-08, "loss": 0.7654, "mean_token_accuracy": 0.7856889963150024, "num_tokens": 1081711.0, "step": 29 }, { "epoch": 0.0038163083577153036, "ewc_loss": 2.1003984329581726e-07, "ewc_loss_diag": 1.4006218407303095e-10, "ewc_loss_parallel": 7.034145488304944e-10, "grad_norm": 4.797379493713379, "learning_rate": 1.2293344637558287e-08, "loss": 0.8121, "mean_token_accuracy": 0.7750877737998962, "num_tokens": 1120556.0, "step": 30 }, { "epoch": 0.003943518636305814, "ewc_loss": 2.3189387832189823e-07, "ewc_loss_diag": 1.5552359400317073e-10, "ewc_loss_parallel": 7.636300480839964e-10, "grad_norm": 4.761314392089844, "learning_rate": 1.2717253073336159e-08, "loss": 0.764, "mean_token_accuracy": 0.789752185344696, "num_tokens": 1157723.0, "step": 31 }, { "epoch": 0.004070728914896324, "ewc_loss": 2.557987954787677e-07, "ewc_loss_diag": 1.709850039333105e-10, "ewc_loss_parallel": 8.44354253182189e-10, "grad_norm": 4.665714740753174, "learning_rate": 1.314116150911403e-08, "loss": 0.8102, "mean_token_accuracy": 0.7741343379020691, "num_tokens": 1197879.0, "step": 32 }, { "epoch": 0.004197939193486834, "ewc_loss": 2.723796228565334e-07, "ewc_loss_diag": 1.8189894035458565e-10, "ewc_loss_parallel": 9.077170681770497e-10, "grad_norm": 4.747186183929443, "learning_rate": 1.3565069944891903e-08, "loss": 0.8107, "mean_token_accuracy": 0.774878203868866, "num_tokens": 1237342.0, "step": 33 }, { "epoch": 0.004325149472077344, "ewc_loss": 2.893575867801701e-07, "ewc_loss_diag": 1.9281287677586079e-10, "ewc_loss_parallel": 9.657381561112288e-10, "grad_norm": 4.493772506713867, "learning_rate": 1.3988978380669775e-08, "loss": 0.7822, "mean_token_accuracy": 0.7825630903244019, "num_tokens": 1280197.0, "step": 34 }, { "epoch": 0.004452359750667854, "ewc_loss": 3.3730742643456324e-07, "ewc_loss_diag": 2.3283064365386963e-10, "ewc_loss_parallel": 1.0447679388292386e-09, "grad_norm": 4.626460075378418, "learning_rate": 1.4412886816447646e-08, "loss": 0.7642, "mean_token_accuracy": 0.7847480773925781, "num_tokens": 1318625.0, "step": 35 }, { "epoch": 0.004579570029258364, "ewc_loss": 8.404389291172265e-07, "ewc_loss_diag": 6.366462912410498e-10, "ewc_loss_parallel": 2.0341428541570394e-09, "grad_norm": 4.774989128112793, "learning_rate": 1.4836795252225519e-08, "loss": 0.8217, "mean_token_accuracy": 0.7725617289543152, "num_tokens": 1356868.0, "step": 36 }, { "epoch": 0.004706780307848874, "ewc_loss": 1.0913769301623688e-06, "ewc_loss_diag": 7.894414011389017e-10, "ewc_loss_parallel": 3.016154215274014e-09, "grad_norm": 4.814386367797852, "learning_rate": 1.526070368800339e-08, "loss": 0.7936, "mean_token_accuracy": 0.7808494567871094, "num_tokens": 1394696.0, "step": 37 }, { "epoch": 0.004833990586439385, "ewc_loss": 1.2552857242553728e-06, "ewc_loss_diag": 8.87666828930378e-10, "ewc_loss_parallel": 3.6866669717028344e-09, "grad_norm": 4.357654094696045, "learning_rate": 1.5684612123781262e-08, "loss": 0.7826, "mean_token_accuracy": 0.7822824716567993, "num_tokens": 1438738.0, "step": 38 }, { "epoch": 0.004961200865029895, "ewc_loss": 1.3515507362171775e-06, "ewc_loss_diag": 9.38598532229662e-10, "ewc_loss_parallel": 4.127776342954803e-09, "grad_norm": 4.716159343719482, "learning_rate": 1.6108520559559135e-08, "loss": 0.7483, "mean_token_accuracy": 0.7926405072212219, "num_tokens": 1475089.0, "step": 39 }, { "epoch": 0.005088411143620405, "ewc_loss": 1.432386738997593e-06, "ewc_loss_diag": 9.822542779147625e-10, "ewc_loss_parallel": 4.489101090854319e-09, "grad_norm": 4.712368488311768, "learning_rate": 1.6532428995337004e-08, "loss": 0.8082, "mean_token_accuracy": 0.7741730213165283, "num_tokens": 1514566.0, "step": 40 }, { "epoch": 0.005215621422210915, "ewc_loss": 1.510598735876556e-06, "ewc_loss_diag": 1.0331859812140465e-09, "ewc_loss_parallel": 4.74968064878567e-09, "grad_norm": 4.93391752243042, "learning_rate": 1.6956337431114877e-08, "loss": 0.8233, "mean_token_accuracy": 0.7736216187477112, "num_tokens": 1552560.0, "step": 41 }, { "epoch": 0.005342831700801425, "ewc_loss": 1.5621799320797436e-06, "ewc_loss_diag": 1.0622898116707802e-09, "ewc_loss_parallel": 4.9674691027234985e-09, "grad_norm": 5.380032062530518, "learning_rate": 1.738024586689275e-08, "loss": 0.8006, "mean_token_accuracy": 0.7773917317390442, "num_tokens": 1584759.0, "step": 42 }, { "epoch": 0.005470041979391935, "ewc_loss": 1.6581197996856645e-06, "ewc_loss_diag": 1.1423253454267979e-09, "ewc_loss_parallel": 5.181809203946841e-09, "grad_norm": 4.813974380493164, "learning_rate": 1.7804154302670622e-08, "loss": 0.8021, "mean_token_accuracy": 0.775635302066803, "num_tokens": 1621825.0, "step": 43 }, { "epoch": 0.005597252257982445, "ewc_loss": 1.805611873351154e-06, "ewc_loss_diag": 1.2514647096395493e-09, "ewc_loss_parallel": 5.539142922117435e-09, "grad_norm": 4.741023540496826, "learning_rate": 1.8228062738448494e-08, "loss": 0.8276, "mean_token_accuracy": 0.7681269645690918, "num_tokens": 1662946.0, "step": 44 }, { "epoch": 0.005724462536572955, "ewc_loss": 1.9189999420632375e-06, "ewc_loss_diag": 1.331500243395567e-09, "ewc_loss_parallel": 5.853459938975902e-09, "grad_norm": 4.620482921600342, "learning_rate": 1.8651971174226364e-08, "loss": 0.8153, "mean_token_accuracy": 0.7735557556152344, "num_tokens": 1699433.0, "step": 45 }, { "epoch": 0.005851672815163465, "ewc_loss": 1.9972057998529635e-06, "ewc_loss_diag": 1.382431946694851e-09, "ewc_loss_parallel": 6.113977324417874e-09, "grad_norm": 4.309771537780762, "learning_rate": 1.9075879610004236e-08, "loss": 0.7462, "mean_token_accuracy": 0.7886000275611877, "num_tokens": 1742812.0, "step": 46 }, { "epoch": 0.005978883093753975, "ewc_loss": 2.0607026272045914e-06, "ewc_loss_diag": 1.4260876923799515e-09, "ewc_loss_parallel": 6.376417172049287e-09, "grad_norm": 4.820437431335449, "learning_rate": 1.949978804578211e-08, "loss": 0.8016, "mean_token_accuracy": 0.7782822847366333, "num_tokens": 1778725.0, "step": 47 }, { "epoch": 0.006106093372344485, "ewc_loss": 2.1598109469778137e-06, "ewc_loss_diag": 1.4842953532934189e-09, "ewc_loss_parallel": 6.771453620046941e-09, "grad_norm": 4.767100811004639, "learning_rate": 1.9923696481559985e-08, "loss": 0.8479, "mean_token_accuracy": 0.7621948719024658, "num_tokens": 1816592.0, "step": 48 }, { "epoch": 0.006233303650934996, "ewc_loss": 2.9756201911368407e-06, "ewc_loss_diag": 2.066371962428093e-09, "ewc_loss_parallel": 9.043587212431703e-09, "grad_norm": 4.3456315994262695, "learning_rate": 2.0347604917337857e-08, "loss": 0.7398, "mean_token_accuracy": 0.7903152704238892, "num_tokens": 1859907.0, "step": 49 }, { "epoch": 0.006360513929525506, "ewc_loss": 5.379133654059842e-06, "ewc_loss_diag": 3.7834979593753815e-09, "ewc_loss_parallel": 1.594238518975999e-08, "grad_norm": 4.583129405975342, "learning_rate": 2.0771513353115727e-08, "loss": 0.7292, "mean_token_accuracy": 0.7999317646026611, "num_tokens": 1896627.0, "step": 50 }, { "epoch": 0.006487724208116016, "ewc_loss": 7.121964699763339e-06, "ewc_loss_diag": 4.976755008101463e-09, "ewc_loss_parallel": 2.1449766407499737e-08, "grad_norm": 4.843439102172852, "learning_rate": 2.11954217888936e-08, "loss": 0.8446, "mean_token_accuracy": 0.7669968605041504, "num_tokens": 1934041.0, "step": 51 }, { "epoch": 0.006614934486706526, "ewc_loss": 7.9756118793739e-06, "ewc_loss_diag": 5.50062395632267e-09, "ewc_loss_parallel": 2.462182457918516e-08, "grad_norm": 4.458742141723633, "learning_rate": 2.1619330224671472e-08, "loss": 0.7736, "mean_token_accuracy": 0.7844394445419312, "num_tokens": 1976482.0, "step": 52 }, { "epoch": 0.006742144765297036, "ewc_loss": 8.436898497166112e-06, "ewc_loss_diag": 5.791662260890007e-09, "ewc_loss_parallel": 2.6552484655439912e-08, "grad_norm": 5.183436393737793, "learning_rate": 2.2043238660449344e-08, "loss": 0.8027, "mean_token_accuracy": 0.7718530893325806, "num_tokens": 2009224.0, "step": 53 }, { "epoch": 0.006869355043887546, "ewc_loss": 8.88005433807848e-06, "ewc_loss_diag": 6.082700565457344e-09, "ewc_loss_parallel": 2.8003803720366705e-08, "grad_norm": 4.768021106719971, "learning_rate": 2.2467147096227214e-08, "loss": 0.8475, "mean_token_accuracy": 0.7616364359855652, "num_tokens": 2049235.0, "step": 54 }, { "epoch": 0.006996565322478056, "ewc_loss": 9.272107490687631e-06, "ewc_loss_diag": 6.344635039567947e-09, "ewc_loss_parallel": 2.924213049482205e-08, "grad_norm": 4.489978790283203, "learning_rate": 2.2891055532005086e-08, "loss": 0.8607, "mean_token_accuracy": 0.7611328363418579, "num_tokens": 2090260.0, "step": 55 }, { "epoch": 0.007123775601068566, "ewc_loss": 9.605505510990042e-06, "ewc_loss_diag": 6.548361852765083e-09, "ewc_loss_parallel": 3.048994301479979e-08, "grad_norm": 4.883786201477051, "learning_rate": 2.331496396778296e-08, "loss": 0.8658, "mean_token_accuracy": 0.7607275247573853, "num_tokens": 2126686.0, "step": 56 }, { "epoch": 0.007250985879659076, "ewc_loss": 9.913006579154171e-06, "ewc_loss_diag": 6.7229848355054855e-09, "ewc_loss_parallel": 3.1776817621675946e-08, "grad_norm": 4.249879360198975, "learning_rate": 2.373887240356083e-08, "loss": 0.7674, "mean_token_accuracy": 0.7850896716117859, "num_tokens": 2171355.0, "step": 57 }, { "epoch": 0.007378196158249586, "ewc_loss": 1.0115110853803344e-05, "ewc_loss_diag": 6.83940015733242e-09, "ewc_loss_parallel": 3.2903795244010325e-08, "grad_norm": 4.501101016998291, "learning_rate": 2.4162780839338704e-08, "loss": 0.7408, "mean_token_accuracy": 0.7926605343818665, "num_tokens": 2211660.0, "step": 58 }, { "epoch": 0.007505406436840096, "ewc_loss": 1.0452235073898919e-05, "ewc_loss_diag": 7.07223080098629e-09, "ewc_loss_parallel": 3.389084213267779e-08, "grad_norm": 5.432986259460449, "learning_rate": 2.4586689275116573e-08, "loss": 0.8712, "mean_token_accuracy": 0.7590738534927368, "num_tokens": 2244411.0, "step": 59 }, { "epoch": 0.007632616715430607, "ewc_loss": 1.0775920600281097e-05, "ewc_loss_diag": 7.275957614183426e-09, "ewc_loss_parallel": 3.504153767153184e-08, "grad_norm": 4.236932754516602, "learning_rate": 2.5010597710894446e-08, "loss": 0.7975, "mean_token_accuracy": 0.7704958915710449, "num_tokens": 2285253.0, "step": 60 }, { "epoch": 0.007759826994021117, "ewc_loss": 1.130625605583191e-05, "ewc_loss_diag": 7.62520357966423e-09, "ewc_loss_parallel": 3.676861837220713e-08, "grad_norm": 4.381787300109863, "learning_rate": 2.5434506146672318e-08, "loss": 0.7624, "mean_token_accuracy": 0.7853771448135376, "num_tokens": 2328577.0, "step": 61 }, { "epoch": 0.007887037272611627, "ewc_loss": 1.1898578122782055e-05, "ewc_loss_diag": 8.032657206058502e-09, "ewc_loss_parallel": 3.8519512912671416e-08, "grad_norm": 4.58535623550415, "learning_rate": 2.585841458245019e-08, "loss": 0.8418, "mean_token_accuracy": 0.7695786952972412, "num_tokens": 2366524.0, "step": 62 }, { "epoch": 0.008014247551202136, "ewc_loss": 1.2462796803447418e-05, "ewc_loss_diag": 8.440110832452774e-09, "ewc_loss_parallel": 3.998937003757419e-08, "grad_norm": 4.082830905914307, "learning_rate": 2.628232301822806e-08, "loss": 0.7553, "mean_token_accuracy": 0.789602518081665, "num_tokens": 2408628.0, "step": 63 }, { "epoch": 0.008141457829792647, "ewc_loss": 1.2816233720513992e-05, "ewc_loss_diag": 8.672941476106644e-09, "ewc_loss_parallel": 4.1139557538372173e-08, "grad_norm": 4.167138576507568, "learning_rate": 2.6706231454005933e-08, "loss": 0.795, "mean_token_accuracy": 0.7756844758987427, "num_tokens": 2451800.0, "step": 64 }, { "epoch": 0.008268668108383158, "ewc_loss": 1.3171589671401307e-05, "ewc_loss_diag": 8.96397978067398e-09, "ewc_loss_parallel": 4.2308926140322e-08, "grad_norm": 5.549566745758057, "learning_rate": 2.7130139889783805e-08, "loss": 0.8664, "mean_token_accuracy": 0.7545677423477173, "num_tokens": 2481448.0, "step": 65 }, { "epoch": 0.008395878386973667, "ewc_loss": 1.368290395475924e-05, "ewc_loss_diag": 9.313225746154785e-09, "ewc_loss_parallel": 4.384579099792063e-08, "grad_norm": 4.164225101470947, "learning_rate": 2.7554048325561678e-08, "loss": 0.7929, "mean_token_accuracy": 0.7753652334213257, "num_tokens": 2526339.0, "step": 66 }, { "epoch": 0.008523088665564178, "ewc_loss": 1.4160823411657475e-05, "ewc_loss_diag": 9.546056389808655e-09, "ewc_loss_parallel": 4.6240799633778806e-08, "grad_norm": 4.301955699920654, "learning_rate": 2.797795676133955e-08, "loss": 0.8179, "mean_token_accuracy": 0.7701246738433838, "num_tokens": 2570691.0, "step": 67 }, { "epoch": 0.008650298944154687, "ewc_loss": 1.5755005733808503e-05, "ewc_loss_diag": 1.05355866253376e-08, "ewc_loss_parallel": 5.204983111184447e-08, "grad_norm": 4.431390762329102, "learning_rate": 2.840186519711742e-08, "loss": 0.8043, "mean_token_accuracy": 0.7752410173416138, "num_tokens": 2609207.0, "step": 68 }, { "epoch": 0.008777509222745198, "ewc_loss": 2.336852776352316e-05, "ewc_loss_diag": 1.6298145055770874e-08, "ewc_loss_parallel": 7.036855720343738e-08, "grad_norm": 4.697834491729736, "learning_rate": 2.8825773632895292e-08, "loss": 0.7926, "mean_token_accuracy": 0.7780281901359558, "num_tokens": 2645494.0, "step": 69 }, { "epoch": 0.008904719501335707, "ewc_loss": 3.299677337054163e-05, "ewc_loss_diag": 2.3166649043560028e-08, "ewc_loss_parallel": 9.87017116926836e-08, "grad_norm": 4.177772045135498, "learning_rate": 2.9249682068673165e-08, "loss": 0.7495, "mean_token_accuracy": 0.7918670177459717, "num_tokens": 2686258.0, "step": 70 }, { "epoch": 0.009031929779926218, "ewc_loss": 3.924536213162355e-05, "ewc_loss_diag": 2.735760062932968e-08, "ewc_loss_parallel": 1.194643601820644e-07, "grad_norm": 4.7783098220825195, "learning_rate": 2.9673590504451037e-08, "loss": 0.8042, "mean_token_accuracy": 0.7786123752593994, "num_tokens": 2720544.0, "step": 71 }, { "epoch": 0.009159140058516728, "ewc_loss": 4.334201003075577e-05, "ewc_loss_diag": 2.9802322387695312e-08, "ewc_loss_parallel": 1.3539687415686785e-07, "grad_norm": 4.555132865905762, "learning_rate": 3.0097498940228907e-08, "loss": 0.7675, "mean_token_accuracy": 0.7856956720352173, "num_tokens": 2758068.0, "step": 72 }, { "epoch": 0.009286350337107238, "ewc_loss": 4.64260192529764e-05, "ewc_loss_diag": 3.14321368932724e-08, "ewc_loss_parallel": 1.495476595891887e-07, "grad_norm": 4.771862983703613, "learning_rate": 3.052140737600678e-08, "loss": 0.8513, "mean_token_accuracy": 0.763370156288147, "num_tokens": 2793342.0, "step": 73 }, { "epoch": 0.009413560615697748, "ewc_loss": 4.8982030421029776e-05, "ewc_loss_diag": 3.259629011154175e-08, "ewc_loss_parallel": 1.6318686846261699e-07, "grad_norm": 4.951112747192383, "learning_rate": 3.094531581178465e-08, "loss": 0.8889, "mean_token_accuracy": 0.7548484206199646, "num_tokens": 2828003.0, "step": 74 }, { "epoch": 0.009540770894288259, "ewc_loss": 5.120671630720608e-05, "ewc_loss_diag": 3.3527612686157227e-08, "ewc_loss_parallel": 1.7589695744391065e-07, "grad_norm": 3.984346628189087, "learning_rate": 3.1369224247562524e-08, "loss": 0.7639, "mean_token_accuracy": 0.7840394973754883, "num_tokens": 2874755.0, "step": 75 }, { "epoch": 0.00966798117287877, "ewc_loss": 5.33169622940477e-05, "ewc_loss_diag": 3.4924596548080444e-08, "ewc_loss_parallel": 1.8507849119941966e-07, "grad_norm": 4.345953464508057, "learning_rate": 3.17931326833404e-08, "loss": 0.786, "mean_token_accuracy": 0.7766088247299194, "num_tokens": 2909703.0, "step": 76 }, { "epoch": 0.009795191451469279, "ewc_loss": 5.4512551287189126e-05, "ewc_loss_diag": 3.5390257835388184e-08, "ewc_loss_parallel": 1.9226600045385567e-07, "grad_norm": 3.873575210571289, "learning_rate": 3.221704111911827e-08, "loss": 0.7161, "mean_token_accuracy": 0.7989076375961304, "num_tokens": 2954020.0, "step": 77 }, { "epoch": 0.00992240173005979, "ewc_loss": 5.578760101343505e-05, "ewc_loss_diag": 3.608874976634979e-08, "ewc_loss_parallel": 1.9786395455412276e-07, "grad_norm": 4.93356466293335, "learning_rate": 3.264094955489614e-08, "loss": 0.8113, "mean_token_accuracy": 0.7748699188232422, "num_tokens": 2984894.0, "step": 78 }, { "epoch": 0.010049612008650299, "ewc_loss": 5.6829521781764925e-05, "ewc_loss_diag": 3.655441105365753e-08, "ewc_loss_parallel": 2.035147730339304e-07, "grad_norm": 4.65369987487793, "learning_rate": 3.306485799067401e-08, "loss": 0.7183, "mean_token_accuracy": 0.7955659627914429, "num_tokens": 3017773.0, "step": 79 }, { "epoch": 0.01017682228724081, "ewc_loss": 5.8265544794267043e-05, "ewc_loss_diag": 3.748573362827301e-08, "ewc_loss_parallel": 2.0833827818478312e-07, "grad_norm": 3.9933700561523438, "learning_rate": 3.348876642645188e-08, "loss": 0.7176, "mean_token_accuracy": 0.7980425357818604, "num_tokens": 3059739.0, "step": 80 }, { "epoch": 0.010304032565831319, "ewc_loss": 5.95352248637937e-05, "ewc_loss_diag": 3.841705620288849e-08, "ewc_loss_parallel": 2.114983601586573e-07, "grad_norm": 4.581633567810059, "learning_rate": 3.391267486222975e-08, "loss": 0.8095, "mean_token_accuracy": 0.7722476124763489, "num_tokens": 3094910.0, "step": 81 }, { "epoch": 0.01043124284442183, "ewc_loss": 6.0077236412325874e-05, "ewc_loss_diag": 3.864988684654236e-08, "ewc_loss_parallel": 2.1453426768403006e-07, "grad_norm": 4.2975754737854, "learning_rate": 3.4336583298007626e-08, "loss": 0.8134, "mean_token_accuracy": 0.7742296457290649, "num_tokens": 3133884.0, "step": 82 }, { "epoch": 0.010558453123012339, "ewc_loss": 6.053127071936615e-05, "ewc_loss_diag": 3.888271749019623e-08, "ewc_loss_parallel": 2.166904238265488e-07, "grad_norm": 4.857149600982666, "learning_rate": 3.47604917337855e-08, "loss": 0.846, "mean_token_accuracy": 0.7609719038009644, "num_tokens": 3166755.0, "step": 83 }, { "epoch": 0.01068566340160285, "ewc_loss": 6.12788790022023e-05, "ewc_loss_diag": 3.934837877750397e-08, "ewc_loss_parallel": 2.1939813166227395e-07, "grad_norm": 4.436580181121826, "learning_rate": 3.518440016956337e-08, "loss": 0.8152, "mean_token_accuracy": 0.7710539102554321, "num_tokens": 3204825.0, "step": 84 }, { "epoch": 0.010812873680193359, "ewc_loss": 6.239692447707057e-05, "ewc_loss_diag": 4.0046870708465576e-08, "ewc_loss_parallel": 2.234260563227508e-07, "grad_norm": 4.424106597900391, "learning_rate": 3.5608308605341244e-08, "loss": 0.8501, "mean_token_accuracy": 0.7628199458122253, "num_tokens": 3244036.0, "step": 85 }, { "epoch": 0.01094008395878387, "ewc_loss": 6.459128053393215e-05, "ewc_loss_diag": 4.1676685214042664e-08, "ewc_loss_parallel": 2.2868026405831188e-07, "grad_norm": 4.225394248962402, "learning_rate": 3.6032217041119116e-08, "loss": 0.8378, "mean_token_accuracy": 0.7668761014938354, "num_tokens": 3285969.0, "step": 86 }, { "epoch": 0.01106729423737438, "ewc_loss": 6.627980474149808e-05, "ewc_loss_diag": 4.284083843231201e-08, "ewc_loss_parallel": 2.3364461299024697e-07, "grad_norm": 4.169989585876465, "learning_rate": 3.645612547689699e-08, "loss": 0.8088, "mean_token_accuracy": 0.7745395302772522, "num_tokens": 3327648.0, "step": 87 }, { "epoch": 0.01119450451596489, "ewc_loss": 6.814939843025059e-05, "ewc_loss_diag": 4.423782229423523e-08, "ewc_loss_parallel": 2.3803545445844065e-07, "grad_norm": 4.303160190582275, "learning_rate": 3.6880033912674855e-08, "loss": 0.8121, "mean_token_accuracy": 0.7726193070411682, "num_tokens": 3367399.0, "step": 88 }, { "epoch": 0.0113217147945554, "ewc_loss": 6.976412259973586e-05, "ewc_loss_diag": 4.563480615615845e-08, "ewc_loss_parallel": 2.4226170580732287e-07, "grad_norm": 4.491087436676025, "learning_rate": 3.730394234845273e-08, "loss": 0.7952, "mean_token_accuracy": 0.7794708013534546, "num_tokens": 3405402.0, "step": 89 }, { "epoch": 0.01144892507314591, "ewc_loss": 7.089426799211651e-05, "ewc_loss_diag": 4.6333298087120056e-08, "ewc_loss_parallel": 2.4641062168484495e-07, "grad_norm": 4.6026716232299805, "learning_rate": 3.77278507842306e-08, "loss": 0.8783, "mean_token_accuracy": 0.7545019388198853, "num_tokens": 3441791.0, "step": 90 }, { "epoch": 0.01157613535173642, "ewc_loss": 7.247102621477097e-05, "ewc_loss_diag": 4.7497451305389404e-08, "ewc_loss_parallel": 2.5025732952599355e-07, "grad_norm": 4.531004905700684, "learning_rate": 3.815175922000847e-08, "loss": 0.8382, "mean_token_accuracy": 0.7690663933753967, "num_tokens": 3480151.0, "step": 91 }, { "epoch": 0.01170334563032693, "ewc_loss": 7.411037222482264e-05, "ewc_loss_diag": 4.866160452365875e-08, "ewc_loss_parallel": 2.547298265653808e-07, "grad_norm": 4.426536560058594, "learning_rate": 3.8575667655786345e-08, "loss": 0.8016, "mean_token_accuracy": 0.7760406136512756, "num_tokens": 3516867.0, "step": 92 }, { "epoch": 0.01183055590891744, "ewc_loss": 7.561976963188499e-05, "ewc_loss_diag": 4.959292709827423e-08, "ewc_loss_parallel": 2.602870665668888e-07, "grad_norm": 3.9917051792144775, "learning_rate": 3.899957609156422e-08, "loss": 0.7777, "mean_token_accuracy": 0.7826352119445801, "num_tokens": 3557466.0, "step": 93 }, { "epoch": 0.01195776618750795, "ewc_loss": 7.854960131226107e-05, "ewc_loss_diag": 5.168840289115906e-08, "ewc_loss_parallel": 2.681277067040355e-07, "grad_norm": 4.207128047943115, "learning_rate": 3.94234845273421e-08, "loss": 0.8071, "mean_token_accuracy": 0.7735612392425537, "num_tokens": 3596009.0, "step": 94 }, { "epoch": 0.012084976466098461, "ewc_loss": 8.351319411303848e-05, "ewc_loss_diag": 5.51808625459671e-08, "ewc_loss_parallel": 2.843850097633549e-07, "grad_norm": 4.382684707641602, "learning_rate": 3.984739296311997e-08, "loss": 0.7992, "mean_token_accuracy": 0.7747360467910767, "num_tokens": 3631582.0, "step": 95 }, { "epoch": 0.01221218674468897, "ewc_loss": 9.564160427544266e-05, "ewc_loss_diag": 6.239861249923706e-08, "ewc_loss_parallel": 3.317593382234918e-07, "grad_norm": 4.273091793060303, "learning_rate": 4.027130139889784e-08, "loss": 0.8251, "mean_token_accuracy": 0.7673804759979248, "num_tokens": 3672855.0, "step": 96 }, { "epoch": 0.012339397023279481, "ewc_loss": 0.0001240780984517187, "ewc_loss_diag": 8.149072527885437e-08, "ewc_loss_parallel": 4.253895156125509e-07, "grad_norm": 4.004971981048584, "learning_rate": 4.0695209834675715e-08, "loss": 0.8163, "mean_token_accuracy": 0.7726219892501831, "num_tokens": 3714100.0, "step": 97 }, { "epoch": 0.012466607301869992, "ewc_loss": 0.00014702642511110753, "ewc_loss_diag": 9.359791874885559e-08, "ewc_loss_parallel": 5.356633892006357e-07, "grad_norm": 4.094971179962158, "learning_rate": 4.111911827045358e-08, "loss": 0.7671, "mean_token_accuracy": 0.7855087518692017, "num_tokens": 3752834.0, "step": 98 }, { "epoch": 0.012593817580460501, "ewc_loss": 0.0001672977232374251, "ewc_loss_diag": 1.0337680578231812e-07, "ewc_loss_parallel": 6.382406354532577e-07, "grad_norm": 4.051748275756836, "learning_rate": 4.154302670623145e-08, "loss": 0.7775, "mean_token_accuracy": 0.7809967398643494, "num_tokens": 3791513.0, "step": 99 }, { "epoch": 0.012721027859051012, "ewc_loss": 0.00018168272799812257, "ewc_loss_diag": 1.0943040251731873e-07, "ewc_loss_parallel": 7.248702900142234e-07, "grad_norm": 4.546911716461182, "learning_rate": 4.1966935142009326e-08, "loss": 0.8005, "mean_token_accuracy": 0.7744757533073425, "num_tokens": 3825085.0, "step": 100 }, { "epoch": 0.012848238137641521, "ewc_loss": 0.0001949175784830004, "ewc_loss_diag": 1.150183379650116e-07, "ewc_loss_parallel": 7.999982472028933e-07, "grad_norm": 3.9746615886688232, "learning_rate": 4.23908435777872e-08, "loss": 0.8008, "mean_token_accuracy": 0.7769722938537598, "num_tokens": 3866492.0, "step": 101 }, { "epoch": 0.012975448416232032, "ewc_loss": 0.00020498177036643028, "ewc_loss_diag": 1.1920928955078125e-07, "ewc_loss_parallel": 8.577247854191228e-07, "grad_norm": 4.35603141784668, "learning_rate": 4.281475201356507e-08, "loss": 0.8281, "mean_token_accuracy": 0.7628748416900635, "num_tokens": 3904053.0, "step": 102 }, { "epoch": 0.013102658694822541, "ewc_loss": 0.00021073143579997122, "ewc_loss_diag": 1.2014061212539673e-07, "ewc_loss_parallel": 9.056847147803637e-07, "grad_norm": 4.337789535522461, "learning_rate": 4.3238660449342943e-08, "loss": 0.7365, "mean_token_accuracy": 0.7953848838806152, "num_tokens": 3935118.0, "step": 103 }, { "epoch": 0.013229868973413052, "ewc_loss": 0.000217701934161596, "ewc_loss_diag": 1.2293457984924316e-07, "ewc_loss_parallel": 9.467794939155283e-07, "grad_norm": 3.917722463607788, "learning_rate": 4.3662568885120816e-08, "loss": 0.7871, "mean_token_accuracy": 0.7759958505630493, "num_tokens": 3977899.0, "step": 104 }, { "epoch": 0.013357079252003561, "ewc_loss": 0.0002226687065558508, "ewc_loss_diag": 1.2479722499847412e-07, "ewc_loss_parallel": 9.773737019713735e-07, "grad_norm": 3.903362274169922, "learning_rate": 4.408647732089869e-08, "loss": 0.8072, "mean_token_accuracy": 0.7739092111587524, "num_tokens": 4019416.0, "step": 105 }, { "epoch": 0.013484289530594072, "ewc_loss": 0.00022865500068292022, "ewc_loss_diag": 1.2852251529693604e-07, "ewc_loss_parallel": 9.990897069656057e-07, "grad_norm": 3.8426871299743652, "learning_rate": 4.451038575667656e-08, "loss": 0.7964, "mean_token_accuracy": 0.7735815048217773, "num_tokens": 4063887.0, "step": 106 }, { "epoch": 0.013611499809184581, "ewc_loss": 0.00023218058049678802, "ewc_loss_diag": 1.30385160446167e-07, "ewc_loss_parallel": 1.0152720051337383e-06, "grad_norm": 3.970632791519165, "learning_rate": 4.493429419245443e-08, "loss": 0.8273, "mean_token_accuracy": 0.7685048580169678, "num_tokens": 4101309.0, "step": 107 }, { "epoch": 0.013738710087775092, "ewc_loss": 0.00023372532450594008, "ewc_loss_diag": 1.30385160446167e-07, "ewc_loss_parallel": 1.0307194315828383e-06, "grad_norm": 4.107096195220947, "learning_rate": 4.53582026282323e-08, "loss": 0.8116, "mean_token_accuracy": 0.7704823017120361, "num_tokens": 4136669.0, "step": 108 }, { "epoch": 0.013865920366365603, "ewc_loss": 0.00023607173352502286, "ewc_loss_diag": 1.3131648302078247e-07, "ewc_loss_parallel": 1.0446467513247626e-06, "grad_norm": 3.641752243041992, "learning_rate": 4.578211106401017e-08, "loss": 0.7427, "mean_token_accuracy": 0.7896551489830017, "num_tokens": 4178583.0, "step": 109 }, { "epoch": 0.013993130644956112, "ewc_loss": 0.00023676478303968906, "ewc_loss_diag": 1.3131648302078247e-07, "ewc_loss_parallel": 1.0515773283259477e-06, "grad_norm": 3.6655845642089844, "learning_rate": 4.6206019499788045e-08, "loss": 0.7732, "mean_token_accuracy": 0.7770309448242188, "num_tokens": 4219552.0, "step": 110 }, { "epoch": 0.014120340923546623, "ewc_loss": 0.000237134619965218, "ewc_loss_diag": 1.3131648302078247e-07, "ewc_loss_parallel": 1.0552756748438696e-06, "grad_norm": 3.7279036045074463, "learning_rate": 4.662992793556592e-08, "loss": 0.7661, "mean_token_accuracy": 0.785243034362793, "num_tokens": 4264487.0, "step": 111 }, { "epoch": 0.014247551202137132, "ewc_loss": 0.0002374265604885295, "ewc_loss_diag": 1.3131648302078247e-07, "ewc_loss_parallel": 1.058195039149723e-06, "grad_norm": 3.805980682373047, "learning_rate": 4.705383637134379e-08, "loss": 0.7322, "mean_token_accuracy": 0.790501594543457, "num_tokens": 4303015.0, "step": 112 }, { "epoch": 0.014374761480727643, "ewc_loss": 0.00023866935225669295, "ewc_loss_diag": 1.3224780559539795e-07, "ewc_loss_parallel": 1.0610862091198214e-06, "grad_norm": 3.743521213531494, "learning_rate": 4.747774480712166e-08, "loss": 0.7871, "mean_token_accuracy": 0.7771217823028564, "num_tokens": 4345446.0, "step": 113 }, { "epoch": 0.014501971759318152, "ewc_loss": 0.00023756995506118983, "ewc_loss_diag": 1.3131648302078247e-07, "ewc_loss_parallel": 1.0596289712339058e-06, "grad_norm": 3.8606832027435303, "learning_rate": 4.7901653242899535e-08, "loss": 0.7392, "mean_token_accuracy": 0.7914757132530212, "num_tokens": 4383854.0, "step": 114 }, { "epoch": 0.014629182037908663, "ewc_loss": 0.00024003718863241374, "ewc_loss_diag": 1.341104507446289e-07, "ewc_loss_parallel": 1.0556912002357421e-06, "grad_norm": 3.9047434329986572, "learning_rate": 4.832556167867741e-08, "loss": 0.8081, "mean_token_accuracy": 0.7727051973342896, "num_tokens": 4420750.0, "step": 115 }, { "epoch": 0.014756392316499172, "ewc_loss": 0.0002415601338725537, "ewc_loss_diag": 1.3597309589385986e-07, "ewc_loss_parallel": 1.0518471071918611e-06, "grad_norm": 4.042475700378418, "learning_rate": 4.8749470114455274e-08, "loss": 0.8053, "mean_token_accuracy": 0.7734549045562744, "num_tokens": 4458761.0, "step": 116 }, { "epoch": 0.014883602595089683, "ewc_loss": 0.00024415997904725373, "ewc_loss_diag": 1.387670636177063e-07, "ewc_loss_parallel": 1.0492352657820447e-06, "grad_norm": 3.8245956897735596, "learning_rate": 4.9173378550233146e-08, "loss": 0.7654, "mean_token_accuracy": 0.7813302278518677, "num_tokens": 4496547.0, "step": 117 }, { "epoch": 0.015010812873680193, "ewc_loss": 0.00024496990954503417, "ewc_loss_diag": 1.4062970876693726e-07, "ewc_loss_parallel": 1.047797695719055e-06, "grad_norm": 3.885272264480591, "learning_rate": 4.959728698601102e-08, "loss": 0.7631, "mean_token_accuracy": 0.7888261079788208, "num_tokens": 4533357.0, "step": 118 }, { "epoch": 0.015138023152270703, "ewc_loss": 0.00024595361901447177, "ewc_loss_diag": 1.4156103134155273e-07, "ewc_loss_parallel": 1.0480982837179909e-06, "grad_norm": 3.730719566345215, "learning_rate": 5.002119542178889e-08, "loss": 0.7376, "mean_token_accuracy": 0.7906442880630493, "num_tokens": 4573570.0, "step": 119 }, { "epoch": 0.015265233430861214, "ewc_loss": 0.00025016177096404135, "ewc_loss_diag": 1.4528632164001465e-07, "ewc_loss_parallel": 1.0520326441110228e-06, "grad_norm": 3.9192557334899902, "learning_rate": 5.0445103857566764e-08, "loss": 0.7928, "mean_token_accuracy": 0.7750020027160645, "num_tokens": 4613195.0, "step": 120 }, { "epoch": 0.015392443709451724, "ewc_loss": 0.00025465767248533666, "ewc_loss_diag": 1.4901161193847656e-07, "ewc_loss_parallel": 1.058844759427302e-06, "grad_norm": 3.5667569637298584, "learning_rate": 5.0869012293344637e-08, "loss": 0.7134, "mean_token_accuracy": 0.798256516456604, "num_tokens": 4657230.0, "step": 121 }, { "epoch": 0.015519653988042234, "ewc_loss": 0.0002598321298137307, "ewc_loss_diag": 1.5366822481155396e-07, "ewc_loss_parallel": 1.0629055395838805e-06, "grad_norm": 3.7236804962158203, "learning_rate": 5.129292072912251e-08, "loss": 0.787, "mean_token_accuracy": 0.780608594417572, "num_tokens": 4701436.0, "step": 122 }, { "epoch": 0.015646864266632744, "ewc_loss": 0.0002624310727696866, "ewc_loss_diag": 1.555308699607849e-07, "ewc_loss_parallel": 1.0698215646698372e-06, "grad_norm": 3.7935760021209717, "learning_rate": 5.171682916490038e-08, "loss": 0.7747, "mean_token_accuracy": 0.7803581953048706, "num_tokens": 4741806.0, "step": 123 }, { "epoch": 0.015774074545223254, "ewc_loss": 0.00026714851264841855, "ewc_loss_diag": 1.5925616025924683e-07, "ewc_loss_parallel": 1.0788489817059599e-06, "grad_norm": 3.988024950027466, "learning_rate": 5.2140737600678254e-08, "loss": 0.8117, "mean_token_accuracy": 0.7654769420623779, "num_tokens": 4780671.0, "step": 124 }, { "epoch": 0.015901284823813765, "ewc_loss": 0.00027405843138694763, "ewc_loss_diag": 1.648440957069397e-07, "ewc_loss_parallel": 1.090727892005816e-06, "grad_norm": 3.9226977825164795, "learning_rate": 5.256464603645612e-08, "loss": 0.7604, "mean_token_accuracy": 0.7863008975982666, "num_tokens": 4816879.0, "step": 125 }, { "epoch": 0.016028495102404273, "ewc_loss": 0.00028301545535214245, "ewc_loss_diag": 1.7229467630386353e-07, "ewc_loss_parallel": 1.1040041272281087e-06, "grad_norm": 3.8474886417388916, "learning_rate": 5.298855447223399e-08, "loss": 0.8131, "mean_token_accuracy": 0.7676842212677002, "num_tokens": 4858704.0, "step": 126 }, { "epoch": 0.016155705380994784, "ewc_loss": 0.00028969088452868164, "ewc_loss_diag": 1.7695128917694092e-07, "ewc_loss_parallel": 1.1230745258217212e-06, "grad_norm": 3.8383710384368896, "learning_rate": 5.3412462908011865e-08, "loss": 0.7697, "mean_token_accuracy": 0.7827020883560181, "num_tokens": 4896780.0, "step": 127 }, { "epoch": 0.016282915659585295, "ewc_loss": 0.00029554704087786376, "ewc_loss_diag": 1.8067657947540283e-07, "ewc_loss_parallel": 1.1530258916536695e-06, "grad_norm": 4.062897205352783, "learning_rate": 5.383637134378974e-08, "loss": 0.7994, "mean_token_accuracy": 0.7721726894378662, "num_tokens": 4932108.0, "step": 128 }, { "epoch": 0.016410125938175806, "ewc_loss": 0.00030688827973790467, "ewc_loss_diag": 1.8719583749771118e-07, "ewc_loss_parallel": 1.1996811508652172e-06, "grad_norm": 3.7717654705047607, "learning_rate": 5.426027977956761e-08, "loss": 0.7751, "mean_token_accuracy": 0.7824088335037231, "num_tokens": 4970384.0, "step": 129 }, { "epoch": 0.016537336216766316, "ewc_loss": 0.00032630361965857446, "ewc_loss_diag": 1.993030309677124e-07, "ewc_loss_parallel": 1.2698568525593146e-06, "grad_norm": 4.207439422607422, "learning_rate": 5.468418821534548e-08, "loss": 0.7832, "mean_token_accuracy": 0.7770127654075623, "num_tokens": 5005996.0, "step": 130 }, { "epoch": 0.016664546495356824, "ewc_loss": 0.0003508435911498964, "ewc_loss_diag": 2.123415470123291e-07, "ewc_loss_parallel": 1.3817420949635562e-06, "grad_norm": 3.9026081562042236, "learning_rate": 5.5108096651123356e-08, "loss": 0.7625, "mean_token_accuracy": 0.7820358276367188, "num_tokens": 5044208.0, "step": 131 }, { "epoch": 0.016791756773947335, "ewc_loss": 0.0003846850013360381, "ewc_loss_diag": 2.2910535335540771e-07, "ewc_loss_parallel": 1.558031840431795e-06, "grad_norm": 3.76191782951355, "learning_rate": 5.553200508690123e-08, "loss": 0.7768, "mean_token_accuracy": 0.7765272855758667, "num_tokens": 5085173.0, "step": 132 }, { "epoch": 0.016918967052537846, "ewc_loss": 0.00041904058889485896, "ewc_loss_diag": 2.384185791015625e-07, "ewc_loss_parallel": 1.8062200979329646e-06, "grad_norm": 4.192916393280029, "learning_rate": 5.59559135226791e-08, "loss": 0.7915, "mean_token_accuracy": 0.7740029096603394, "num_tokens": 5118440.0, "step": 133 }, { "epoch": 0.017046177331128357, "ewc_loss": 0.0004620541585609317, "ewc_loss_diag": 2.514570951461792e-07, "ewc_loss_parallel": 2.1028413357271347e-06, "grad_norm": 4.18226957321167, "learning_rate": 5.637982195845697e-08, "loss": 0.7938, "mean_token_accuracy": 0.7748976945877075, "num_tokens": 5155202.0, "step": 134 }, { "epoch": 0.017173387609718864, "ewc_loss": 0.0005057987873442471, "ewc_loss_diag": 2.6635825634002686e-07, "ewc_loss_parallel": 2.3876996237959247e-06, "grad_norm": 3.9294204711914062, "learning_rate": 5.680373039423484e-08, "loss": 0.7783, "mean_token_accuracy": 0.7807179093360901, "num_tokens": 5193240.0, "step": 135 }, { "epoch": 0.017300597888309375, "ewc_loss": 0.000537435756996274, "ewc_loss_diag": 2.7194619178771973e-07, "ewc_loss_parallel": 2.6468494525033748e-06, "grad_norm": 3.877439498901367, "learning_rate": 5.722763883001271e-08, "loss": 0.7662, "mean_token_accuracy": 0.7820161581039429, "num_tokens": 5230975.0, "step": 136 }, { "epoch": 0.017427808166899886, "ewc_loss": 0.0005695034633390605, "ewc_loss_diag": 2.8312206268310547e-07, "ewc_loss_parallel": 2.8721588023472577e-06, "grad_norm": 4.264955043792725, "learning_rate": 5.7651547265790585e-08, "loss": 0.8263, "mean_token_accuracy": 0.7654924392700195, "num_tokens": 5264786.0, "step": 137 }, { "epoch": 0.017555018445490397, "ewc_loss": 0.0006049983203411102, "ewc_loss_diag": 2.980232238769531e-07, "ewc_loss_parallel": 3.07451909975498e-06, "grad_norm": 3.8489389419555664, "learning_rate": 5.807545570156846e-08, "loss": 0.789, "mean_token_accuracy": 0.7754392623901367, "num_tokens": 5301554.0, "step": 138 }, { "epoch": 0.017682228724080904, "ewc_loss": 0.0006403764709830284, "ewc_loss_diag": 3.166496753692627e-07, "ewc_loss_parallel": 3.2375662613048917e-06, "grad_norm": 3.857989549636841, "learning_rate": 5.849936413734633e-08, "loss": 0.728, "mean_token_accuracy": 0.7917879819869995, "num_tokens": 5337356.0, "step": 139 }, { "epoch": 0.017809439002671415, "ewc_loss": 0.0006617017788812518, "ewc_loss_diag": 3.241002559661865e-07, "ewc_loss_parallel": 3.3745252494554734e-06, "grad_norm": 3.7041146755218506, "learning_rate": 5.89232725731242e-08, "loss": 0.8001, "mean_token_accuracy": 0.7749589085578918, "num_tokens": 5375767.0, "step": 140 }, { "epoch": 0.017936649281261926, "ewc_loss": 0.0006856013787910342, "ewc_loss_diag": 3.371387720108032e-07, "ewc_loss_parallel": 3.480006625977694e-06, "grad_norm": 3.8173768520355225, "learning_rate": 5.9347181008902075e-08, "loss": 0.7508, "mean_token_accuracy": 0.7838706970214844, "num_tokens": 5413683.0, "step": 141 }, { "epoch": 0.018063859559852437, "ewc_loss": 0.0007037727627903223, "ewc_loss_diag": 3.46451997756958e-07, "ewc_loss_parallel": 3.566352688721963e-06, "grad_norm": 3.3695969581604004, "learning_rate": 5.977108944467995e-08, "loss": 0.7332, "mean_token_accuracy": 0.7887769937515259, "num_tokens": 5457750.0, "step": 142 }, { "epoch": 0.018191069838442948, "ewc_loss": 0.0007166464347392321, "ewc_loss_diag": 3.5390257835388184e-07, "ewc_loss_parallel": 3.618795517468243e-06, "grad_norm": 3.5792014598846436, "learning_rate": 6.019499788045781e-08, "loss": 0.7273, "mean_token_accuracy": 0.7901921272277832, "num_tokens": 5495743.0, "step": 143 }, { "epoch": 0.018318280117033455, "ewc_loss": 0.000727956008631736, "ewc_loss_diag": 3.632158041000366e-07, "ewc_loss_parallel": 3.6555977658281336e-06, "grad_norm": 3.5893168449401855, "learning_rate": 6.061890631623569e-08, "loss": 0.7409, "mean_token_accuracy": 0.7829563617706299, "num_tokens": 5532198.0, "step": 144 }, { "epoch": 0.018445490395623966, "ewc_loss": 0.0007407880621030927, "ewc_loss_diag": 3.725290298461914e-07, "ewc_loss_parallel": 3.6885508052364457e-06, "grad_norm": 3.6284611225128174, "learning_rate": 6.104281475201356e-08, "loss": 0.7536, "mean_token_accuracy": 0.7831873893737793, "num_tokens": 5568977.0, "step": 145 }, { "epoch": 0.018572700674214477, "ewc_loss": 0.0007431150879710913, "ewc_loss_diag": 3.725290298461914e-07, "ewc_loss_parallel": 3.711820681928657e-06, "grad_norm": 3.5756289958953857, "learning_rate": 6.146672318779143e-08, "loss": 0.7658, "mean_token_accuracy": 0.7753271460533142, "num_tokens": 5606229.0, "step": 146 }, { "epoch": 0.018699910952804988, "ewc_loss": 0.0007499409257434309, "ewc_loss_diag": 3.781169652938843e-07, "ewc_loss_parallel": 3.7228589917503996e-06, "grad_norm": 3.284898042678833, "learning_rate": 6.18906316235693e-08, "loss": 0.7677, "mean_token_accuracy": 0.7776029109954834, "num_tokens": 5649828.0, "step": 147 }, { "epoch": 0.018827121231395495, "ewc_loss": 0.0007569550070911646, "ewc_loss_diag": 3.855675458908081e-07, "ewc_loss_parallel": 3.7167062600929057e-06, "grad_norm": 3.3956050872802734, "learning_rate": 6.231454005934718e-08, "loss": 0.7138, "mean_token_accuracy": 0.7977794408798218, "num_tokens": 5687433.0, "step": 148 }, { "epoch": 0.018954331509986006, "ewc_loss": 0.000758146052248776, "ewc_loss_diag": 3.8743019104003906e-07, "ewc_loss_parallel": 3.7095426250743913e-06, "grad_norm": 3.5033628940582275, "learning_rate": 6.273844849512505e-08, "loss": 0.7445, "mean_token_accuracy": 0.7843354940414429, "num_tokens": 5723247.0, "step": 149 }, { "epoch": 0.019081541788576517, "ewc_loss": 0.0007621792610734701, "ewc_loss_diag": 3.91155481338501e-07, "ewc_loss_parallel": 3.711727913469076e-06, "grad_norm": 3.3575222492218018, "learning_rate": 6.316235693090292e-08, "loss": 0.7597, "mean_token_accuracy": 0.7794438004493713, "num_tokens": 5760305.0, "step": 150 }, { "epoch": 0.019208752067167028, "ewc_loss": 0.0007742887828499079, "ewc_loss_diag": 4.041939973831177e-07, "ewc_loss_parallel": 3.699308763316367e-06, "grad_norm": 3.232259511947632, "learning_rate": 6.35862653666808e-08, "loss": 0.7607, "mean_token_accuracy": 0.777920126914978, "num_tokens": 5800586.0, "step": 151 }, { "epoch": 0.01933596234575754, "ewc_loss": 0.0007760816952213645, "ewc_loss_diag": 4.079192876815796e-07, "ewc_loss_parallel": 3.679090468722279e-06, "grad_norm": 3.252563953399658, "learning_rate": 6.401017380245867e-08, "loss": 0.7524, "mean_token_accuracy": 0.779800295829773, "num_tokens": 5840351.0, "step": 152 }, { "epoch": 0.019463172624348046, "ewc_loss": 0.0007777721039019525, "ewc_loss_diag": 4.116445779800415e-07, "ewc_loss_parallel": 3.6578480830939952e-06, "grad_norm": 3.3462986946105957, "learning_rate": 6.443408223823654e-08, "loss": 0.7927, "mean_token_accuracy": 0.7710720300674438, "num_tokens": 5880162.0, "step": 153 }, { "epoch": 0.019590382902938557, "ewc_loss": 0.0007755064289085567, "ewc_loss_diag": 4.116445779800415e-07, "ewc_loss_parallel": 3.635191205830779e-06, "grad_norm": 3.4797825813293457, "learning_rate": 6.485799067401441e-08, "loss": 0.8487, "mean_token_accuracy": 0.7560012340545654, "num_tokens": 5922213.0, "step": 154 }, { "epoch": 0.019717593181529068, "ewc_loss": 0.0007810048409737647, "ewc_loss_diag": 4.1909515857696533e-07, "ewc_loss_parallel": 3.6138812902208883e-06, "grad_norm": 3.32401442527771, "learning_rate": 6.528189910979228e-08, "loss": 0.7298, "mean_token_accuracy": 0.7859637141227722, "num_tokens": 5957461.0, "step": 155 }, { "epoch": 0.01984480346011958, "ewc_loss": 0.0007762604509480298, "ewc_loss_diag": 4.172325134277344e-07, "ewc_loss_parallel": 3.5855109672411345e-06, "grad_norm": 3.2055258750915527, "learning_rate": 6.570580754557016e-08, "loss": 0.7152, "mean_token_accuracy": 0.7861344814300537, "num_tokens": 5994675.0, "step": 156 }, { "epoch": 0.019972013738710086, "ewc_loss": 0.0007766047492623329, "ewc_loss_diag": 4.209578037261963e-07, "ewc_loss_parallel": 3.5508066957845585e-06, "grad_norm": 3.177704334259033, "learning_rate": 6.612971598134802e-08, "loss": 0.6785, "mean_token_accuracy": 0.801832914352417, "num_tokens": 6032514.0, "step": 157 }, { "epoch": 0.020099224017300597, "ewc_loss": 0.000775470572989434, "ewc_loss_diag": 4.2282044887542725e-07, "ewc_loss_parallel": 3.520391828715219e-06, "grad_norm": 3.3880720138549805, "learning_rate": 6.655362441712589e-08, "loss": 0.7503, "mean_token_accuracy": 0.7806335091590881, "num_tokens": 6069160.0, "step": 158 }, { "epoch": 0.020226434295891108, "ewc_loss": 0.0007731589721515775, "ewc_loss_diag": 4.2282044887542725e-07, "ewc_loss_parallel": 3.497275884001283e-06, "grad_norm": 3.3718295097351074, "learning_rate": 6.697753285290376e-08, "loss": 0.8043, "mean_token_accuracy": 0.7656818628311157, "num_tokens": 6106652.0, "step": 159 }, { "epoch": 0.02035364457448162, "ewc_loss": 0.0007725044852122664, "ewc_loss_diag": 4.246830940246582e-07, "ewc_loss_parallel": 3.4716570098680677e-06, "grad_norm": 3.2801129817962646, "learning_rate": 6.740144128868163e-08, "loss": 0.7391, "mean_token_accuracy": 0.7862910032272339, "num_tokens": 6143070.0, "step": 160 }, { "epoch": 0.020480854853072127, "ewc_loss": 0.0007813718402758241, "ewc_loss_diag": 4.3585896492004395e-07, "ewc_loss_parallel": 3.4458898880984634e-06, "grad_norm": 3.0017526149749756, "learning_rate": 6.78253497244595e-08, "loss": 0.7316, "mean_token_accuracy": 0.7873073220252991, "num_tokens": 6186495.0, "step": 161 }, { "epoch": 0.020608065131662637, "ewc_loss": 0.0007783938199281693, "ewc_loss_diag": 4.3585896492004395e-07, "ewc_loss_parallel": 3.416110075704637e-06, "grad_norm": 3.4928183555603027, "learning_rate": 6.824925816023738e-08, "loss": 0.7318, "mean_token_accuracy": 0.787117600440979, "num_tokens": 6222454.0, "step": 162 }, { "epoch": 0.02073527541025315, "ewc_loss": 0.0007811725954525173, "ewc_loss_diag": 4.414469003677368e-07, "ewc_loss_parallel": 3.4057504763040924e-06, "grad_norm": 3.8887321949005127, "learning_rate": 6.867316659601525e-08, "loss": 0.7434, "mean_token_accuracy": 0.7863706350326538, "num_tokens": 6253260.0, "step": 163 }, { "epoch": 0.02086248568884366, "ewc_loss": 0.0007853655843064189, "ewc_loss_diag": 4.4517219066619873e-07, "ewc_loss_parallel": 3.409533519516117e-06, "grad_norm": 3.045983076095581, "learning_rate": 6.909707503179312e-08, "loss": 0.7251, "mean_token_accuracy": 0.7908272743225098, "num_tokens": 6294773.0, "step": 164 }, { "epoch": 0.02098969596743417, "ewc_loss": 0.0007900435011833906, "ewc_loss_diag": 4.5262277126312256e-07, "ewc_loss_parallel": 3.3800188248278573e-06, "grad_norm": 3.2040486335754395, "learning_rate": 6.9520983467571e-08, "loss": 0.7686, "mean_token_accuracy": 0.7762541174888611, "num_tokens": 6335334.0, "step": 165 }, { "epoch": 0.021116906246024678, "ewc_loss": 0.0007926409598439932, "ewc_loss_diag": 4.5634806156158447e-07, "ewc_loss_parallel": 3.367846375113004e-06, "grad_norm": 3.1339306831359863, "learning_rate": 6.994489190334887e-08, "loss": 0.7036, "mean_token_accuracy": 0.7928107976913452, "num_tokens": 6372255.0, "step": 166 }, { "epoch": 0.02124411652461519, "ewc_loss": 0.0007959491922520101, "ewc_loss_diag": 4.600733518600464e-07, "ewc_loss_parallel": 3.3627818538661813e-06, "grad_norm": 2.898845672607422, "learning_rate": 7.036880033912674e-08, "loss": 0.6804, "mean_token_accuracy": 0.8013847470283508, "num_tokens": 6411971.0, "step": 167 }, { "epoch": 0.0213713268032057, "ewc_loss": 0.0008060320978984237, "ewc_loss_diag": 4.7124922275543213e-07, "ewc_loss_parallel": 3.3491701287857722e-06, "grad_norm": 3.451866388320923, "learning_rate": 7.079270877490461e-08, "loss": 0.7498, "mean_token_accuracy": 0.7807163596153259, "num_tokens": 6450366.0, "step": 168 }, { "epoch": 0.02149853708179621, "ewc_loss": 0.0008130879723466933, "ewc_loss_diag": 4.76837158203125e-07, "ewc_loss_parallel": 3.3625080959609477e-06, "grad_norm": 3.5235302448272705, "learning_rate": 7.121661721068249e-08, "loss": 0.7126, "mean_token_accuracy": 0.7901326417922974, "num_tokens": 6482199.0, "step": 169 }, { "epoch": 0.021625747360386718, "ewc_loss": 0.0008145702304318547, "ewc_loss_diag": 4.76837158203125e-07, "ewc_loss_parallel": 3.3773303584894165e-06, "grad_norm": 3.172593116760254, "learning_rate": 7.164052564646036e-08, "loss": 0.6946, "mean_token_accuracy": 0.7993905544281006, "num_tokens": 6519697.0, "step": 170 }, { "epoch": 0.02175295763897723, "ewc_loss": 0.0008150445646606386, "ewc_loss_diag": 4.76837158203125e-07, "ewc_loss_parallel": 3.3820740554801887e-06, "grad_norm": 3.339770555496216, "learning_rate": 7.206443408223823e-08, "loss": 0.7546, "mean_token_accuracy": 0.7797400951385498, "num_tokens": 6554687.0, "step": 171 }, { "epoch": 0.02188016791756774, "ewc_loss": 0.0008178490679711103, "ewc_loss_diag": 4.76837158203125e-07, "ewc_loss_parallel": 3.4101190067303833e-06, "grad_norm": 3.3182692527770996, "learning_rate": 7.24883425180161e-08, "loss": 0.7174, "mean_token_accuracy": 0.783591091632843, "num_tokens": 6593066.0, "step": 172 }, { "epoch": 0.02200737819615825, "ewc_loss": 0.0008211784297600389, "ewc_loss_diag": 4.76837158203125e-07, "ewc_loss_parallel": 3.443412424530834e-06, "grad_norm": 2.9831488132476807, "learning_rate": 7.291225095379398e-08, "loss": 0.6757, "mean_token_accuracy": 0.8036434650421143, "num_tokens": 6635484.0, "step": 173 }, { "epoch": 0.02213458847474876, "ewc_loss": 0.0008276688167825341, "ewc_loss_diag": 4.805624485015869e-07, "ewc_loss_parallel": 3.4701697586569935e-06, "grad_norm": 3.021592855453491, "learning_rate": 7.333615938957185e-08, "loss": 0.7679, "mean_token_accuracy": 0.7767712473869324, "num_tokens": 6677090.0, "step": 174 }, { "epoch": 0.02226179875333927, "ewc_loss": 0.0008391793817281723, "ewc_loss_diag": 4.880130290985107e-07, "ewc_loss_parallel": 3.508981308186776e-06, "grad_norm": 3.6344549655914307, "learning_rate": 7.376006782534971e-08, "loss": 0.6828, "mean_token_accuracy": 0.7978848218917847, "num_tokens": 6710250.0, "step": 175 }, { "epoch": 0.02238900903192978, "ewc_loss": 0.0008992903167381883, "ewc_loss_diag": 5.401670932769775e-07, "ewc_loss_parallel": 3.576033122953959e-06, "grad_norm": 7.727354049682617, "learning_rate": 7.418397626112758e-08, "loss": 0.7092, "mean_token_accuracy": 0.7892789840698242, "num_tokens": 6752998.0, "step": 176 }, { "epoch": 0.02251621931052029, "ewc_loss": 0.0008629107614979148, "ewc_loss_diag": 4.917383193969727e-07, "ewc_loss_parallel": 3.7081479149492225e-06, "grad_norm": 3.474050998687744, "learning_rate": 7.460788469690545e-08, "loss": 0.7102, "mean_token_accuracy": 0.7953976392745972, "num_tokens": 6789568.0, "step": 177 }, { "epoch": 0.0226434295891108, "ewc_loss": 0.0008871670579537749, "ewc_loss_diag": 5.103647708892822e-07, "ewc_loss_parallel": 3.759976152650779e-06, "grad_norm": 3.842992067337036, "learning_rate": 7.503179313268333e-08, "loss": 0.7505, "mean_token_accuracy": 0.7792120575904846, "num_tokens": 6822810.0, "step": 178 }, { "epoch": 0.02277063986770131, "ewc_loss": 0.0008986742468550801, "ewc_loss_diag": 5.140900611877441e-07, "ewc_loss_parallel": 3.836901214526733e-06, "grad_norm": 3.0844640731811523, "learning_rate": 7.54557015684612e-08, "loss": 0.6895, "mean_token_accuracy": 0.7948135137557983, "num_tokens": 6861598.0, "step": 179 }, { "epoch": 0.02289785014629182, "ewc_loss": 0.0009141332120634615, "ewc_loss_diag": 5.21540641784668e-07, "ewc_loss_parallel": 3.9151968849182595e-06, "grad_norm": 2.9602315425872803, "learning_rate": 7.587961000423907e-08, "loss": 0.7114, "mean_token_accuracy": 0.7916728258132935, "num_tokens": 6900220.0, "step": 180 }, { "epoch": 0.02302506042488233, "ewc_loss": 0.0009300677920691669, "ewc_loss_diag": 5.289912223815918e-07, "ewc_loss_parallel": 3.998248757852707e-06, "grad_norm": 2.9989092350006104, "learning_rate": 7.630351844001694e-08, "loss": 0.8195, "mean_token_accuracy": 0.7608166337013245, "num_tokens": 6946983.0, "step": 181 }, { "epoch": 0.02315227070347284, "ewc_loss": 0.0009544992353767157, "ewc_loss_diag": 5.438923835754395e-07, "ewc_loss_parallel": 4.089975391252665e-06, "grad_norm": 3.1978447437286377, "learning_rate": 7.672742687579482e-08, "loss": 0.7237, "mean_token_accuracy": 0.7887639403343201, "num_tokens": 6986019.0, "step": 182 }, { "epoch": 0.02327948098206335, "ewc_loss": 0.0009779534302651882, "ewc_loss_diag": 5.550682544708252e-07, "ewc_loss_parallel": 4.210076895105885e-06, "grad_norm": 3.331430196762085, "learning_rate": 7.715133531157269e-08, "loss": 0.6971, "mean_token_accuracy": 0.7905606031417847, "num_tokens": 7020722.0, "step": 183 }, { "epoch": 0.02340669126065386, "ewc_loss": 0.0009917845018208027, "ewc_loss_diag": 5.587935447692871e-07, "ewc_loss_parallel": 4.348386937635951e-06, "grad_norm": 3.486039161682129, "learning_rate": 7.757524374735056e-08, "loss": 0.7683, "mean_token_accuracy": 0.77558833360672, "num_tokens": 7062691.0, "step": 184 }, { "epoch": 0.02353390153924437, "ewc_loss": 0.001023265183903277, "ewc_loss_diag": 5.736947059631348e-07, "ewc_loss_parallel": 4.510605776886223e-06, "grad_norm": 3.254021167755127, "learning_rate": 7.799915218312844e-08, "loss": 0.7156, "mean_token_accuracy": 0.7908542156219482, "num_tokens": 7101090.0, "step": 185 }, { "epoch": 0.02366111181783488, "ewc_loss": 0.0010471615241840482, "ewc_loss_diag": 5.811452865600586e-07, "ewc_loss_parallel": 4.673275725508574e-06, "grad_norm": 3.3022661209106445, "learning_rate": 7.842306061890631e-08, "loss": 0.6884, "mean_token_accuracy": 0.7919862270355225, "num_tokens": 7139891.0, "step": 186 }, { "epoch": 0.023788322096425393, "ewc_loss": 0.0010716248070821166, "ewc_loss_diag": 5.885958671569824e-07, "ewc_loss_parallel": 4.841614099859726e-06, "grad_norm": 3.3396034240722656, "learning_rate": 7.88469690546842e-08, "loss": 0.7359, "mean_token_accuracy": 0.7833043932914734, "num_tokens": 7179501.0, "step": 187 }, { "epoch": 0.0239155323750159, "ewc_loss": 0.0011000060476362705, "ewc_loss_diag": 5.997717380523682e-07, "ewc_loss_parallel": 5.010985660192091e-06, "grad_norm": 3.4790401458740234, "learning_rate": 7.927087749046207e-08, "loss": 0.7049, "mean_token_accuracy": 0.7909963726997375, "num_tokens": 7213809.0, "step": 188 }, { "epoch": 0.02404274265360641, "ewc_loss": 0.0011288481764495373, "ewc_loss_diag": 6.109476089477539e-07, "ewc_loss_parallel": 5.1849669944203924e-06, "grad_norm": 3.2439615726470947, "learning_rate": 7.969478592623994e-08, "loss": 0.7791, "mean_token_accuracy": 0.7689516544342041, "num_tokens": 7254493.0, "step": 189 }, { "epoch": 0.024169952932196922, "ewc_loss": 0.001156376558355987, "ewc_loss_diag": 6.221234798431396e-07, "ewc_loss_parallel": 5.345809313439531e-06, "grad_norm": 3.2783260345458984, "learning_rate": 8.011869436201781e-08, "loss": 0.6367, "mean_token_accuracy": 0.8111610412597656, "num_tokens": 7292643.0, "step": 190 }, { "epoch": 0.024297163210787433, "ewc_loss": 0.0011911166366189718, "ewc_loss_diag": 6.407499313354492e-07, "ewc_loss_parallel": 5.502475232788129e-06, "grad_norm": 3.816100835800171, "learning_rate": 8.054260279779568e-08, "loss": 0.7589, "mean_token_accuracy": 0.7790032625198364, "num_tokens": 7329810.0, "step": 191 }, { "epoch": 0.02442437348937794, "ewc_loss": 0.0012127242516726255, "ewc_loss_diag": 6.444752216339111e-07, "ewc_loss_parallel": 5.680403319274774e-06, "grad_norm": 3.199585199356079, "learning_rate": 8.096651123357356e-08, "loss": 0.7446, "mean_token_accuracy": 0.7809116840362549, "num_tokens": 7367630.0, "step": 192 }, { "epoch": 0.02455158376796845, "ewc_loss": 0.001226717373356223, "ewc_loss_diag": 6.444752216339111e-07, "ewc_loss_parallel": 5.8203345361107495e-06, "grad_norm": 3.163254976272583, "learning_rate": 8.139041966935143e-08, "loss": 0.7265, "mean_token_accuracy": 0.785261869430542, "num_tokens": 7411580.0, "step": 193 }, { "epoch": 0.024678794046558962, "ewc_loss": 0.00130407081451267, "ewc_loss_diag": 7.078051567077637e-07, "ewc_loss_parallel": 5.945371412963141e-06, "grad_norm": 7.953936576843262, "learning_rate": 8.181432810512929e-08, "loss": 0.6801, "mean_token_accuracy": 0.7977839708328247, "num_tokens": 7451631.0, "step": 194 }, { "epoch": 0.024806004325149473, "ewc_loss": 0.001292229862883687, "ewc_loss_diag": 6.705522537231445e-07, "ewc_loss_parallel": 6.208431841514539e-06, "grad_norm": 3.7997381687164307, "learning_rate": 8.223823654090716e-08, "loss": 0.6893, "mean_token_accuracy": 0.7951313257217407, "num_tokens": 7493645.0, "step": 195 }, { "epoch": 0.024933214603739984, "ewc_loss": 0.0013065438251942396, "ewc_loss_diag": 6.705522537231445e-07, "ewc_loss_parallel": 6.3515717556583695e-06, "grad_norm": 3.696808099746704, "learning_rate": 8.266214497668503e-08, "loss": 0.649, "mean_token_accuracy": 0.8085569739341736, "num_tokens": 7538042.0, "step": 196 }, { "epoch": 0.02506042488233049, "ewc_loss": 0.0013167806901037693, "ewc_loss_diag": 6.705522537231445e-07, "ewc_loss_parallel": 6.4539394770690706e-06, "grad_norm": 3.227679967880249, "learning_rate": 8.30860534124629e-08, "loss": 0.7113, "mean_token_accuracy": 0.7888268232345581, "num_tokens": 7575374.0, "step": 197 }, { "epoch": 0.025187635160921002, "ewc_loss": 0.001326009165495634, "ewc_loss_diag": 6.742775440216064e-07, "ewc_loss_parallel": 6.508077603939455e-06, "grad_norm": 4.086698532104492, "learning_rate": 8.350996184824078e-08, "loss": 0.7033, "mean_token_accuracy": 0.7906665802001953, "num_tokens": 7606880.0, "step": 198 }, { "epoch": 0.025314845439511513, "ewc_loss": 0.0013424481730908155, "ewc_loss_diag": 6.817281246185303e-07, "ewc_loss_parallel": 6.596173534489935e-06, "grad_norm": 3.3542861938476562, "learning_rate": 8.393387028401865e-08, "loss": 0.6815, "mean_token_accuracy": 0.7993730902671814, "num_tokens": 7644840.0, "step": 199 }, { "epoch": 0.025442055718102024, "ewc_loss": 0.0013600513339042664, "ewc_loss_diag": 6.966292858123779e-07, "ewc_loss_parallel": 6.619616669922834e-06, "grad_norm": 3.1440560817718506, "learning_rate": 8.435777871979652e-08, "loss": 0.7227, "mean_token_accuracy": 0.7848478555679321, "num_tokens": 7683856.0, "step": 200 }, { "epoch": 0.02556926599669253, "ewc_loss": 0.0013803669717162848, "ewc_loss_diag": 7.152557373046875e-07, "ewc_loss_parallel": 6.632039458054351e-06, "grad_norm": 4.124300479888916, "learning_rate": 8.47816871555744e-08, "loss": 0.6793, "mean_token_accuracy": 0.7980621457099915, "num_tokens": 7715306.0, "step": 201 }, { "epoch": 0.025696476275283042, "ewc_loss": 0.0014050937024876475, "ewc_loss_diag": 7.338821887969971e-07, "ewc_loss_parallel": 6.726718311256263e-06, "grad_norm": 3.5776801109313965, "learning_rate": 8.520559559135227e-08, "loss": 0.6402, "mean_token_accuracy": 0.808287501335144, "num_tokens": 7752442.0, "step": 202 }, { "epoch": 0.025823686553873553, "ewc_loss": 0.0014219859149307013, "ewc_loss_diag": 7.450580596923828e-07, "ewc_loss_parallel": 6.781199317629216e-06, "grad_norm": 4.281235218048096, "learning_rate": 8.562950402713014e-08, "loss": 0.7257, "mean_token_accuracy": 0.7874850630760193, "num_tokens": 7799891.0, "step": 203 }, { "epoch": 0.025950896832464064, "ewc_loss": 0.001431055716238916, "ewc_loss_diag": 7.450580596923828e-07, "ewc_loss_parallel": 6.871897312521469e-06, "grad_norm": 3.9965527057647705, "learning_rate": 8.605341246290801e-08, "loss": 0.7123, "mean_token_accuracy": 0.7862036228179932, "num_tokens": 7830696.0, "step": 204 }, { "epoch": 0.026078107111054575, "ewc_loss": 0.0014493801863864064, "ewc_loss_diag": 7.562339305877686e-07, "ewc_loss_parallel": 6.940701041457942e-06, "grad_norm": 4.4486541748046875, "learning_rate": 8.647732089868589e-08, "loss": 0.6844, "mean_token_accuracy": 0.7982789278030396, "num_tokens": 7870968.0, "step": 205 }, { "epoch": 0.026205317389645082, "ewc_loss": 0.0014677023282274604, "ewc_loss_diag": 7.674098014831543e-07, "ewc_loss_parallel": 7.0094815782795195e-06, "grad_norm": 3.373568534851074, "learning_rate": 8.690122933446376e-08, "loss": 0.7091, "mean_token_accuracy": 0.7899554967880249, "num_tokens": 7914418.0, "step": 206 }, { "epoch": 0.026332527668235593, "ewc_loss": 0.0014607867924496531, "ewc_loss_diag": 7.636845111846924e-07, "ewc_loss_parallel": 6.978473265917273e-06, "grad_norm": 3.839318037033081, "learning_rate": 8.732513777024163e-08, "loss": 0.6467, "mean_token_accuracy": 0.8120861053466797, "num_tokens": 7954544.0, "step": 207 }, { "epoch": 0.026459737946826104, "ewc_loss": 0.0014706698711961508, "ewc_loss_diag": 7.748603820800781e-07, "ewc_loss_parallel": 6.962864063098095e-06, "grad_norm": 3.226027011871338, "learning_rate": 8.77490462060195e-08, "loss": 0.6947, "mean_token_accuracy": 0.7959800362586975, "num_tokens": 7997373.0, "step": 208 }, { "epoch": 0.026586948225416615, "ewc_loss": 0.0014759525656700134, "ewc_loss_diag": 7.860362529754639e-07, "ewc_loss_parallel": 6.901248980284436e-06, "grad_norm": 4.113339900970459, "learning_rate": 8.817295464179738e-08, "loss": 0.658, "mean_token_accuracy": 0.7987750768661499, "num_tokens": 8033314.0, "step": 209 }, { "epoch": 0.026714158504007122, "ewc_loss": 0.0014922611881047487, "ewc_loss_diag": 8.009374141693115e-07, "ewc_loss_parallel": 6.9117477323743515e-06, "grad_norm": 3.7786903381347656, "learning_rate": 8.859686307757525e-08, "loss": 0.6779, "mean_token_accuracy": 0.7988284230232239, "num_tokens": 8069362.0, "step": 210 }, { "epoch": 0.026841368782597633, "ewc_loss": 0.0015059902798384428, "ewc_loss_diag": 8.158385753631592e-07, "ewc_loss_parallel": 6.896450486237882e-06, "grad_norm": 3.5333120822906494, "learning_rate": 8.902077151335312e-08, "loss": 0.7103, "mean_token_accuracy": 0.7867426872253418, "num_tokens": 8109126.0, "step": 211 }, { "epoch": 0.026968579061188144, "ewc_loss": 0.0015007528709247708, "ewc_loss_diag": 8.121132850646973e-07, "ewc_loss_parallel": 6.882223260618048e-06, "grad_norm": 3.4356751441955566, "learning_rate": 8.944467994913098e-08, "loss": 0.7629, "mean_token_accuracy": 0.7790102958679199, "num_tokens": 8143083.0, "step": 212 }, { "epoch": 0.027095789339778655, "ewc_loss": 0.0015250500291585922, "ewc_loss_diag": 8.381903171539307e-07, "ewc_loss_parallel": 6.858166671008803e-06, "grad_norm": 4.189538955688477, "learning_rate": 8.986858838490885e-08, "loss": 0.6134, "mean_token_accuracy": 0.8116267919540405, "num_tokens": 8176636.0, "step": 213 }, { "epoch": 0.027222999618369163, "ewc_loss": 0.0015346844447776675, "ewc_loss_diag": 8.456408977508545e-07, "ewc_loss_parallel": 6.878216481709387e-06, "grad_norm": 3.2618613243103027, "learning_rate": 9.029249682068673e-08, "loss": 0.6743, "mean_token_accuracy": 0.7977156043052673, "num_tokens": 8220262.0, "step": 214 }, { "epoch": 0.027350209896959674, "ewc_loss": 0.0015596625162288547, "ewc_loss_diag": 8.791685104370117e-07, "ewc_loss_parallel": 6.8228214331611525e-06, "grad_norm": 5.402998447418213, "learning_rate": 9.07164052564646e-08, "loss": 0.6916, "mean_token_accuracy": 0.7945253849029541, "num_tokens": 8256303.0, "step": 215 }, { "epoch": 0.027477420175550184, "ewc_loss": 0.0015477933920919895, "ewc_loss_diag": 8.568167686462402e-07, "ewc_loss_parallel": 6.894865691720042e-06, "grad_norm": 6.406033515930176, "learning_rate": 9.114031369224247e-08, "loss": 0.6591, "mean_token_accuracy": 0.8061675429344177, "num_tokens": 8293888.0, "step": 216 }, { "epoch": 0.027604630454140695, "ewc_loss": 0.001569757703691721, "ewc_loss_diag": 8.67992639541626e-07, "ewc_loss_parallel": 7.000066489126766e-06, "grad_norm": 6.257645130157471, "learning_rate": 9.156422212802034e-08, "loss": 0.6203, "mean_token_accuracy": 0.813434898853302, "num_tokens": 8335368.0, "step": 217 }, { "epoch": 0.027731840732731206, "ewc_loss": 0.0015819271793588996, "ewc_loss_diag": 8.754432201385498e-07, "ewc_loss_parallel": 7.083614946168382e-06, "grad_norm": 4.831065654754639, "learning_rate": 9.198813056379822e-08, "loss": 0.6738, "mean_token_accuracy": 0.801565945148468, "num_tokens": 8370338.0, "step": 218 }, { "epoch": 0.027859051011321714, "ewc_loss": 0.001589380670338869, "ewc_loss_diag": 8.828938007354736e-07, "ewc_loss_parallel": 7.081855528667802e-06, "grad_norm": 3.8101489543914795, "learning_rate": 9.241203899957609e-08, "loss": 0.6597, "mean_token_accuracy": 0.802204430103302, "num_tokens": 8409904.0, "step": 219 }, { "epoch": 0.027986261289912225, "ewc_loss": 0.0015854202210903168, "ewc_loss_diag": 8.903443813323975e-07, "ewc_loss_parallel": 6.965957709326176e-06, "grad_norm": 3.6238996982574463, "learning_rate": 9.283594743535396e-08, "loss": 0.7866, "mean_token_accuracy": 0.7689913511276245, "num_tokens": 8446579.0, "step": 220 }, { "epoch": 0.028113471568502735, "ewc_loss": 0.0015705181285738945, "ewc_loss_diag": 8.903443813323975e-07, "ewc_loss_parallel": 6.816936547693331e-06, "grad_norm": 2.776280403137207, "learning_rate": 9.325985587113183e-08, "loss": 0.6349, "mean_token_accuracy": 0.8118845224380493, "num_tokens": 8483663.0, "step": 221 }, { "epoch": 0.028240681847093246, "ewc_loss": 0.0015604556538164616, "ewc_loss_diag": 9.015202522277832e-07, "ewc_loss_parallel": 6.601870609301841e-06, "grad_norm": 4.387193202972412, "learning_rate": 9.368376430690971e-08, "loss": 0.6637, "mean_token_accuracy": 0.8006258010864258, "num_tokens": 8516346.0, "step": 222 }, { "epoch": 0.028367892125683754, "ewc_loss": 0.0015554449055343866, "ewc_loss_diag": 9.015202522277832e-07, "ewc_loss_parallel": 6.551763362949714e-06, "grad_norm": 3.8672661781311035, "learning_rate": 9.410767274268758e-08, "loss": 0.6611, "mean_token_accuracy": 0.803450345993042, "num_tokens": 8556914.0, "step": 223 }, { "epoch": 0.028495102404274265, "ewc_loss": 0.0015626871027052402, "ewc_loss_diag": 9.126961231231689e-07, "ewc_loss_parallel": 6.509744707727805e-06, "grad_norm": 4.198505878448486, "learning_rate": 9.453158117846545e-08, "loss": 0.6507, "mean_token_accuracy": 0.8031326532363892, "num_tokens": 8596548.0, "step": 224 }, { "epoch": 0.028622312682864776, "ewc_loss": 0.0015698657371103764, "ewc_loss_diag": 9.201467037200928e-07, "ewc_loss_parallel": 6.505237706733169e-06, "grad_norm": 4.222657680511475, "learning_rate": 9.495548961424333e-08, "loss": 0.6804, "mean_token_accuracy": 0.8008344173431396, "num_tokens": 8634366.0, "step": 225 }, { "epoch": 0.028749522961455286, "ewc_loss": 0.0015777908265590668, "ewc_loss_diag": 9.275972843170166e-07, "ewc_loss_parallel": 6.508194474008633e-06, "grad_norm": 4.408182144165039, "learning_rate": 9.53793980500212e-08, "loss": 0.6201, "mean_token_accuracy": 0.8118242025375366, "num_tokens": 8666480.0, "step": 226 }, { "epoch": 0.028876733240045797, "ewc_loss": 0.0015846670139580965, "ewc_loss_diag": 9.350478649139404e-07, "ewc_loss_parallel": 6.5006615841411985e-06, "grad_norm": 6.691761016845703, "learning_rate": 9.580330648579907e-08, "loss": 0.7702, "mean_token_accuracy": 0.7763980627059937, "num_tokens": 8705880.0, "step": 227 }, { "epoch": 0.029003943518636305, "ewc_loss": 0.0015949581284075975, "ewc_loss_diag": 9.387731552124023e-07, "ewc_loss_parallel": 6.565425337612396e-06, "grad_norm": 4.6664252281188965, "learning_rate": 9.622721492157694e-08, "loss": 0.6315, "mean_token_accuracy": 0.8093459010124207, "num_tokens": 8743772.0, "step": 228 }, { "epoch": 0.029131153797226816, "ewc_loss": 0.0015928801149129868, "ewc_loss_diag": 9.387731552124023e-07, "ewc_loss_parallel": 6.54464611216099e-06, "grad_norm": 3.590668201446533, "learning_rate": 9.665112335735482e-08, "loss": 0.6564, "mean_token_accuracy": 0.7998962998390198, "num_tokens": 8775919.0, "step": 229 }, { "epoch": 0.029258364075817327, "ewc_loss": 0.0015854643424972892, "ewc_loss_diag": 9.424984455108643e-07, "ewc_loss_parallel": 6.432341251638718e-06, "grad_norm": 4.198570728302002, "learning_rate": 9.707503179313267e-08, "loss": 0.6052, "mean_token_accuracy": 0.8150609135627747, "num_tokens": 8809336.0, "step": 230 }, { "epoch": 0.029385574354407838, "ewc_loss": 0.001604076474905014, "ewc_loss_diag": 9.685754776000977e-07, "ewc_loss_parallel": 6.351433967211051e-06, "grad_norm": 8.675256729125977, "learning_rate": 9.749894022891055e-08, "loss": 0.6591, "mean_token_accuracy": 0.8020392656326294, "num_tokens": 8845952.0, "step": 231 }, { "epoch": 0.029512784632998345, "ewc_loss": 0.0015921733574941754, "ewc_loss_diag": 9.499490261077881e-07, "ewc_loss_parallel": 6.423137165256776e-06, "grad_norm": 4.200858116149902, "learning_rate": 9.792284866468842e-08, "loss": 0.679, "mean_token_accuracy": 0.7932847142219543, "num_tokens": 8889801.0, "step": 232 }, { "epoch": 0.029639994911588856, "ewc_loss": 0.0015842781867831945, "ewc_loss_diag": 9.499490261077881e-07, "ewc_loss_parallel": 6.344185294437921e-06, "grad_norm": 4.52830171585083, "learning_rate": 9.834675710046629e-08, "loss": 0.6385, "mean_token_accuracy": 0.8051337003707886, "num_tokens": 8925429.0, "step": 233 }, { "epoch": 0.029767205190179367, "ewc_loss": 0.0015888424823060632, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 6.275387477217009e-06, "grad_norm": 3.894378185272217, "learning_rate": 9.877066553624416e-08, "loss": 0.6708, "mean_token_accuracy": 0.798152506351471, "num_tokens": 8963360.0, "step": 234 }, { "epoch": 0.029894415468769878, "ewc_loss": 0.0015787893207743764, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 6.174856025609188e-06, "grad_norm": 5.256947040557861, "learning_rate": 9.919457397202204e-08, "loss": 0.6822, "mean_token_accuracy": 0.7986240983009338, "num_tokens": 8998314.0, "step": 235 }, { "epoch": 0.030021625747360385, "ewc_loss": 0.0015748180449008942, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 6.135142484708922e-06, "grad_norm": 4.569917678833008, "learning_rate": 9.961848240779991e-08, "loss": 0.7514, "mean_token_accuracy": 0.7787885665893555, "num_tokens": 9034804.0, "step": 236 }, { "epoch": 0.030148836025950896, "ewc_loss": 0.0015686795813962817, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 6.073758413549513e-06, "grad_norm": 3.552046775817871, "learning_rate": 1.0004239084357778e-07, "loss": 0.6742, "mean_token_accuracy": 0.7994645833969116, "num_tokens": 9070840.0, "step": 237 }, { "epoch": 0.030276046304541407, "ewc_loss": 0.0015588535461574793, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 5.975498424959369e-06, "grad_norm": 4.858114719390869, "learning_rate": 1.0046629927935566e-07, "loss": 0.6565, "mean_token_accuracy": 0.8038707971572876, "num_tokens": 9109370.0, "step": 238 }, { "epoch": 0.030403256583131918, "ewc_loss": 0.0015588637907058, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 5.975601197860669e-06, "grad_norm": 4.6323394775390625, "learning_rate": 1.0089020771513353e-07, "loss": 0.6003, "mean_token_accuracy": 0.8197939395904541, "num_tokens": 9148191.0, "step": 239 }, { "epoch": 0.03053046686172243, "ewc_loss": 0.0015564245404675603, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 5.951208549959119e-06, "grad_norm": 4.526782035827637, "learning_rate": 1.013141161509114e-07, "loss": 0.6669, "mean_token_accuracy": 0.8008451461791992, "num_tokens": 9182923.0, "step": 240 }, { "epoch": 0.030657677140312936, "ewc_loss": 0.0015535315033048391, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 5.9222775234957226e-06, "grad_norm": 5.087978363037109, "learning_rate": 1.0173802458668927e-07, "loss": 0.6662, "mean_token_accuracy": 0.7973479628562927, "num_tokens": 9215882.0, "step": 241 }, { "epoch": 0.030784887418903447, "ewc_loss": 0.0015523077454417944, "ewc_loss_diag": 9.611248970031738e-07, "ewc_loss_parallel": 5.910040272283368e-06, "grad_norm": 4.812439441680908, "learning_rate": 1.0216193302246715e-07, "loss": 0.6982, "mean_token_accuracy": 0.7863467931747437, "num_tokens": 9249718.0, "step": 242 }, { "epoch": 0.030912097697493958, "ewc_loss": 0.0015590048860758543, "ewc_loss_diag": 9.685754776000977e-07, "ewc_loss_parallel": 5.9007170420954935e-06, "grad_norm": 3.8164541721343994, "learning_rate": 1.0258584145824502e-07, "loss": 0.6212, "mean_token_accuracy": 0.8128457069396973, "num_tokens": 9291146.0, "step": 243 }, { "epoch": 0.03103930797608447, "ewc_loss": 0.0015660746721550822, "ewc_loss_diag": 9.834766387939453e-07, "ewc_loss_parallel": 5.818827503389912e-06, "grad_norm": 4.827563762664795, "learning_rate": 1.0300974989402289e-07, "loss": 0.6124, "mean_token_accuracy": 0.8128111362457275, "num_tokens": 9326403.0, "step": 244 }, { "epoch": 0.031166518254674976, "ewc_loss": 0.0015635081799700856, "ewc_loss_diag": 9.834766387939453e-07, "ewc_loss_parallel": 5.793162927147932e-06, "grad_norm": 4.466235637664795, "learning_rate": 1.0343365832980076e-07, "loss": 0.6659, "mean_token_accuracy": 0.8009451031684875, "num_tokens": 9368491.0, "step": 245 }, { "epoch": 0.03129372853326549, "ewc_loss": 0.0015594502910971642, "ewc_loss_diag": 9.834766387939453e-07, "ewc_loss_parallel": 5.7525844567862805e-06, "grad_norm": 5.429673671722412, "learning_rate": 1.0385756676557864e-07, "loss": 0.6614, "mean_token_accuracy": 0.8024076223373413, "num_tokens": 9409041.0, "step": 246 }, { "epoch": 0.031420938811855995, "ewc_loss": 0.001552697503939271, "ewc_loss_diag": 9.760260581970215e-07, "ewc_loss_parallel": 5.761349711974617e-06, "grad_norm": 4.874011039733887, "learning_rate": 1.0428147520135651e-07, "loss": 0.6342, "mean_token_accuracy": 0.8092062473297119, "num_tokens": 9448591.0, "step": 247 }, { "epoch": 0.03154814909044651, "ewc_loss": 0.0015581888146698475, "ewc_loss_diag": 9.834766387939453e-07, "ewc_loss_parallel": 5.739968855777988e-06, "grad_norm": 4.125752925872803, "learning_rate": 1.0470538363713437e-07, "loss": 0.6649, "mean_token_accuracy": 0.7982169985771179, "num_tokens": 9486604.0, "step": 248 }, { "epoch": 0.031675359369037016, "ewc_loss": 0.001551889581605792, "ewc_loss_diag": 9.834766387939453e-07, "ewc_loss_parallel": 5.676977252733195e-06, "grad_norm": 5.329333782196045, "learning_rate": 1.0512929207291224e-07, "loss": 0.6704, "mean_token_accuracy": 0.7944692373275757, "num_tokens": 9525285.0, "step": 249 }, { "epoch": 0.03180256964762753, "ewc_loss": 0.0015592249110341072, "ewc_loss_diag": 9.909272193908691e-07, "ewc_loss_parallel": 5.6740359468676616e-06, "grad_norm": 6.602771759033203, "learning_rate": 1.0555320050869011e-07, "loss": 0.6517, "mean_token_accuracy": 0.7992502450942993, "num_tokens": 9557898.0, "step": 250 }, { "epoch": 0.03192977992621804, "ewc_loss": 0.0015624910593032837, "ewc_loss_diag": 9.909272193908691e-07, "ewc_loss_parallel": 5.706698175345082e-06, "grad_norm": 4.72873592376709, "learning_rate": 1.0597710894446799e-07, "loss": 0.7096, "mean_token_accuracy": 0.7837443351745605, "num_tokens": 9590438.0, "step": 251 }, { "epoch": 0.032056990204808546, "ewc_loss": 0.0015582866035401821, "ewc_loss_diag": 9.909272193908691e-07, "ewc_loss_parallel": 5.664653144776821e-06, "grad_norm": 3.5739879608154297, "learning_rate": 1.0640101738024586e-07, "loss": 0.6217, "mean_token_accuracy": 0.8118619918823242, "num_tokens": 9631402.0, "step": 252 }, { "epoch": 0.03218420048339906, "ewc_loss": 0.0015552768018096685, "ewc_loss_diag": 9.98377799987793e-07, "ewc_loss_parallel": 5.558260909310775e-06, "grad_norm": 7.904615879058838, "learning_rate": 1.0682492581602373e-07, "loss": 0.6029, "mean_token_accuracy": 0.8178318738937378, "num_tokens": 9663702.0, "step": 253 }, { "epoch": 0.03231141076198957, "ewc_loss": 0.0015553080011159182, "ewc_loss_diag": 9.909272193908691e-07, "ewc_loss_parallel": 5.634866283799056e-06, "grad_norm": 5.1785759925842285, "learning_rate": 1.072488342518016e-07, "loss": 0.5909, "mean_token_accuracy": 0.8178805708885193, "num_tokens": 9698969.0, "step": 254 }, { "epoch": 0.03243862104058008, "ewc_loss": 0.0015604998916387558, "ewc_loss_diag": 9.98377799987793e-07, "ewc_loss_parallel": 5.610491371044191e-06, "grad_norm": 3.981520414352417, "learning_rate": 1.0767274268757948e-07, "loss": 0.6604, "mean_token_accuracy": 0.7988193035125732, "num_tokens": 9738075.0, "step": 255 }, { "epoch": 0.03256583131917059, "ewc_loss": 0.001550584565848112, "ewc_loss_diag": 9.98377799987793e-07, "ewc_loss_parallel": 5.511338713404257e-06, "grad_norm": 3.984198570251465, "learning_rate": 1.0809665112335735e-07, "loss": 0.665, "mean_token_accuracy": 0.8012773990631104, "num_tokens": 9782675.0, "step": 256 }, { "epoch": 0.0326930415977611, "ewc_loss": 0.0015447591431438923, "ewc_loss_diag": 9.98377799987793e-07, "ewc_loss_parallel": 5.453084213513648e-06, "grad_norm": 4.242582321166992, "learning_rate": 1.0852055955913522e-07, "loss": 0.5653, "mean_token_accuracy": 0.8263108730316162, "num_tokens": 9820585.0, "step": 257 }, { "epoch": 0.03282025187635161, "ewc_loss": 0.0015503638423979282, "ewc_loss_diag": 1.0058283805847168e-06, "ewc_loss_parallel": 5.432837951957481e-06, "grad_norm": 4.469728946685791, "learning_rate": 1.089444679949131e-07, "loss": 0.6899, "mean_token_accuracy": 0.7926496267318726, "num_tokens": 9860693.0, "step": 258 }, { "epoch": 0.03294746215494212, "ewc_loss": 0.0015482462476938963, "ewc_loss_diag": 1.0058283805847168e-06, "ewc_loss_parallel": 5.411661732068751e-06, "grad_norm": 4.12086820602417, "learning_rate": 1.0936837643069097e-07, "loss": 0.6231, "mean_token_accuracy": 0.8114787340164185, "num_tokens": 9902629.0, "step": 259 }, { "epoch": 0.03307467243353263, "ewc_loss": 0.0015537745784968138, "ewc_loss_diag": 1.0132789611816406e-06, "ewc_loss_parallel": 5.390651494963095e-06, "grad_norm": 4.5203938484191895, "learning_rate": 1.0979228486646884e-07, "loss": 0.5948, "mean_token_accuracy": 0.8156590461730957, "num_tokens": 9937304.0, "step": 260 }, { "epoch": 0.03320188271212314, "ewc_loss": 0.0015552400145679712, "ewc_loss_diag": 1.0132789611816406e-06, "ewc_loss_parallel": 5.405305728345411e-06, "grad_norm": 3.954284906387329, "learning_rate": 1.1021619330224671e-07, "loss": 0.7279, "mean_token_accuracy": 0.7817083597183228, "num_tokens": 9976911.0, "step": 261 }, { "epoch": 0.03332909299071365, "ewc_loss": 0.00157600874081254, "ewc_loss_diag": 1.0356307029724121e-06, "ewc_loss_parallel": 5.384110409067944e-06, "grad_norm": 3.9985201358795166, "learning_rate": 1.1064010173802458e-07, "loss": 0.6627, "mean_token_accuracy": 0.8010083436965942, "num_tokens": 10015740.0, "step": 262 }, { "epoch": 0.03345630326930416, "ewc_loss": 0.001558720599859953, "ewc_loss_diag": 1.0207295417785645e-06, "ewc_loss_parallel": 5.363817308534635e-06, "grad_norm": 4.078646659851074, "learning_rate": 1.1106401017380246e-07, "loss": 0.6001, "mean_token_accuracy": 0.8181617856025696, "num_tokens": 10053054.0, "step": 263 }, { "epoch": 0.03358351354789467, "ewc_loss": 0.0015667395200580359, "ewc_loss_diag": 1.0281801223754883e-06, "ewc_loss_parallel": 5.367712219594978e-06, "grad_norm": 5.82050895690918, "learning_rate": 1.1148791860958033e-07, "loss": 0.6944, "mean_token_accuracy": 0.7867136001586914, "num_tokens": 10082088.0, "step": 264 }, { "epoch": 0.03371072382648518, "ewc_loss": 0.0015919626457616687, "ewc_loss_diag": 1.043081283569336e-06, "ewc_loss_parallel": 5.467356004373869e-06, "grad_norm": 4.121659278869629, "learning_rate": 1.119118270453582e-07, "loss": 0.6691, "mean_token_accuracy": 0.7969461679458618, "num_tokens": 10121122.0, "step": 265 }, { "epoch": 0.03383793410507569, "ewc_loss": 0.0015920009464025497, "ewc_loss_diag": 1.043081283569336e-06, "ewc_loss_parallel": 5.467739356390666e-06, "grad_norm": 7.865329265594482, "learning_rate": 1.1233573548113607e-07, "loss": 0.6634, "mean_token_accuracy": 0.8011429905891418, "num_tokens": 10161017.0, "step": 266 }, { "epoch": 0.0339651443836662, "ewc_loss": 0.0016137060010805726, "ewc_loss_diag": 1.0505318641662598e-06, "ewc_loss_parallel": 5.60849548492115e-06, "grad_norm": 4.458491325378418, "learning_rate": 1.1275964391691393e-07, "loss": 0.6303, "mean_token_accuracy": 0.8108820915222168, "num_tokens": 10204913.0, "step": 267 }, { "epoch": 0.03409235466225671, "ewc_loss": 0.001618530135601759, "ewc_loss_diag": 1.0579824447631836e-06, "ewc_loss_parallel": 5.580443030339666e-06, "grad_norm": 4.175411224365234, "learning_rate": 1.131835523526918e-07, "loss": 0.6783, "mean_token_accuracy": 0.7925974130630493, "num_tokens": 10245154.0, "step": 268 }, { "epoch": 0.03421956494084722, "ewc_loss": 0.0016284503508359194, "ewc_loss_diag": 1.0728836059570312e-06, "ewc_loss_parallel": 5.527056600840297e-06, "grad_norm": 7.609451770782471, "learning_rate": 1.1360746078846968e-07, "loss": 0.6854, "mean_token_accuracy": 0.7924753427505493, "num_tokens": 10283006.0, "step": 269 }, { "epoch": 0.03434677521943773, "ewc_loss": 0.0016467771492898464, "ewc_loss_diag": 1.080334186553955e-06, "ewc_loss_parallel": 5.634031822410179e-06, "grad_norm": 6.608564853668213, "learning_rate": 1.1403136922424755e-07, "loss": 0.6443, "mean_token_accuracy": 0.8035182952880859, "num_tokens": 10316689.0, "step": 270 }, { "epoch": 0.03447398549802824, "ewc_loss": 0.0016606297576799989, "ewc_loss_diag": 1.087784767150879e-06, "ewc_loss_parallel": 5.696263542631641e-06, "grad_norm": 4.293202877044678, "learning_rate": 1.1445527766002542e-07, "loss": 0.6392, "mean_token_accuracy": 0.8030403852462769, "num_tokens": 10358145.0, "step": 271 }, { "epoch": 0.03460119577661875, "ewc_loss": 0.001651258673518896, "ewc_loss_diag": 1.087784767150879e-06, "ewc_loss_parallel": 5.602552846539766e-06, "grad_norm": 4.708072185516357, "learning_rate": 1.148791860958033e-07, "loss": 0.7394, "mean_token_accuracy": 0.7789455056190491, "num_tokens": 10393424.0, "step": 272 }, { "epoch": 0.034728406055209264, "ewc_loss": 0.0016488904366269708, "ewc_loss_diag": 1.087784767150879e-06, "ewc_loss_parallel": 5.5788700592529494e-06, "grad_norm": 3.8487484455108643, "learning_rate": 1.1530309453158117e-07, "loss": 0.6543, "mean_token_accuracy": 0.8012695908546448, "num_tokens": 10433560.0, "step": 273 }, { "epoch": 0.03485561633379977, "ewc_loss": 0.0016506676329299808, "ewc_loss_diag": 1.0952353477478027e-06, "ewc_loss_parallel": 5.5203481679200195e-06, "grad_norm": 4.481837749481201, "learning_rate": 1.1572700296735904e-07, "loss": 0.6505, "mean_token_accuracy": 0.8016635179519653, "num_tokens": 10472024.0, "step": 274 }, { "epoch": 0.03498282661239028, "ewc_loss": 0.0016498616896569729, "ewc_loss_diag": 1.0952353477478027e-06, "ewc_loss_parallel": 5.512288225872908e-06, "grad_norm": 7.139129161834717, "learning_rate": 1.1615091140313691e-07, "loss": 0.6898, "mean_token_accuracy": 0.7920454740524292, "num_tokens": 10510874.0, "step": 275 }, { "epoch": 0.03511003689098079, "ewc_loss": 0.0016695563681423664, "ewc_loss_diag": 1.1026859283447266e-06, "ewc_loss_parallel": 5.6329413382627536e-06, "grad_norm": 6.372291088104248, "learning_rate": 1.1657481983891479e-07, "loss": 0.6275, "mean_token_accuracy": 0.8096054792404175, "num_tokens": 10552411.0, "step": 276 }, { "epoch": 0.0352372471695713, "ewc_loss": 0.0016778477001935244, "ewc_loss_diag": 1.1026859283447266e-06, "ewc_loss_parallel": 5.715854513255181e-06, "grad_norm": 4.074565410614014, "learning_rate": 1.1699872827469266e-07, "loss": 0.6432, "mean_token_accuracy": 0.8059550523757935, "num_tokens": 10591537.0, "step": 277 }, { "epoch": 0.03536445744816181, "ewc_loss": 0.00167941115796566, "ewc_loss_diag": 1.1101365089416504e-06, "ewc_loss_parallel": 5.6551948546257336e-06, "grad_norm": 3.4884088039398193, "learning_rate": 1.1742263671047053e-07, "loss": 0.6508, "mean_token_accuracy": 0.8035869002342224, "num_tokens": 10632228.0, "step": 278 }, { "epoch": 0.03549166772675232, "ewc_loss": 0.0016725125024095178, "ewc_loss_diag": 1.1101365089416504e-06, "ewc_loss_parallel": 5.586208772001555e-06, "grad_norm": 6.604696750640869, "learning_rate": 1.178465451462484e-07, "loss": 0.7066, "mean_token_accuracy": 0.7846832275390625, "num_tokens": 10672829.0, "step": 279 }, { "epoch": 0.03561887800534283, "ewc_loss": 0.0016852422850206494, "ewc_loss_diag": 1.1101365089416504e-06, "ewc_loss_parallel": 5.713507107429905e-06, "grad_norm": 5.546504497528076, "learning_rate": 1.1827045358202628e-07, "loss": 0.68, "mean_token_accuracy": 0.8001009821891785, "num_tokens": 10706146.0, "step": 280 }, { "epoch": 0.035746088283933344, "ewc_loss": 0.0016900496557354927, "ewc_loss_diag": 1.1175870895385742e-06, "ewc_loss_parallel": 5.761579814134166e-06, "grad_norm": 4.286739349365234, "learning_rate": 1.1869436201780415e-07, "loss": 0.5977, "mean_token_accuracy": 0.8196762800216675, "num_tokens": 10744112.0, "step": 281 }, { "epoch": 0.03587329856252385, "ewc_loss": 0.0016937539912760258, "ewc_loss_diag": 1.125037670135498e-06, "ewc_loss_parallel": 5.722329206037102e-06, "grad_norm": 4.425850868225098, "learning_rate": 1.1911827045358202e-07, "loss": 0.6282, "mean_token_accuracy": 0.8135502338409424, "num_tokens": 10783726.0, "step": 282 }, { "epoch": 0.03600050884111436, "ewc_loss": 0.0016861023614183068, "ewc_loss_diag": 1.1175870895385742e-06, "ewc_loss_parallel": 5.7221077440772206e-06, "grad_norm": 4.500190258026123, "learning_rate": 1.195421788893599e-07, "loss": 0.6927, "mean_token_accuracy": 0.7934533953666687, "num_tokens": 10824476.0, "step": 283 }, { "epoch": 0.036127719119704874, "ewc_loss": 0.0017103919526562095, "ewc_loss_diag": 1.1399388313293457e-06, "ewc_loss_parallel": 5.736121693189489e-06, "grad_norm": 4.408007621765137, "learning_rate": 1.1996608732513778e-07, "loss": 0.6517, "mean_token_accuracy": 0.8025617003440857, "num_tokens": 10859997.0, "step": 284 }, { "epoch": 0.03625492939829538, "ewc_loss": 0.001719904481433332, "ewc_loss_diag": 1.1473894119262695e-06, "ewc_loss_parallel": 5.754953235737048e-06, "grad_norm": 4.536210536956787, "learning_rate": 1.2038999576091563e-07, "loss": 0.6227, "mean_token_accuracy": 0.8114798069000244, "num_tokens": 10898800.0, "step": 285 }, { "epoch": 0.036382139676885895, "ewc_loss": 0.0017243430484086275, "ewc_loss_diag": 1.1473894119262695e-06, "ewc_loss_parallel": 5.7993383961729705e-06, "grad_norm": 4.559199810028076, "learning_rate": 1.208139041966935e-07, "loss": 0.5886, "mean_token_accuracy": 0.821333646774292, "num_tokens": 10937462.0, "step": 286 }, { "epoch": 0.0365093499554764, "ewc_loss": 0.0017365082167088985, "ewc_loss_diag": 1.1548399925231934e-06, "ewc_loss_parallel": 5.844697170687141e-06, "grad_norm": 4.77649450302124, "learning_rate": 1.2123781263247137e-07, "loss": 0.6494, "mean_token_accuracy": 0.7999516725540161, "num_tokens": 10970739.0, "step": 287 }, { "epoch": 0.03663656023406691, "ewc_loss": 0.0017420417862012982, "ewc_loss_diag": 1.1548399925231934e-06, "ewc_loss_parallel": 5.9000321925850585e-06, "grad_norm": 4.1860809326171875, "learning_rate": 1.2166172106824924e-07, "loss": 0.7016, "mean_token_accuracy": 0.7945982813835144, "num_tokens": 11003000.0, "step": 288 }, { "epoch": 0.036763770512657425, "ewc_loss": 0.0017583849839866161, "ewc_loss_diag": 1.169741153717041e-06, "ewc_loss_parallel": 5.910876552661648e-06, "grad_norm": 4.512075424194336, "learning_rate": 1.2208562950402712e-07, "loss": 0.6065, "mean_token_accuracy": 0.8122104406356812, "num_tokens": 11039665.0, "step": 289 }, { "epoch": 0.03689098079124793, "ewc_loss": 0.001770450733602047, "ewc_loss_diag": 1.1771917343139648e-06, "ewc_loss_parallel": 5.955239430477377e-06, "grad_norm": 3.893357753753662, "learning_rate": 1.22509537939805e-07, "loss": 0.6339, "mean_token_accuracy": 0.8021834492683411, "num_tokens": 11078368.0, "step": 290 }, { "epoch": 0.03701819106983844, "ewc_loss": 0.0017867038259282708, "ewc_loss_diag": 1.1920928955078125e-06, "ewc_loss_parallel": 5.9651829360518605e-06, "grad_norm": 4.0189080238342285, "learning_rate": 1.2293344637558286e-07, "loss": 0.6599, "mean_token_accuracy": 0.801239550113678, "num_tokens": 11122654.0, "step": 291 }, { "epoch": 0.037145401348428954, "ewc_loss": 0.0017893924377858639, "ewc_loss_diag": 1.1920928955078125e-06, "ewc_loss_parallel": 5.992068963678321e-06, "grad_norm": 4.432868003845215, "learning_rate": 1.2335735481136073e-07, "loss": 0.6142, "mean_token_accuracy": 0.8070754408836365, "num_tokens": 11162622.0, "step": 292 }, { "epoch": 0.03727261162701946, "ewc_loss": 0.0017974943621084094, "ewc_loss_diag": 1.1920928955078125e-06, "ewc_loss_parallel": 6.073088115954306e-06, "grad_norm": 5.195871829986572, "learning_rate": 1.237812632471386e-07, "loss": 0.6262, "mean_token_accuracy": 0.8102418184280396, "num_tokens": 11202564.0, "step": 293 }, { "epoch": 0.037399821905609976, "ewc_loss": 0.001847691833972931, "ewc_loss_diag": 1.2293457984924316e-06, "ewc_loss_parallel": 6.193592980707763e-06, "grad_norm": 4.661961078643799, "learning_rate": 1.2420517168291648e-07, "loss": 0.6442, "mean_token_accuracy": 0.8010966777801514, "num_tokens": 11243422.0, "step": 294 }, { "epoch": 0.03752703218420048, "ewc_loss": 0.0018538732547312975, "ewc_loss_diag": 1.2293457984924316e-06, "ewc_loss_parallel": 6.25540678811376e-06, "grad_norm": 4.468352794647217, "learning_rate": 1.2462908011869435e-07, "loss": 0.6257, "mean_token_accuracy": 0.806810736656189, "num_tokens": 11280867.0, "step": 295 }, { "epoch": 0.03765424246279099, "ewc_loss": 0.0018574109999462962, "ewc_loss_diag": 1.2293457984924316e-06, "ewc_loss_parallel": 6.290784767770674e-06, "grad_norm": 4.900647163391113, "learning_rate": 1.2505298855447223e-07, "loss": 0.5751, "mean_token_accuracy": 0.822432279586792, "num_tokens": 11318454.0, "step": 296 }, { "epoch": 0.037781452741381505, "ewc_loss": 0.0018776440992951393, "ewc_loss_diag": 1.2442469596862793e-06, "ewc_loss_parallel": 6.340528216242092e-06, "grad_norm": 5.490735054016113, "learning_rate": 1.254768969902501e-07, "loss": 0.7153, "mean_token_accuracy": 0.7849355936050415, "num_tokens": 11354721.0, "step": 297 }, { "epoch": 0.03790866301997201, "ewc_loss": 0.0018858547555282712, "ewc_loss_diag": 1.2442469596862793e-06, "ewc_loss_parallel": 6.422634669434046e-06, "grad_norm": 7.498607158660889, "learning_rate": 1.2590080542602797e-07, "loss": 0.6724, "mean_token_accuracy": 0.7967195510864258, "num_tokens": 11385938.0, "step": 298 }, { "epoch": 0.03803587329856253, "ewc_loss": 0.0019182239193469286, "ewc_loss_diag": 1.259148120880127e-06, "ewc_loss_parallel": 6.593737907678587e-06, "grad_norm": 4.477560997009277, "learning_rate": 1.2632471386180584e-07, "loss": 0.6517, "mean_token_accuracy": 0.8020834922790527, "num_tokens": 11424911.0, "step": 299 }, { "epoch": 0.038163083577153034, "ewc_loss": 0.0019299916457384825, "ewc_loss_diag": 1.2740492820739746e-06, "ewc_loss_parallel": 6.5588269535510335e-06, "grad_norm": 5.057866096496582, "learning_rate": 1.2674862229758372e-07, "loss": 0.621, "mean_token_accuracy": 0.8114095330238342, "num_tokens": 11460580.0, "step": 300 }, { "epoch": 0.03829029385574354, "ewc_loss": 0.0019300032872706652, "ewc_loss_diag": 1.2740492820739746e-06, "ewc_loss_parallel": 6.5589433688728604e-06, "grad_norm": 4.0948381423950195, "learning_rate": 1.271725307333616e-07, "loss": 0.611, "mean_token_accuracy": 0.8145578503608704, "num_tokens": 11501031.0, "step": 301 }, { "epoch": 0.038417504134334056, "ewc_loss": 0.0019219373352825642, "ewc_loss_diag": 1.2740492820739746e-06, "ewc_loss_parallel": 6.478284831246128e-06, "grad_norm": 4.080338954925537, "learning_rate": 1.2759643916913946e-07, "loss": 0.6207, "mean_token_accuracy": 0.8104627132415771, "num_tokens": 11544053.0, "step": 302 }, { "epoch": 0.03854471441292456, "ewc_loss": 0.0019183261319994926, "ewc_loss_diag": 1.2740492820739746e-06, "ewc_loss_parallel": 6.4421719798701815e-06, "grad_norm": 4.7597975730896, "learning_rate": 1.2802034760491733e-07, "loss": 0.6361, "mean_token_accuracy": 0.8055626749992371, "num_tokens": 11576944.0, "step": 303 }, { "epoch": 0.03867192469151508, "ewc_loss": 0.0019461903721094131, "ewc_loss_diag": 1.296401023864746e-06, "ewc_loss_parallel": 6.491932253993582e-06, "grad_norm": 4.8938188552856445, "learning_rate": 1.284442560406952e-07, "loss": 0.6869, "mean_token_accuracy": 0.7905875444412231, "num_tokens": 11619735.0, "step": 304 }, { "epoch": 0.038799134970105585, "ewc_loss": 0.001998110208660364, "ewc_loss_diag": 1.341104507446289e-06, "ewc_loss_parallel": 6.5533677116036415e-06, "grad_norm": 8.373167037963867, "learning_rate": 1.2886816447647308e-07, "loss": 0.7231, "mean_token_accuracy": 0.7784852981567383, "num_tokens": 11655025.0, "step": 305 }, { "epoch": 0.03892634524869609, "ewc_loss": 0.001976409927010536, "ewc_loss_diag": 1.296401023864746e-06, "ewc_loss_parallel": 6.7941273300675675e-06, "grad_norm": 4.80946683883667, "learning_rate": 1.2929207291225095e-07, "loss": 0.6216, "mean_token_accuracy": 0.8111472129821777, "num_tokens": 11691970.0, "step": 306 }, { "epoch": 0.03905355552728661, "ewc_loss": 0.0019859462045133114, "ewc_loss_diag": 1.30385160446167e-06, "ewc_loss_parallel": 6.813198524469044e-06, "grad_norm": 5.662745475769043, "learning_rate": 1.2971598134802882e-07, "loss": 0.6374, "mean_token_accuracy": 0.8052245378494263, "num_tokens": 11729561.0, "step": 307 }, { "epoch": 0.039180765805877114, "ewc_loss": 0.0019890572875738144, "ewc_loss_diag": 1.30385160446167e-06, "ewc_loss_parallel": 6.844308245490538e-06, "grad_norm": 4.358199596405029, "learning_rate": 1.301398897838067e-07, "loss": 0.6549, "mean_token_accuracy": 0.7981117963790894, "num_tokens": 11765707.0, "step": 308 }, { "epoch": 0.03930797608446762, "ewc_loss": 0.0020042313262820244, "ewc_loss_diag": 1.3262033462524414e-06, "ewc_loss_parallel": 6.767166723875562e-06, "grad_norm": 4.012332916259766, "learning_rate": 1.3056379821958457e-07, "loss": 0.6642, "mean_token_accuracy": 0.8024654984474182, "num_tokens": 11806223.0, "step": 309 }, { "epoch": 0.039435186363058136, "ewc_loss": 0.0020039312075823545, "ewc_loss_diag": 1.3336539268493652e-06, "ewc_loss_parallel": 6.687872428301489e-06, "grad_norm": 5.121763706207275, "learning_rate": 1.3098770665536244e-07, "loss": 0.6195, "mean_token_accuracy": 0.8118550777435303, "num_tokens": 11845477.0, "step": 310 }, { "epoch": 0.039562396641648644, "ewc_loss": 0.002032008022069931, "ewc_loss_diag": 1.3560056686401367e-06, "ewc_loss_parallel": 6.739757736795582e-06, "grad_norm": 5.768850803375244, "learning_rate": 1.3141161509114031e-07, "loss": 0.6038, "mean_token_accuracy": 0.8097208738327026, "num_tokens": 11877973.0, "step": 311 }, { "epoch": 0.03968960692023916, "ewc_loss": 0.002034232020378113, "ewc_loss_diag": 1.3485550880432129e-06, "ewc_loss_parallel": 6.838291028543608e-06, "grad_norm": 5.525730133056641, "learning_rate": 1.3183552352691819e-07, "loss": 0.6102, "mean_token_accuracy": 0.8133679032325745, "num_tokens": 11912093.0, "step": 312 }, { "epoch": 0.039816817198829665, "ewc_loss": 0.002057216828688979, "ewc_loss_diag": 1.3634562492370605e-06, "ewc_loss_parallel": 6.915551239217166e-06, "grad_norm": 4.692998886108398, "learning_rate": 1.3225943196269603e-07, "loss": 0.6362, "mean_token_accuracy": 0.8093804717063904, "num_tokens": 11952541.0, "step": 313 }, { "epoch": 0.03994402747742017, "ewc_loss": 0.0020633565727621317, "ewc_loss_diag": 1.3709068298339844e-06, "ewc_loss_parallel": 6.90065598973888e-06, "grad_norm": 4.319879531860352, "learning_rate": 1.3268334039847393e-07, "loss": 0.6129, "mean_token_accuracy": 0.8103069067001343, "num_tokens": 11990414.0, "step": 314 }, { "epoch": 0.04007123775601069, "ewc_loss": 0.0020663258619606495, "ewc_loss_diag": 1.3783574104309082e-06, "ewc_loss_parallel": 6.854053935967386e-06, "grad_norm": 5.499081611633301, "learning_rate": 1.3310724883425178e-07, "loss": 0.6808, "mean_token_accuracy": 0.7906445264816284, "num_tokens": 12025925.0, "step": 315 }, { "epoch": 0.040198448034601195, "ewc_loss": 0.002087721601128578, "ewc_loss_diag": 1.3932585716247559e-06, "ewc_loss_parallel": 6.915423909958918e-06, "grad_norm": 4.914856433868408, "learning_rate": 1.3353115727002968e-07, "loss": 0.6484, "mean_token_accuracy": 0.8022128343582153, "num_tokens": 12068879.0, "step": 316 }, { "epoch": 0.04032565831319171, "ewc_loss": 0.002089101355522871, "ewc_loss_diag": 1.3932585716247559e-06, "ewc_loss_parallel": 6.929220489837462e-06, "grad_norm": 4.618359088897705, "learning_rate": 1.3395506570580752e-07, "loss": 0.6149, "mean_token_accuracy": 0.8110072612762451, "num_tokens": 12111931.0, "step": 317 }, { "epoch": 0.040452868591782216, "ewc_loss": 0.0020889549050480127, "ewc_loss_diag": 1.3932585716247559e-06, "ewc_loss_parallel": 6.9277566581149586e-06, "grad_norm": 6.364570617675781, "learning_rate": 1.3437897414158542e-07, "loss": 0.6814, "mean_token_accuracy": 0.7903624773025513, "num_tokens": 12140431.0, "step": 318 }, { "epoch": 0.040580078870372724, "ewc_loss": 0.0021109930239617825, "ewc_loss_diag": 1.4007091522216797e-06, "ewc_loss_parallel": 7.071843810990686e-06, "grad_norm": 5.018182277679443, "learning_rate": 1.3480288257736327e-07, "loss": 0.6122, "mean_token_accuracy": 0.8112808465957642, "num_tokens": 12173932.0, "step": 319 }, { "epoch": 0.04070728914896324, "ewc_loss": 0.002120434306561947, "ewc_loss_diag": 1.4081597328186035e-06, "ewc_loss_parallel": 7.089964583428809e-06, "grad_norm": 4.132867813110352, "learning_rate": 1.3522679101314117e-07, "loss": 0.5895, "mean_token_accuracy": 0.8162294626235962, "num_tokens": 12210040.0, "step": 320 }, { "epoch": 0.040834499427553746, "ewc_loss": 0.0021197302266955376, "ewc_loss_diag": 1.4156103134155273e-06, "ewc_loss_parallel": 7.006628948147409e-06, "grad_norm": 5.292703628540039, "learning_rate": 1.35650699448919e-07, "loss": 0.6231, "mean_token_accuracy": 0.8099105358123779, "num_tokens": 12248556.0, "step": 321 }, { "epoch": 0.04096170970614425, "ewc_loss": 0.002139379968866706, "ewc_loss_diag": 1.430511474609375e-06, "ewc_loss_parallel": 7.050537533359602e-06, "grad_norm": 4.283611297607422, "learning_rate": 1.360746078846969e-07, "loss": 0.6116, "mean_token_accuracy": 0.8111529350280762, "num_tokens": 12285664.0, "step": 322 }, { "epoch": 0.04108891998473477, "ewc_loss": 0.0021430286578834057, "ewc_loss_diag": 1.4454126358032227e-06, "ewc_loss_parallel": 7.0107312239997555e-06, "grad_norm": 5.108185768127441, "learning_rate": 1.3649851632047476e-07, "loss": 0.6649, "mean_token_accuracy": 0.797126829624176, "num_tokens": 12326964.0, "step": 323 }, { "epoch": 0.041216130263325275, "ewc_loss": 0.002142289886251092, "ewc_loss_diag": 1.430511474609375e-06, "ewc_loss_parallel": 7.079636816342827e-06, "grad_norm": 4.160231113433838, "learning_rate": 1.3692242475625266e-07, "loss": 0.5744, "mean_token_accuracy": 0.8252138495445251, "num_tokens": 12366541.0, "step": 324 }, { "epoch": 0.04134334054191579, "ewc_loss": 0.002145751379430294, "ewc_loss_diag": 1.4454126358032227e-06, "ewc_loss_parallel": 7.0379592216340825e-06, "grad_norm": 5.57349967956543, "learning_rate": 1.373463331920305e-07, "loss": 0.6366, "mean_token_accuracy": 0.8101495504379272, "num_tokens": 12405664.0, "step": 325 }, { "epoch": 0.0414705508205063, "ewc_loss": 0.0021721699740737677, "ewc_loss_diag": 1.4603137969970703e-06, "ewc_loss_parallel": 7.1495555857836735e-06, "grad_norm": 5.988322734832764, "learning_rate": 1.377702416278084e-07, "loss": 0.7001, "mean_token_accuracy": 0.7874166965484619, "num_tokens": 12445039.0, "step": 326 }, { "epoch": 0.041597761099096804, "ewc_loss": 0.0021841758862137794, "ewc_loss_diag": 1.4603137969970703e-06, "ewc_loss_parallel": 7.26961388863856e-06, "grad_norm": 4.401947021484375, "learning_rate": 1.3819415006358625e-07, "loss": 0.5698, "mean_token_accuracy": 0.8248598575592041, "num_tokens": 12485481.0, "step": 327 }, { "epoch": 0.04172497137768732, "ewc_loss": 0.0021867011673748493, "ewc_loss_diag": 1.4677643775939941e-06, "ewc_loss_parallel": 7.2185753197118174e-06, "grad_norm": 6.471611022949219, "learning_rate": 1.3861805849936415e-07, "loss": 0.5948, "mean_token_accuracy": 0.8199969530105591, "num_tokens": 12530272.0, "step": 328 }, { "epoch": 0.041852181656277826, "ewc_loss": 0.0021984409540891647, "ewc_loss_diag": 1.4677643775939941e-06, "ewc_loss_parallel": 7.335971531574614e-06, "grad_norm": 7.244236946105957, "learning_rate": 1.39041966935142e-07, "loss": 0.6407, "mean_token_accuracy": 0.8038028478622437, "num_tokens": 12565854.0, "step": 329 }, { "epoch": 0.04197939193486834, "ewc_loss": 0.0022132927551865578, "ewc_loss_diag": 1.4677643775939941e-06, "ewc_loss_parallel": 7.484488833142677e-06, "grad_norm": 3.632405996322632, "learning_rate": 1.394658753709199e-07, "loss": 0.6266, "mean_token_accuracy": 0.8093032836914062, "num_tokens": 12607313.0, "step": 330 }, { "epoch": 0.04210660221345885, "ewc_loss": 0.002181477379053831, "ewc_loss_diag": 1.4603137969970703e-06, "ewc_loss_parallel": 7.242629180836957e-06, "grad_norm": 5.298318386077881, "learning_rate": 1.3988978380669774e-07, "loss": 0.664, "mean_token_accuracy": 0.7971288561820984, "num_tokens": 12642057.0, "step": 331 }, { "epoch": 0.042233812492049355, "ewc_loss": 0.002198406495153904, "ewc_loss_diag": 1.475214958190918e-06, "ewc_loss_parallel": 7.2593334152770694e-06, "grad_norm": 3.879850387573242, "learning_rate": 1.403136922424756e-07, "loss": 0.6552, "mean_token_accuracy": 0.7999149560928345, "num_tokens": 12684118.0, "step": 332 }, { "epoch": 0.04236102277063987, "ewc_loss": 0.002200222574174404, "ewc_loss_diag": 1.4901161193847656e-06, "ewc_loss_parallel": 7.124906005628873e-06, "grad_norm": 4.192472457885742, "learning_rate": 1.4073760067825348e-07, "loss": 0.6202, "mean_token_accuracy": 0.809715986251831, "num_tokens": 12721945.0, "step": 333 }, { "epoch": 0.04248823304923038, "ewc_loss": 0.002198595553636551, "ewc_loss_diag": 1.4901161193847656e-06, "ewc_loss_parallel": 7.108635600161506e-06, "grad_norm": 4.124397277832031, "learning_rate": 1.4116150911403136e-07, "loss": 0.635, "mean_token_accuracy": 0.8051115870475769, "num_tokens": 12764175.0, "step": 334 }, { "epoch": 0.04261544332782089, "ewc_loss": 0.0022081094793975353, "ewc_loss_diag": 1.4975666999816895e-06, "ewc_loss_parallel": 7.127482604118995e-06, "grad_norm": 4.675732612609863, "learning_rate": 1.4158541754980923e-07, "loss": 0.5848, "mean_token_accuracy": 0.8218055963516235, "num_tokens": 12801600.0, "step": 335 }, { "epoch": 0.0427426536064114, "ewc_loss": 0.002231821184977889, "ewc_loss_diag": 1.5124678611755371e-06, "ewc_loss_parallel": 7.2120105869544204e-06, "grad_norm": 3.835737466812134, "learning_rate": 1.420093259855871e-07, "loss": 0.6169, "mean_token_accuracy": 0.8166512250900269, "num_tokens": 12843584.0, "step": 336 }, { "epoch": 0.042869863885001906, "ewc_loss": 0.0022448499221354723, "ewc_loss_diag": 1.5273690223693848e-06, "ewc_loss_parallel": 7.18971068636165e-06, "grad_norm": 4.8085713386535645, "learning_rate": 1.4243323442136497e-07, "loss": 0.6307, "mean_token_accuracy": 0.8066296577453613, "num_tokens": 12878105.0, "step": 337 }, { "epoch": 0.04299707416359242, "ewc_loss": 0.00226215529255569, "ewc_loss_diag": 1.5348196029663086e-06, "ewc_loss_parallel": 7.2864690991991665e-06, "grad_norm": 4.191989898681641, "learning_rate": 1.4285714285714285e-07, "loss": 0.6378, "mean_token_accuracy": 0.8056105971336365, "num_tokens": 12911712.0, "step": 338 }, { "epoch": 0.04312428444218293, "ewc_loss": 0.0022619101218879223, "ewc_loss_diag": 1.5348196029663086e-06, "ewc_loss_parallel": 7.284019375219941e-06, "grad_norm": 4.3836798667907715, "learning_rate": 1.4328105129292072e-07, "loss": 0.6194, "mean_token_accuracy": 0.812039852142334, "num_tokens": 12951303.0, "step": 339 }, { "epoch": 0.043251494720773435, "ewc_loss": 0.0022799877915531397, "ewc_loss_diag": 1.5497207641601562e-06, "ewc_loss_parallel": 7.312206435017288e-06, "grad_norm": 4.6724677085876465, "learning_rate": 1.437049597286986e-07, "loss": 0.6061, "mean_token_accuracy": 0.8144128322601318, "num_tokens": 12990544.0, "step": 340 }, { "epoch": 0.04337870499936395, "ewc_loss": 0.0022791465744376183, "ewc_loss_diag": 1.5422701835632324e-06, "ewc_loss_parallel": 7.3800893005682155e-06, "grad_norm": 5.265099048614502, "learning_rate": 1.4412886816447646e-07, "loss": 0.6639, "mean_token_accuracy": 0.7983046770095825, "num_tokens": 13023766.0, "step": 341 }, { "epoch": 0.04350591527795446, "ewc_loss": 0.002292018849402666, "ewc_loss_diag": 1.5422701835632324e-06, "ewc_loss_parallel": 7.508811904699542e-06, "grad_norm": 4.3964433670043945, "learning_rate": 1.4455277660025434e-07, "loss": 0.5786, "mean_token_accuracy": 0.8223549723625183, "num_tokens": 13056337.0, "step": 342 }, { "epoch": 0.04363312555654497, "ewc_loss": 0.0022973159793764353, "ewc_loss_diag": 1.5497207641601562e-06, "ewc_loss_parallel": 7.485488367819926e-06, "grad_norm": 5.621120452880859, "learning_rate": 1.449766850360322e-07, "loss": 0.638, "mean_token_accuracy": 0.8056567311286926, "num_tokens": 13097345.0, "step": 343 }, { "epoch": 0.04376033583513548, "ewc_loss": 0.0023143247235566378, "ewc_loss_diag": 1.55717134475708e-06, "ewc_loss_parallel": 7.579283192171715e-06, "grad_norm": 5.733930587768555, "learning_rate": 1.4540059347181008e-07, "loss": 0.6221, "mean_token_accuracy": 0.8093791007995605, "num_tokens": 13127332.0, "step": 344 }, { "epoch": 0.043887546113725986, "ewc_loss": 0.002337496494874358, "ewc_loss_diag": 1.5720725059509277e-06, "ewc_loss_parallel": 7.658412869204767e-06, "grad_norm": 4.158660888671875, "learning_rate": 1.4582450190758795e-07, "loss": 0.6516, "mean_token_accuracy": 0.7994648814201355, "num_tokens": 13170760.0, "step": 345 }, { "epoch": 0.0440147563923165, "ewc_loss": 0.002331044524908066, "ewc_loss_diag": 1.5795230865478516e-06, "ewc_loss_parallel": 7.517599442508072e-06, "grad_norm": 5.174320697784424, "learning_rate": 1.4624841034336583e-07, "loss": 0.6193, "mean_token_accuracy": 0.8128564357757568, "num_tokens": 13207061.0, "step": 346 }, { "epoch": 0.04414196667090701, "ewc_loss": 0.002372893039137125, "ewc_loss_diag": 1.6167759895324707e-06, "ewc_loss_parallel": 7.554613603133475e-06, "grad_norm": 4.213150978088379, "learning_rate": 1.466723187791437e-07, "loss": 0.5891, "mean_token_accuracy": 0.8191709518432617, "num_tokens": 13247903.0, "step": 347 }, { "epoch": 0.04426917694949752, "ewc_loss": 0.002357886638492346, "ewc_loss_diag": 1.6093254089355469e-06, "ewc_loss_parallel": 7.480844033125322e-06, "grad_norm": 4.20125675201416, "learning_rate": 1.4709622721492157e-07, "loss": 0.5993, "mean_token_accuracy": 0.8123566508293152, "num_tokens": 13287193.0, "step": 348 }, { "epoch": 0.04439638722808803, "ewc_loss": 0.00236276863142848, "ewc_loss_diag": 1.6167759895324707e-06, "ewc_loss_parallel": 7.4533709266688675e-06, "grad_norm": 5.20890998840332, "learning_rate": 1.4752013565069942e-07, "loss": 0.6037, "mean_token_accuracy": 0.8157600164413452, "num_tokens": 13323112.0, "step": 349 }, { "epoch": 0.04452359750667854, "ewc_loss": 0.0023902892135083675, "ewc_loss_diag": 1.6316771507263184e-06, "ewc_loss_parallel": 7.575987638119841e-06, "grad_norm": 4.053287029266357, "learning_rate": 1.4794404408647732e-07, "loss": 0.5991, "mean_token_accuracy": 0.817131519317627, "num_tokens": 13359653.0, "step": 350 }, { "epoch": 0.04465080778526905, "ewc_loss": 0.0023913688492029905, "ewc_loss_diag": 1.6391277313232422e-06, "ewc_loss_parallel": 7.5104908319190145e-06, "grad_norm": 4.6615986824035645, "learning_rate": 1.4836795252225516e-07, "loss": 0.57, "mean_token_accuracy": 0.8223857879638672, "num_tokens": 13400571.0, "step": 351 }, { "epoch": 0.04477801806385956, "ewc_loss": 0.002395056188106537, "ewc_loss_diag": 1.6391277313232422e-06, "ewc_loss_parallel": 7.547364930360345e-06, "grad_norm": 4.127704620361328, "learning_rate": 1.4879186095803306e-07, "loss": 0.5478, "mean_token_accuracy": 0.8267698884010315, "num_tokens": 13437656.0, "step": 352 }, { "epoch": 0.04490522834245007, "ewc_loss": 0.0023970066104084253, "ewc_loss_diag": 1.646578311920166e-06, "ewc_loss_parallel": 7.49057426219224e-06, "grad_norm": 4.562974452972412, "learning_rate": 1.492157693938109e-07, "loss": 0.6066, "mean_token_accuracy": 0.8141034841537476, "num_tokens": 13474688.0, "step": 353 }, { "epoch": 0.04503243862104058, "ewc_loss": 0.002402676735073328, "ewc_loss_diag": 1.646578311920166e-06, "ewc_loss_parallel": 7.547274435637519e-06, "grad_norm": 3.38634991645813, "learning_rate": 1.496396778295888e-07, "loss": 0.5449, "mean_token_accuracy": 0.830081045627594, "num_tokens": 13518256.0, "step": 354 }, { "epoch": 0.04515964889963109, "ewc_loss": 0.002388607943430543, "ewc_loss_diag": 1.646578311920166e-06, "ewc_loss_parallel": 7.406586973957019e-06, "grad_norm": 4.072988986968994, "learning_rate": 1.5006358626536665e-07, "loss": 0.6314, "mean_token_accuracy": 0.8087977766990662, "num_tokens": 13554315.0, "step": 355 }, { "epoch": 0.0452868591782216, "ewc_loss": 0.002401746576651931, "ewc_loss_diag": 1.6540288925170898e-06, "ewc_loss_parallel": 7.461679615516914e-06, "grad_norm": 4.0372209548950195, "learning_rate": 1.5048749470114455e-07, "loss": 0.6519, "mean_token_accuracy": 0.8018075823783875, "num_tokens": 13594388.0, "step": 356 }, { "epoch": 0.04541406945681211, "ewc_loss": 0.00241667777299881, "ewc_loss_diag": 1.6614794731140137e-06, "ewc_loss_parallel": 7.534697942901403e-06, "grad_norm": 3.7933509349823, "learning_rate": 1.509114031369224e-07, "loss": 0.578, "mean_token_accuracy": 0.8189692497253418, "num_tokens": 13633704.0, "step": 357 }, { "epoch": 0.04554127973540262, "ewc_loss": 0.0024422877468168736, "ewc_loss_diag": 1.6838312149047852e-06, "ewc_loss_parallel": 7.561914571851958e-06, "grad_norm": 4.61201810836792, "learning_rate": 1.513353115727003e-07, "loss": 0.6279, "mean_token_accuracy": 0.8028702139854431, "num_tokens": 13671183.0, "step": 358 }, { "epoch": 0.04566849001399313, "ewc_loss": 0.002465927042067051, "ewc_loss_diag": 1.691281795501709e-06, "ewc_loss_parallel": 7.72201474319445e-06, "grad_norm": 4.085071563720703, "learning_rate": 1.5175922000847814e-07, "loss": 0.5828, "mean_token_accuracy": 0.8210347294807434, "num_tokens": 13709640.0, "step": 359 }, { "epoch": 0.04579570029258364, "ewc_loss": 0.0024674616288393736, "ewc_loss_diag": 1.691281795501709e-06, "ewc_loss_parallel": 7.737359737802763e-06, "grad_norm": 4.518608093261719, "learning_rate": 1.5218312844425604e-07, "loss": 0.6064, "mean_token_accuracy": 0.8068974018096924, "num_tokens": 13751114.0, "step": 360 }, { "epoch": 0.045922910571174154, "ewc_loss": 0.002482689917087555, "ewc_loss_diag": 1.6987323760986328e-06, "ewc_loss_parallel": 7.813348929630592e-06, "grad_norm": 4.088008403778076, "learning_rate": 1.526070368800339e-07, "loss": 0.536, "mean_token_accuracy": 0.8282406330108643, "num_tokens": 13786448.0, "step": 361 }, { "epoch": 0.04605012084976466, "ewc_loss": 0.0024790079332888126, "ewc_loss_diag": 1.6987323760986328e-06, "ewc_loss_parallel": 7.776528946124017e-06, "grad_norm": 4.1305012702941895, "learning_rate": 1.530309453158118e-07, "loss": 0.5595, "mean_token_accuracy": 0.8307600021362305, "num_tokens": 13827663.0, "step": 362 }, { "epoch": 0.04617733112835517, "ewc_loss": 0.0025001931935548782, "ewc_loss_diag": 1.7210841178894043e-06, "ewc_loss_parallel": 7.759499567328021e-06, "grad_norm": 4.86142635345459, "learning_rate": 1.5345485375158963e-07, "loss": 0.6386, "mean_token_accuracy": 0.8052630424499512, "num_tokens": 13866078.0, "step": 363 }, { "epoch": 0.04630454140694568, "ewc_loss": 0.0025122021324932575, "ewc_loss_diag": 1.7210841178894043e-06, "ewc_loss_parallel": 7.879588338255417e-06, "grad_norm": 3.744164228439331, "learning_rate": 1.5387876218736753e-07, "loss": 0.5157, "mean_token_accuracy": 0.8380203247070312, "num_tokens": 13909767.0, "step": 364 }, { "epoch": 0.04643175168553619, "ewc_loss": 0.00249933497980237, "ewc_loss_diag": 1.7210841178894043e-06, "ewc_loss_parallel": 7.750919394311495e-06, "grad_norm": 4.345168113708496, "learning_rate": 1.5430267062314538e-07, "loss": 0.6235, "mean_token_accuracy": 0.8108599781990051, "num_tokens": 13948937.0, "step": 365 }, { "epoch": 0.0465589619641267, "ewc_loss": 0.002509519224986434, "ewc_loss_diag": 1.7285346984863281e-06, "ewc_loss_parallel": 7.776466190989595e-06, "grad_norm": 5.513547897338867, "learning_rate": 1.5472657905892328e-07, "loss": 0.6348, "mean_token_accuracy": 0.8045364618301392, "num_tokens": 13984039.0, "step": 366 }, { "epoch": 0.04668617224271721, "ewc_loss": 0.0025453665293753147, "ewc_loss_diag": 1.7508864402770996e-06, "ewc_loss_parallel": 7.982353054103442e-06, "grad_norm": 5.101582050323486, "learning_rate": 1.5515048749470113e-07, "loss": 0.6169, "mean_token_accuracy": 0.808611273765564, "num_tokens": 14018162.0, "step": 367 }, { "epoch": 0.04681338252130772, "ewc_loss": 0.0025535044260323048, "ewc_loss_diag": 1.7508864402770996e-06, "ewc_loss_parallel": 8.063730092544574e-06, "grad_norm": 3.721095561981201, "learning_rate": 1.55574395930479e-07, "loss": 0.6167, "mean_token_accuracy": 0.8095629215240479, "num_tokens": 14056493.0, "step": 368 }, { "epoch": 0.046940592799898234, "ewc_loss": 0.0025490073021501303, "ewc_loss_diag": 1.7657876014709473e-06, "ewc_loss_parallel": 7.866171472414862e-06, "grad_norm": 5.998199939727783, "learning_rate": 1.5599830436625687e-07, "loss": 0.6098, "mean_token_accuracy": 0.8125245571136475, "num_tokens": 14097530.0, "step": 369 }, { "epoch": 0.04706780307848874, "ewc_loss": 0.0025845845229923725, "ewc_loss_diag": 1.780688762664795e-06, "ewc_loss_parallel": 8.069356226769742e-06, "grad_norm": 4.173799514770508, "learning_rate": 1.5642221280203474e-07, "loss": 0.6283, "mean_token_accuracy": 0.8101462125778198, "num_tokens": 14136240.0, "step": 370 }, { "epoch": 0.04719501335707925, "ewc_loss": 0.00257173553109169, "ewc_loss_diag": 1.780688762664795e-06, "ewc_loss_parallel": 7.94086645328207e-06, "grad_norm": 4.703190326690674, "learning_rate": 1.5684612123781262e-07, "loss": 0.54, "mean_token_accuracy": 0.8321893215179443, "num_tokens": 14171010.0, "step": 371 }, { "epoch": 0.04732222363566976, "ewc_loss": 0.002586883958429098, "ewc_loss_diag": 1.7955899238586426e-06, "ewc_loss_parallel": 7.939762326714117e-06, "grad_norm": 4.3250861167907715, "learning_rate": 1.572700296735905e-07, "loss": 0.6118, "mean_token_accuracy": 0.8181824684143066, "num_tokens": 14206985.0, "step": 372 }, { "epoch": 0.04744943391426027, "ewc_loss": 0.002581520937383175, "ewc_loss_diag": 1.7955899238586426e-06, "ewc_loss_parallel": 7.886133062129375e-06, "grad_norm": 4.856561660766602, "learning_rate": 1.576939381093684e-07, "loss": 0.6088, "mean_token_accuracy": 0.8098884224891663, "num_tokens": 14236728.0, "step": 373 }, { "epoch": 0.047576644192850785, "ewc_loss": 0.002587189432233572, "ewc_loss_diag": 1.7955899238586426e-06, "ewc_loss_parallel": 7.942818228912074e-06, "grad_norm": 3.8733386993408203, "learning_rate": 1.5811784654514623e-07, "loss": 0.5361, "mean_token_accuracy": 0.8344887495040894, "num_tokens": 14270758.0, "step": 374 }, { "epoch": 0.04770385447144129, "ewc_loss": 0.0025898213498294353, "ewc_loss_diag": 1.8104910850524902e-06, "ewc_loss_parallel": 7.81654853199143e-06, "grad_norm": 3.7101755142211914, "learning_rate": 1.5854175498092413e-07, "loss": 0.6535, "mean_token_accuracy": 0.7973814606666565, "num_tokens": 14315002.0, "step": 375 }, { "epoch": 0.0478310647500318, "ewc_loss": 0.002565117320045829, "ewc_loss_diag": 1.7955899238586426e-06, "ewc_loss_parallel": 7.72209659771761e-06, "grad_norm": 4.533292770385742, "learning_rate": 1.5896566341670198e-07, "loss": 0.6238, "mean_token_accuracy": 0.8051834106445312, "num_tokens": 14353385.0, "step": 376 }, { "epoch": 0.047958275028622314, "ewc_loss": 0.0026077073998749256, "ewc_loss_diag": 1.8253922462463379e-06, "ewc_loss_parallel": 7.842821105441544e-06, "grad_norm": 4.268980503082275, "learning_rate": 1.5938957185247988e-07, "loss": 0.6563, "mean_token_accuracy": 0.7946015000343323, "num_tokens": 14391702.0, "step": 377 }, { "epoch": 0.04808548530721282, "ewc_loss": 0.002620459534227848, "ewc_loss_diag": 1.8328428268432617e-06, "ewc_loss_parallel": 7.894047485024203e-06, "grad_norm": 4.058464050292969, "learning_rate": 1.5981348028825772e-07, "loss": 0.6236, "mean_token_accuracy": 0.8077424764633179, "num_tokens": 14427876.0, "step": 378 }, { "epoch": 0.048212695585803336, "ewc_loss": 0.002643134444952011, "ewc_loss_diag": 1.8551945686340332e-06, "ewc_loss_parallel": 7.891915629443247e-06, "grad_norm": 5.080303192138672, "learning_rate": 1.6023738872403562e-07, "loss": 0.71, "mean_token_accuracy": 0.7810299396514893, "num_tokens": 14460349.0, "step": 379 }, { "epoch": 0.048339905864393844, "ewc_loss": 0.0026671818923205137, "ewc_loss_diag": 1.862645149230957e-06, "ewc_loss_parallel": 8.056095794017892e-06, "grad_norm": 4.27569580078125, "learning_rate": 1.6066129715981347e-07, "loss": 0.5604, "mean_token_accuracy": 0.828324556350708, "num_tokens": 14497763.0, "step": 380 }, { "epoch": 0.04846711614298435, "ewc_loss": 0.0026637567207217216, "ewc_loss_diag": 1.862645149230957e-06, "ewc_loss_parallel": 8.021844223549124e-06, "grad_norm": 3.2191038131713867, "learning_rate": 1.6108520559559137e-07, "loss": 0.5547, "mean_token_accuracy": 0.8265942931175232, "num_tokens": 14537172.0, "step": 381 }, { "epoch": 0.048594326421574865, "ewc_loss": 0.002654679585248232, "ewc_loss_diag": 1.8775463104248047e-06, "ewc_loss_parallel": 7.778486178722233e-06, "grad_norm": 4.160955905914307, "learning_rate": 1.6150911403136921e-07, "loss": 0.6405, "mean_token_accuracy": 0.8074515461921692, "num_tokens": 14565515.0, "step": 382 }, { "epoch": 0.04872153670016537, "ewc_loss": 0.002681280020624399, "ewc_loss_diag": 1.8924474716186523e-06, "ewc_loss_parallel": 7.89190198702272e-06, "grad_norm": 3.072071075439453, "learning_rate": 1.619330224671471e-07, "loss": 0.6318, "mean_token_accuracy": 0.807174026966095, "num_tokens": 14608104.0, "step": 383 }, { "epoch": 0.04884874697875588, "ewc_loss": 0.0026832292787730694, "ewc_loss_diag": 1.9073486328125e-06, "ewc_loss_parallel": 7.758805622870568e-06, "grad_norm": 4.881096363067627, "learning_rate": 1.6235693090292496e-07, "loss": 0.6405, "mean_token_accuracy": 0.8074766993522644, "num_tokens": 14645328.0, "step": 384 }, { "epoch": 0.048975957257346395, "ewc_loss": 0.002735124435275793, "ewc_loss_diag": 1.9222497940063477e-06, "ewc_loss_parallel": 8.125171007122844e-06, "grad_norm": 4.560837745666504, "learning_rate": 1.6278083933870286e-07, "loss": 0.6728, "mean_token_accuracy": 0.79021155834198, "num_tokens": 14678792.0, "step": 385 }, { "epoch": 0.0491031675359369, "ewc_loss": 0.0027810935862362385, "ewc_loss_diag": 1.952052116394043e-06, "ewc_loss_parallel": 8.279685062007047e-06, "grad_norm": 3.521118402481079, "learning_rate": 1.632047477744807e-07, "loss": 0.5961, "mean_token_accuracy": 0.8169417381286621, "num_tokens": 14715095.0, "step": 386 }, { "epoch": 0.049230377814527417, "ewc_loss": 0.0027683016378432512, "ewc_loss_diag": 1.952052116394043e-06, "ewc_loss_parallel": 8.151765541697387e-06, "grad_norm": 4.435033321380615, "learning_rate": 1.6362865621025858e-07, "loss": 0.6161, "mean_token_accuracy": 0.8107086420059204, "num_tokens": 14753641.0, "step": 387 }, { "epoch": 0.049357588093117924, "ewc_loss": 0.0027788172010332346, "ewc_loss_diag": 1.952052116394043e-06, "ewc_loss_parallel": 8.256921319116373e-06, "grad_norm": 4.574680805206299, "learning_rate": 1.6405256464603645e-07, "loss": 0.5879, "mean_token_accuracy": 0.8169327974319458, "num_tokens": 14788816.0, "step": 388 }, { "epoch": 0.04948479837170843, "ewc_loss": 0.002804759657010436, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 8.363758752238937e-06, "grad_norm": 4.205597877502441, "learning_rate": 1.6447647308181432e-07, "loss": 0.6309, "mean_token_accuracy": 0.8045027256011963, "num_tokens": 14821607.0, "step": 389 }, { "epoch": 0.049612008650298946, "ewc_loss": 0.0028027177322655916, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 8.343338777194731e-06, "grad_norm": 3.518385887145996, "learning_rate": 1.649003815175922e-07, "loss": 0.649, "mean_token_accuracy": 0.8023456335067749, "num_tokens": 14861298.0, "step": 390 }, { "epoch": 0.04973921892888945, "ewc_loss": 0.0027835271321237087, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 8.15143357613124e-06, "grad_norm": 3.89518666267395, "learning_rate": 1.6532428995337007e-07, "loss": 0.5657, "mean_token_accuracy": 0.8235635757446289, "num_tokens": 14899951.0, "step": 391 }, { "epoch": 0.04986642920747997, "ewc_loss": 0.0027838347014039755, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 8.154509487212636e-06, "grad_norm": 3.46427321434021, "learning_rate": 1.6574819838914794e-07, "loss": 0.547, "mean_token_accuracy": 0.831057608127594, "num_tokens": 14937169.0, "step": 392 }, { "epoch": 0.049993639486070475, "ewc_loss": 0.0027803690172731876, "ewc_loss_diag": 1.9669532775878906e-06, "ewc_loss_parallel": 8.119853191601578e-06, "grad_norm": 3.7780051231384277, "learning_rate": 1.661721068249258e-07, "loss": 0.5988, "mean_token_accuracy": 0.8149349689483643, "num_tokens": 14975549.0, "step": 393 }, { "epoch": 0.05012084976466098, "ewc_loss": 0.002804576186463237, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.209336556319613e-06, "grad_norm": 3.6790361404418945, "learning_rate": 1.6659601526070368e-07, "loss": 0.6265, "mean_token_accuracy": 0.8078100085258484, "num_tokens": 15016630.0, "step": 394 }, { "epoch": 0.0502480600432515, "ewc_loss": 0.0028082020580768585, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.24559356260579e-06, "grad_norm": 3.610701322555542, "learning_rate": 1.6701992369648156e-07, "loss": 0.6083, "mean_token_accuracy": 0.8113041520118713, "num_tokens": 15057822.0, "step": 395 }, { "epoch": 0.050375270321842004, "ewc_loss": 0.002810512902215123, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.268703822977841e-06, "grad_norm": 4.587643146514893, "learning_rate": 1.6744383213225943e-07, "loss": 0.5811, "mean_token_accuracy": 0.8156644105911255, "num_tokens": 15094385.0, "step": 396 }, { "epoch": 0.05050248060043251, "ewc_loss": 0.002832847647368908, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.492050255881622e-06, "grad_norm": 3.7387471199035645, "learning_rate": 1.678677405680373e-07, "loss": 0.5405, "mean_token_accuracy": 0.8282574415206909, "num_tokens": 15130969.0, "step": 397 }, { "epoch": 0.050629690879023026, "ewc_loss": 0.0028289994224905968, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.453567716060206e-06, "grad_norm": 4.560966968536377, "learning_rate": 1.6829164900381518e-07, "loss": 0.5625, "mean_token_accuracy": 0.8205981254577637, "num_tokens": 15166586.0, "step": 398 }, { "epoch": 0.05075690115761353, "ewc_loss": 0.002841230481863022, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.575879292038735e-06, "grad_norm": 4.218634605407715, "learning_rate": 1.6871555743959305e-07, "loss": 0.5523, "mean_token_accuracy": 0.8246365189552307, "num_tokens": 15209603.0, "step": 399 }, { "epoch": 0.05088411143620405, "ewc_loss": 0.0028377294074743986, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.540868293493986e-06, "grad_norm": 4.902876377105713, "learning_rate": 1.6913946587537092e-07, "loss": 0.5702, "mean_token_accuracy": 0.8205996155738831, "num_tokens": 15251542.0, "step": 400 }, { "epoch": 0.051011321714794555, "ewc_loss": 0.0028476701118052006, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.640276064397767e-06, "grad_norm": 4.318322658538818, "learning_rate": 1.695633743111488e-07, "loss": 0.5836, "mean_token_accuracy": 0.8203972578048706, "num_tokens": 15288940.0, "step": 401 }, { "epoch": 0.05113853199338506, "ewc_loss": 0.0028416947461664677, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.580522262491286e-06, "grad_norm": 4.008792877197266, "learning_rate": 1.6998728274692667e-07, "loss": 0.5511, "mean_token_accuracy": 0.8317705392837524, "num_tokens": 15321965.0, "step": 402 }, { "epoch": 0.05126574227197558, "ewc_loss": 0.0028282038401812315, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.445612365903798e-06, "grad_norm": 4.011483669281006, "learning_rate": 1.7041119118270454e-07, "loss": 0.5602, "mean_token_accuracy": 0.8270667791366577, "num_tokens": 15362895.0, "step": 403 }, { "epoch": 0.051392952550566084, "ewc_loss": 0.002821238711476326, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.375960533157922e-06, "grad_norm": 5.276342868804932, "learning_rate": 1.7083509961848238e-07, "loss": 0.6171, "mean_token_accuracy": 0.8079510927200317, "num_tokens": 15396675.0, "step": 404 }, { "epoch": 0.0515201628291566, "ewc_loss": 0.0028430793900042772, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.594367500336375e-06, "grad_norm": 3.6348328590393066, "learning_rate": 1.7125900805426028e-07, "loss": 0.6378, "mean_token_accuracy": 0.8038303852081299, "num_tokens": 15443070.0, "step": 405 }, { "epoch": 0.051647373107747106, "ewc_loss": 0.0028206247370690107, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.369822353415657e-06, "grad_norm": 3.7056643962860107, "learning_rate": 1.7168291649003813e-07, "loss": 0.5656, "mean_token_accuracy": 0.822810173034668, "num_tokens": 15476747.0, "step": 406 }, { "epoch": 0.051774583386337614, "ewc_loss": 0.0028113804291933775, "ewc_loss_diag": 1.9818544387817383e-06, "ewc_loss_parallel": 8.277378583443351e-06, "grad_norm": 3.9508750438690186, "learning_rate": 1.7210682492581603e-07, "loss": 0.5912, "mean_token_accuracy": 0.8182770609855652, "num_tokens": 15514866.0, "step": 407 }, { "epoch": 0.05190179366492813, "ewc_loss": 0.002829217351973057, "ewc_loss_diag": 1.996755599975586e-06, "ewc_loss_parallel": 8.303159120259807e-06, "grad_norm": 3.932130813598633, "learning_rate": 1.7253073336159387e-07, "loss": 0.5683, "mean_token_accuracy": 0.8220327496528625, "num_tokens": 15554245.0, "step": 408 }, { "epoch": 0.052029003943518635, "ewc_loss": 0.002834644401445985, "ewc_loss_diag": 1.996755599975586e-06, "ewc_loss_parallel": 8.357429578609299e-06, "grad_norm": 3.43843412399292, "learning_rate": 1.7295464179737177e-07, "loss": 0.5974, "mean_token_accuracy": 0.8162102699279785, "num_tokens": 15593958.0, "step": 409 }, { "epoch": 0.05215621422210915, "ewc_loss": 0.002839109394699335, "ewc_loss_diag": 2.0116567611694336e-06, "ewc_loss_parallel": 8.24949165689759e-06, "grad_norm": 3.557021141052246, "learning_rate": 1.7337855023314962e-07, "loss": 0.5424, "mean_token_accuracy": 0.8303289413452148, "num_tokens": 15632567.0, "step": 410 }, { "epoch": 0.05228342450069966, "ewc_loss": 0.002823576098307967, "ewc_loss_diag": 1.996755599975586e-06, "ewc_loss_parallel": 8.246746801887639e-06, "grad_norm": 4.7549519538879395, "learning_rate": 1.7380245866892752e-07, "loss": 0.5514, "mean_token_accuracy": 0.8269553184509277, "num_tokens": 15668255.0, "step": 411 }, { "epoch": 0.052410634779290165, "ewc_loss": 0.0028559588827192783, "ewc_loss_diag": 1.996755599975586e-06, "ewc_loss_parallel": 8.570576028432697e-06, "grad_norm": 4.15087366104126, "learning_rate": 1.7422636710470536e-07, "loss": 0.6036, "mean_token_accuracy": 0.8119671940803528, "num_tokens": 15705435.0, "step": 412 }, { "epoch": 0.05253784505788068, "ewc_loss": 0.002873288933187723, "ewc_loss_diag": 2.0116567611694336e-06, "ewc_loss_parallel": 8.59128704178147e-06, "grad_norm": 4.042699813842773, "learning_rate": 1.7465027554048326e-07, "loss": 0.6239, "mean_token_accuracy": 0.8069385290145874, "num_tokens": 15744657.0, "step": 413 }, { "epoch": 0.05266505533647119, "ewc_loss": 0.0028683599084615707, "ewc_loss_diag": 2.0116567611694336e-06, "ewc_loss_parallel": 8.54199879540829e-06, "grad_norm": 4.419806003570557, "learning_rate": 1.750741839762611e-07, "loss": 0.5688, "mean_token_accuracy": 0.8248926401138306, "num_tokens": 15777374.0, "step": 414 }, { "epoch": 0.052792265615061694, "ewc_loss": 0.0028722165152430534, "ewc_loss_diag": 2.0116567611694336e-06, "ewc_loss_parallel": 8.580565008742269e-06, "grad_norm": 3.3368406295776367, "learning_rate": 1.75498092412039e-07, "loss": 0.5396, "mean_token_accuracy": 0.8314083218574524, "num_tokens": 15814047.0, "step": 415 }, { "epoch": 0.05291947589365221, "ewc_loss": 0.002846751594915986, "ewc_loss_diag": 2.0116567611694336e-06, "ewc_loss_parallel": 8.325913768203463e-06, "grad_norm": 3.158163070678711, "learning_rate": 1.7592200084781686e-07, "loss": 0.6205, "mean_token_accuracy": 0.8073702454566956, "num_tokens": 15849110.0, "step": 416 }, { "epoch": 0.053046686172242716, "ewc_loss": 0.0028366290498524904, "ewc_loss_diag": 2.0116567611694336e-06, "ewc_loss_parallel": 8.22468882688554e-06, "grad_norm": 4.000314235687256, "learning_rate": 1.7634590928359475e-07, "loss": 0.5914, "mean_token_accuracy": 0.8193058967590332, "num_tokens": 15888910.0, "step": 417 }, { "epoch": 0.05317389645083323, "ewc_loss": 0.0028772142250090837, "ewc_loss_diag": 2.0265579223632812e-06, "ewc_loss_parallel": 8.47795217850944e-06, "grad_norm": 4.594297409057617, "learning_rate": 1.767698177193726e-07, "loss": 0.5781, "mean_token_accuracy": 0.8190844058990479, "num_tokens": 15925604.0, "step": 418 }, { "epoch": 0.05330110672942374, "ewc_loss": 0.002908683381974697, "ewc_loss_diag": 2.0265579223632812e-06, "ewc_loss_parallel": 8.792643711785786e-06, "grad_norm": 3.6553924083709717, "learning_rate": 1.771937261551505e-07, "loss": 0.5518, "mean_token_accuracy": 0.833730161190033, "num_tokens": 15961401.0, "step": 419 }, { "epoch": 0.053428317008014245, "ewc_loss": 0.002891820389777422, "ewc_loss_diag": 2.0265579223632812e-06, "ewc_loss_parallel": 8.624015208624769e-06, "grad_norm": 4.0682692527771, "learning_rate": 1.7761763459092835e-07, "loss": 0.5829, "mean_token_accuracy": 0.8201053738594055, "num_tokens": 16001029.0, "step": 420 }, { "epoch": 0.05355552728660476, "ewc_loss": 0.0029079962987452745, "ewc_loss_diag": 2.041459083557129e-06, "ewc_loss_parallel": 8.633185643702745e-06, "grad_norm": 3.4985969066619873, "learning_rate": 1.7804154302670624e-07, "loss": 0.5635, "mean_token_accuracy": 0.8244289755821228, "num_tokens": 16036014.0, "step": 421 }, { "epoch": 0.05368273756519527, "ewc_loss": 0.0028943424113094807, "ewc_loss_diag": 2.041459083557129e-06, "ewc_loss_parallel": 8.496645932609681e-06, "grad_norm": 3.6509382724761963, "learning_rate": 1.784654514624841e-07, "loss": 0.5454, "mean_token_accuracy": 0.8270907998085022, "num_tokens": 16075373.0, "step": 422 }, { "epoch": 0.05380994784378578, "ewc_loss": 0.002908326219767332, "ewc_loss_diag": 2.0563602447509766e-06, "ewc_loss_parallel": 8.48389754537493e-06, "grad_norm": 3.321317434310913, "learning_rate": 1.7888935989826196e-07, "loss": 0.5876, "mean_token_accuracy": 0.8144981861114502, "num_tokens": 16111352.0, "step": 423 }, { "epoch": 0.05393715812237629, "ewc_loss": 0.002903042361140251, "ewc_loss_diag": 2.0563602447509766e-06, "ewc_loss_parallel": 8.431058631686028e-06, "grad_norm": 3.5230865478515625, "learning_rate": 1.7931326833403984e-07, "loss": 0.6037, "mean_token_accuracy": 0.8111604452133179, "num_tokens": 16148826.0, "step": 424 }, { "epoch": 0.054064368400966796, "ewc_loss": 0.002927567344158888, "ewc_loss_diag": 2.0712614059448242e-06, "ewc_loss_parallel": 8.523719770892058e-06, "grad_norm": 2.7825303077697754, "learning_rate": 1.797371767698177e-07, "loss": 0.5122, "mean_token_accuracy": 0.8404951095581055, "num_tokens": 16193491.0, "step": 425 }, { "epoch": 0.05419157867955731, "ewc_loss": 0.002908707596361637, "ewc_loss_diag": 2.0712614059448242e-06, "ewc_loss_parallel": 8.335122402058914e-06, "grad_norm": 4.027121543884277, "learning_rate": 1.8016108520559558e-07, "loss": 0.5609, "mean_token_accuracy": 0.8259493708610535, "num_tokens": 16235595.0, "step": 426 }, { "epoch": 0.05431878895814782, "ewc_loss": 0.0029659783467650414, "ewc_loss_diag": 2.086162567138672e-06, "ewc_loss_parallel": 8.755242561164778e-06, "grad_norm": 4.084047794342041, "learning_rate": 1.8058499364137345e-07, "loss": 0.6148, "mean_token_accuracy": 0.8141669631004333, "num_tokens": 16274105.0, "step": 427 }, { "epoch": 0.054445999236738325, "ewc_loss": 0.0029899347573518753, "ewc_loss_diag": 2.086162567138672e-06, "ewc_loss_parallel": 8.994808013085276e-06, "grad_norm": 3.3299810886383057, "learning_rate": 1.8100890207715133e-07, "loss": 0.6581, "mean_token_accuracy": 0.7934385538101196, "num_tokens": 16315886.0, "step": 428 }, { "epoch": 0.05457320951532884, "ewc_loss": 0.0030038761906325817, "ewc_loss_diag": 2.115964889526367e-06, "ewc_loss_parallel": 8.829045327729546e-06, "grad_norm": 3.519796133041382, "learning_rate": 1.814328105129292e-07, "loss": 0.5992, "mean_token_accuracy": 0.8127772808074951, "num_tokens": 16353021.0, "step": 429 }, { "epoch": 0.05470041979391935, "ewc_loss": 0.0030025108717381954, "ewc_loss_diag": 2.115964889526367e-06, "ewc_loss_parallel": 8.815392902761232e-06, "grad_norm": 4.2370195388793945, "learning_rate": 1.8185671894870707e-07, "loss": 0.5743, "mean_token_accuracy": 0.818244993686676, "num_tokens": 16380887.0, "step": 430 }, { "epoch": 0.05482763007250986, "ewc_loss": 0.0030424552969634533, "ewc_loss_diag": 2.130866050720215e-06, "ewc_loss_parallel": 9.06224795471644e-06, "grad_norm": 3.259140729904175, "learning_rate": 1.8228062738448494e-07, "loss": 0.63, "mean_token_accuracy": 0.8063021898269653, "num_tokens": 16421066.0, "step": 431 }, { "epoch": 0.05495484035110037, "ewc_loss": 0.0030381851829588413, "ewc_loss_diag": 2.1457672119140625e-06, "ewc_loss_parallel": 8.866960342857055e-06, "grad_norm": 3.808424711227417, "learning_rate": 1.8270453582026282e-07, "loss": 0.5984, "mean_token_accuracy": 0.8126951456069946, "num_tokens": 16457197.0, "step": 432 }, { "epoch": 0.055082050629690876, "ewc_loss": 0.0030475708190351725, "ewc_loss_diag": 2.1457672119140625e-06, "ewc_loss_parallel": 8.960815648606513e-06, "grad_norm": 3.1622610092163086, "learning_rate": 1.831284442560407e-07, "loss": 0.5539, "mean_token_accuracy": 0.8267485499382019, "num_tokens": 16497385.0, "step": 433 }, { "epoch": 0.05520926090828139, "ewc_loss": 0.003031407715752721, "ewc_loss_diag": 2.1457672119140625e-06, "ewc_loss_parallel": 8.799184797680937e-06, "grad_norm": 3.309659957885742, "learning_rate": 1.8355235269181856e-07, "loss": 0.569, "mean_token_accuracy": 0.8234652280807495, "num_tokens": 16540099.0, "step": 434 }, { "epoch": 0.0553364711868719, "ewc_loss": 0.0030534230172634125, "ewc_loss_diag": 2.16066837310791e-06, "ewc_loss_parallel": 8.866749340086244e-06, "grad_norm": 4.009659290313721, "learning_rate": 1.8397626112759643e-07, "loss": 0.5657, "mean_token_accuracy": 0.8230969309806824, "num_tokens": 16572760.0, "step": 435 }, { "epoch": 0.05546368146546241, "ewc_loss": 0.0031128786504268646, "ewc_loss_diag": 2.1904706954956055e-06, "ewc_loss_parallel": 9.156129635812249e-06, "grad_norm": 3.4343252182006836, "learning_rate": 1.844001695633743e-07, "loss": 0.5249, "mean_token_accuracy": 0.8336012363433838, "num_tokens": 16608549.0, "step": 436 }, { "epoch": 0.05559089174405292, "ewc_loss": 0.003105304902419448, "ewc_loss_diag": 2.1904706954956055e-06, "ewc_loss_parallel": 9.08039237401681e-06, "grad_norm": 4.2892680168151855, "learning_rate": 1.8482407799915218e-07, "loss": 0.5622, "mean_token_accuracy": 0.8246119022369385, "num_tokens": 16643800.0, "step": 437 }, { "epoch": 0.05571810202264343, "ewc_loss": 0.0031315372325479984, "ewc_loss_diag": 2.1904706954956055e-06, "ewc_loss_parallel": 9.34271520236507e-06, "grad_norm": 4.3233489990234375, "learning_rate": 1.8524798643493005e-07, "loss": 0.6355, "mean_token_accuracy": 0.8002695441246033, "num_tokens": 16677948.0, "step": 438 }, { "epoch": 0.05584531230123394, "ewc_loss": 0.0031436290591955185, "ewc_loss_diag": 2.1904706954956055e-06, "ewc_loss_parallel": 9.463633432460483e-06, "grad_norm": 3.455705165863037, "learning_rate": 1.8567189487070792e-07, "loss": 0.6092, "mean_token_accuracy": 0.8125574588775635, "num_tokens": 16716905.0, "step": 439 }, { "epoch": 0.05597252257982445, "ewc_loss": 0.0031464635394513607, "ewc_loss_diag": 2.2202730178833008e-06, "ewc_loss_parallel": 9.186804163618945e-06, "grad_norm": 3.59137225151062, "learning_rate": 1.8609580330648577e-07, "loss": 0.5206, "mean_token_accuracy": 0.8354693651199341, "num_tokens": 16756158.0, "step": 440 }, { "epoch": 0.05609973285841496, "ewc_loss": 0.00314309005625546, "ewc_loss_diag": 2.2202730178833008e-06, "ewc_loss_parallel": 9.153069186140783e-06, "grad_norm": 4.000421524047852, "learning_rate": 1.8651971174226367e-07, "loss": 0.6119, "mean_token_accuracy": 0.8135619163513184, "num_tokens": 16794261.0, "step": 441 }, { "epoch": 0.05622694313700547, "ewc_loss": 0.003153948811814189, "ewc_loss_diag": 2.2351741790771484e-06, "ewc_loss_parallel": 9.261655577574857e-06, "grad_norm": 2.977102518081665, "learning_rate": 1.8694362017804152e-07, "loss": 0.5686, "mean_token_accuracy": 0.8211671710014343, "num_tokens": 16834953.0, "step": 442 }, { "epoch": 0.05635415341559598, "ewc_loss": 0.003123294096440077, "ewc_loss_diag": 2.2351741790771484e-06, "ewc_loss_parallel": 8.955108569352888e-06, "grad_norm": 4.087604522705078, "learning_rate": 1.8736752861381941e-07, "loss": 0.5275, "mean_token_accuracy": 0.8357003331184387, "num_tokens": 16873954.0, "step": 443 }, { "epoch": 0.05648136369418649, "ewc_loss": 0.003154237288981676, "ewc_loss_diag": 2.2351741790771484e-06, "ewc_loss_parallel": 9.264539585274179e-06, "grad_norm": 3.592010021209717, "learning_rate": 1.8779143704959726e-07, "loss": 0.5493, "mean_token_accuracy": 0.8258790969848633, "num_tokens": 16906018.0, "step": 444 }, { "epoch": 0.056608573972777, "ewc_loss": 0.003167796414345503, "ewc_loss_diag": 2.250075340270996e-06, "ewc_loss_parallel": 9.247542948287446e-06, "grad_norm": 3.982327699661255, "learning_rate": 1.8821534548537516e-07, "loss": 0.6022, "mean_token_accuracy": 0.814091682434082, "num_tokens": 16938945.0, "step": 445 }, { "epoch": 0.05673578425136751, "ewc_loss": 0.003181068692356348, "ewc_loss_diag": 2.250075340270996e-06, "ewc_loss_parallel": 9.380266419611871e-06, "grad_norm": 3.4127914905548096, "learning_rate": 1.88639253921153e-07, "loss": 0.5612, "mean_token_accuracy": 0.8264243006706238, "num_tokens": 16982214.0, "step": 446 }, { "epoch": 0.05686299452995802, "ewc_loss": 0.003161229193210602, "ewc_loss_diag": 2.250075340270996e-06, "ewc_loss_parallel": 9.18187197385123e-06, "grad_norm": 3.174692392349243, "learning_rate": 1.890631623569309e-07, "loss": 0.5699, "mean_token_accuracy": 0.8215911388397217, "num_tokens": 17024367.0, "step": 447 }, { "epoch": 0.05699020480854853, "ewc_loss": 0.0031503764912486076, "ewc_loss_diag": 2.250075340270996e-06, "ewc_loss_parallel": 9.073344699572772e-06, "grad_norm": 3.7411179542541504, "learning_rate": 1.8948707079270875e-07, "loss": 0.5917, "mean_token_accuracy": 0.8116321563720703, "num_tokens": 17057451.0, "step": 448 }, { "epoch": 0.057117415087139044, "ewc_loss": 0.0031890792306512594, "ewc_loss_diag": 2.2649765014648438e-06, "ewc_loss_parallel": 9.307784239354078e-06, "grad_norm": 4.999667167663574, "learning_rate": 1.8991097922848665e-07, "loss": 0.5432, "mean_token_accuracy": 0.8294850587844849, "num_tokens": 17094633.0, "step": 449 }, { "epoch": 0.05724462536572955, "ewc_loss": 0.003240788821130991, "ewc_loss_diag": 2.2649765014648438e-06, "ewc_loss_parallel": 9.824880180531181e-06, "grad_norm": 3.4224321842193604, "learning_rate": 1.903348876642645e-07, "loss": 0.5976, "mean_token_accuracy": 0.8109928369522095, "num_tokens": 17135892.0, "step": 450 }, { "epoch": 0.05737183564432006, "ewc_loss": 0.0032342884223908186, "ewc_loss_diag": 2.294778823852539e-06, "ewc_loss_parallel": 9.45470037549967e-06, "grad_norm": 3.645259380340576, "learning_rate": 1.907587961000424e-07, "loss": 0.5409, "mean_token_accuracy": 0.8312498927116394, "num_tokens": 17173881.0, "step": 451 }, { "epoch": 0.05749904592291057, "ewc_loss": 0.0032378537580370903, "ewc_loss_diag": 2.3096799850463867e-06, "ewc_loss_parallel": 9.337767551187426e-06, "grad_norm": 3.529710054397583, "learning_rate": 1.9118270453582024e-07, "loss": 0.5895, "mean_token_accuracy": 0.8168673515319824, "num_tokens": 17215668.0, "step": 452 }, { "epoch": 0.05762625620150108, "ewc_loss": 0.0032319859601557255, "ewc_loss_diag": 2.3096799850463867e-06, "ewc_loss_parallel": 9.279087862523738e-06, "grad_norm": 3.244605541229248, "learning_rate": 1.9160661297159814e-07, "loss": 0.6126, "mean_token_accuracy": 0.8112525939941406, "num_tokens": 17254339.0, "step": 453 }, { "epoch": 0.057753466480091595, "ewc_loss": 0.0032548359595239162, "ewc_loss_diag": 2.339482307434082e-06, "ewc_loss_parallel": 9.202411092701368e-06, "grad_norm": 3.4015936851501465, "learning_rate": 1.9203052140737599e-07, "loss": 0.5986, "mean_token_accuracy": 0.8118586540222168, "num_tokens": 17288858.0, "step": 454 }, { "epoch": 0.0578806767586821, "ewc_loss": 0.0032655461691319942, "ewc_loss_diag": 2.339482307434082e-06, "ewc_loss_parallel": 9.309513188782148e-06, "grad_norm": 3.654918909072876, "learning_rate": 1.9245442984315389e-07, "loss": 0.575, "mean_token_accuracy": 0.8232377767562866, "num_tokens": 17330601.0, "step": 455 }, { "epoch": 0.05800788703727261, "ewc_loss": 0.003297105897217989, "ewc_loss_diag": 2.3543834686279297e-06, "ewc_loss_parallel": 9.472524652665015e-06, "grad_norm": 3.442134380340576, "learning_rate": 1.9287833827893173e-07, "loss": 0.5947, "mean_token_accuracy": 0.8137983679771423, "num_tokens": 17367797.0, "step": 456 }, { "epoch": 0.058135097315863124, "ewc_loss": 0.0032970779575407505, "ewc_loss_diag": 2.3543834686279297e-06, "ewc_loss_parallel": 9.472245437791571e-06, "grad_norm": 3.225688934326172, "learning_rate": 1.9330224671470963e-07, "loss": 0.5801, "mean_token_accuracy": 0.8187899589538574, "num_tokens": 17403206.0, "step": 457 }, { "epoch": 0.05826230759445363, "ewc_loss": 0.0032850943971425295, "ewc_loss_diag": 2.3543834686279297e-06, "ewc_loss_parallel": 9.352409506391268e-06, "grad_norm": 3.2587809562683105, "learning_rate": 1.9372615515048748e-07, "loss": 0.6253, "mean_token_accuracy": 0.8083198666572571, "num_tokens": 17437207.0, "step": 458 }, { "epoch": 0.05838951787304414, "ewc_loss": 0.0033034924417734146, "ewc_loss_diag": 2.3692846298217773e-06, "ewc_loss_parallel": 9.383801625517663e-06, "grad_norm": 3.085792064666748, "learning_rate": 1.9415006358626535e-07, "loss": 0.5616, "mean_token_accuracy": 0.8264418840408325, "num_tokens": 17473335.0, "step": 459 }, { "epoch": 0.05851672815163465, "ewc_loss": 0.003305269405245781, "ewc_loss_diag": 2.3692846298217773e-06, "ewc_loss_parallel": 9.401571333000902e-06, "grad_norm": 3.862527370452881, "learning_rate": 1.9457397202204322e-07, "loss": 0.5525, "mean_token_accuracy": 0.8235137462615967, "num_tokens": 17505104.0, "step": 460 }, { "epoch": 0.05864393843022516, "ewc_loss": 0.003342454321682453, "ewc_loss_diag": 2.3692846298217773e-06, "ewc_loss_parallel": 9.773421879799571e-06, "grad_norm": 3.37754225730896, "learning_rate": 1.949978804578211e-07, "loss": 0.5352, "mean_token_accuracy": 0.8340467810630798, "num_tokens": 17539052.0, "step": 461 }, { "epoch": 0.058771148708815675, "ewc_loss": 0.0033540052827447653, "ewc_loss_diag": 2.384185791015625e-06, "ewc_loss_parallel": 9.736341780808289e-06, "grad_norm": 2.864123582839966, "learning_rate": 1.9542178889359897e-07, "loss": 0.6092, "mean_token_accuracy": 0.8092330098152161, "num_tokens": 17578942.0, "step": 462 }, { "epoch": 0.05889835898740618, "ewc_loss": 0.003345490200445056, "ewc_loss_diag": 2.3990869522094727e-06, "ewc_loss_parallel": 9.49860259424895e-06, "grad_norm": 3.430108070373535, "learning_rate": 1.9584569732937684e-07, "loss": 0.5867, "mean_token_accuracy": 0.820766270160675, "num_tokens": 17611931.0, "step": 463 }, { "epoch": 0.05902556926599669, "ewc_loss": 0.0033713988959789276, "ewc_loss_diag": 2.3990869522094727e-06, "ewc_loss_parallel": 9.757689440448303e-06, "grad_norm": 3.5813357830047607, "learning_rate": 1.962696057651547e-07, "loss": 0.4937, "mean_token_accuracy": 0.8450038433074951, "num_tokens": 17648764.0, "step": 464 }, { "epoch": 0.059152779544587204, "ewc_loss": 0.0033926498144865036, "ewc_loss_diag": 2.3990869522094727e-06, "ewc_loss_parallel": 9.97019924398046e-06, "grad_norm": 3.708550453186035, "learning_rate": 1.9669351420093258e-07, "loss": 0.6037, "mean_token_accuracy": 0.8116068840026855, "num_tokens": 17683099.0, "step": 465 }, { "epoch": 0.05927998982317771, "ewc_loss": 0.003404866671189666, "ewc_loss_diag": 2.3990869522094727e-06, "ewc_loss_parallel": 1.009236802929081e-05, "grad_norm": 3.3611879348754883, "learning_rate": 1.9711742263671046e-07, "loss": 0.5892, "mean_token_accuracy": 0.820915699005127, "num_tokens": 17727120.0, "step": 466 }, { "epoch": 0.059407200101768226, "ewc_loss": 0.0034192318562418222, "ewc_loss_diag": 2.428889274597168e-06, "ewc_loss_parallel": 9.930843589245342e-06, "grad_norm": 3.6998605728149414, "learning_rate": 1.9754133107248833e-07, "loss": 0.5428, "mean_token_accuracy": 0.8236098289489746, "num_tokens": 17761676.0, "step": 467 }, { "epoch": 0.059534410380358734, "ewc_loss": 0.003431859891861677, "ewc_loss_diag": 2.428889274597168e-06, "ewc_loss_parallel": 1.0057123290607706e-05, "grad_norm": 2.9842381477355957, "learning_rate": 1.979652395082662e-07, "loss": 0.6113, "mean_token_accuracy": 0.8114162683486938, "num_tokens": 17802311.0, "step": 468 }, { "epoch": 0.05966162065894924, "ewc_loss": 0.0034369814675301313, "ewc_loss_diag": 2.4586915969848633e-06, "ewc_loss_parallel": 9.80316417553695e-06, "grad_norm": 3.074521064758301, "learning_rate": 1.9838914794404408e-07, "loss": 0.5028, "mean_token_accuracy": 0.8386564254760742, "num_tokens": 17839048.0, "step": 469 }, { "epoch": 0.059788830937539755, "ewc_loss": 0.0034541497007012367, "ewc_loss_diag": 2.473592758178711e-06, "ewc_loss_parallel": 9.822259926295374e-06, "grad_norm": 3.33284854888916, "learning_rate": 1.9881305637982195e-07, "loss": 0.5739, "mean_token_accuracy": 0.8211065530776978, "num_tokens": 17881510.0, "step": 470 }, { "epoch": 0.05991604121613026, "ewc_loss": 0.003470932599157095, "ewc_loss_diag": 2.473592758178711e-06, "ewc_loss_parallel": 9.990087164624128e-06, "grad_norm": 4.040644645690918, "learning_rate": 1.9923696481559982e-07, "loss": 0.5986, "mean_token_accuracy": 0.8155204653739929, "num_tokens": 17915080.0, "step": 471 }, { "epoch": 0.06004325149472077, "ewc_loss": 0.0035111838951706886, "ewc_loss_diag": 2.473592758178711e-06, "ewc_loss_parallel": 1.0392602234787773e-05, "grad_norm": 3.1708600521087646, "learning_rate": 1.996608732513777e-07, "loss": 0.6315, "mean_token_accuracy": 0.8052581548690796, "num_tokens": 17957972.0, "step": 472 }, { "epoch": 0.060170461773311285, "ewc_loss": 0.0034818940330296755, "ewc_loss_diag": 2.473592758178711e-06, "ewc_loss_parallel": 1.0099702194565907e-05, "grad_norm": 3.5196192264556885, "learning_rate": 2.0008478168715557e-07, "loss": 0.5041, "mean_token_accuracy": 0.8380740284919739, "num_tokens": 17998008.0, "step": 473 }, { "epoch": 0.06029767205190179, "ewc_loss": 0.003467596136033535, "ewc_loss_diag": 2.4586915969848633e-06, "ewc_loss_parallel": 1.0109310096595436e-05, "grad_norm": 3.2338521480560303, "learning_rate": 2.0050869012293344e-07, "loss": 0.5098, "mean_token_accuracy": 0.8358805775642395, "num_tokens": 18032427.0, "step": 474 }, { "epoch": 0.060424882330492306, "ewc_loss": 0.0034713742788881063, "ewc_loss_diag": 2.473592758178711e-06, "ewc_loss_parallel": 9.99450458039064e-06, "grad_norm": 2.9943535327911377, "learning_rate": 2.009325985587113e-07, "loss": 0.5404, "mean_token_accuracy": 0.8296852111816406, "num_tokens": 18069848.0, "step": 475 }, { "epoch": 0.060552092609082814, "ewc_loss": 0.0034928172826766968, "ewc_loss_diag": 2.5033950805664062e-06, "ewc_loss_parallel": 9.903759746521246e-06, "grad_norm": 3.113913059234619, "learning_rate": 2.0135650699448918e-07, "loss": 0.6324, "mean_token_accuracy": 0.8032423853874207, "num_tokens": 18109240.0, "step": 476 }, { "epoch": 0.06067930288767332, "ewc_loss": 0.003518416080623865, "ewc_loss_diag": 2.518296241760254e-06, "ewc_loss_parallel": 1.000716019916581e-05, "grad_norm": 3.3265750408172607, "learning_rate": 2.0178041543026706e-07, "loss": 0.6028, "mean_token_accuracy": 0.8144127726554871, "num_tokens": 18146849.0, "step": 477 }, { "epoch": 0.060806513166263836, "ewc_loss": 0.0035495301708579063, "ewc_loss_diag": 2.5331974029541016e-06, "ewc_loss_parallel": 1.0165711501031183e-05, "grad_norm": 3.879936695098877, "learning_rate": 2.022043238660449e-07, "loss": 0.6335, "mean_token_accuracy": 0.8027031421661377, "num_tokens": 18186872.0, "step": 478 }, { "epoch": 0.06093372344485434, "ewc_loss": 0.0035802386701107025, "ewc_loss_diag": 2.5331974029541016e-06, "ewc_loss_parallel": 1.0472797839611303e-05, "grad_norm": 3.1708314418792725, "learning_rate": 2.026282323018228e-07, "loss": 0.6085, "mean_token_accuracy": 0.8123996257781982, "num_tokens": 18222299.0, "step": 479 }, { "epoch": 0.06106093372344486, "ewc_loss": 0.0035693012177944183, "ewc_loss_diag": 2.5480985641479492e-06, "ewc_loss_parallel": 1.0210835171164945e-05, "grad_norm": 2.819204330444336, "learning_rate": 2.0305214073760065e-07, "loss": 0.6125, "mean_token_accuracy": 0.8130348920822144, "num_tokens": 18261645.0, "step": 480 }, { "epoch": 0.061188144002035365, "ewc_loss": 0.0035452605225145817, "ewc_loss_diag": 2.5480985641479492e-06, "ewc_loss_parallel": 9.970428436645307e-06, "grad_norm": 3.1198935508728027, "learning_rate": 2.0347604917337855e-07, "loss": 0.5829, "mean_token_accuracy": 0.8179699182510376, "num_tokens": 18292970.0, "step": 481 }, { "epoch": 0.06131535428062587, "ewc_loss": 0.0035784179344773293, "ewc_loss_diag": 2.562999725341797e-06, "ewc_loss_parallel": 1.0149413355975412e-05, "grad_norm": 3.032057762145996, "learning_rate": 2.038999576091564e-07, "loss": 0.5313, "mean_token_accuracy": 0.8359009027481079, "num_tokens": 18331133.0, "step": 482 }, { "epoch": 0.06144256455921639, "ewc_loss": 0.0035868328996002674, "ewc_loss_diag": 2.562999725341797e-06, "ewc_loss_parallel": 1.0233562534267548e-05, "grad_norm": 3.022618055343628, "learning_rate": 2.043238660449343e-07, "loss": 0.6165, "mean_token_accuracy": 0.8095209002494812, "num_tokens": 18368905.0, "step": 483 }, { "epoch": 0.061569774837806894, "ewc_loss": 0.0036078165285289288, "ewc_loss_diag": 2.5779008865356445e-06, "ewc_loss_parallel": 1.029081340675475e-05, "grad_norm": 3.3461272716522217, "learning_rate": 2.0474777448071214e-07, "loss": 0.5416, "mean_token_accuracy": 0.832404375076294, "num_tokens": 18406362.0, "step": 484 }, { "epoch": 0.0616969851163974, "ewc_loss": 0.0036171022802591324, "ewc_loss_diag": 2.562999725341797e-06, "ewc_loss_parallel": 1.0536256013438106e-05, "grad_norm": 3.0446689128875732, "learning_rate": 2.0517168291649004e-07, "loss": 0.5776, "mean_token_accuracy": 0.8211380839347839, "num_tokens": 18439246.0, "step": 485 }, { "epoch": 0.061824195394987916, "ewc_loss": 0.0036085406318306923, "ewc_loss_diag": 2.562999725341797e-06, "ewc_loss_parallel": 1.045063982019201e-05, "grad_norm": 3.4936890602111816, "learning_rate": 2.0559559135226788e-07, "loss": 0.5825, "mean_token_accuracy": 0.8164374828338623, "num_tokens": 18473408.0, "step": 486 }, { "epoch": 0.06195140567357842, "ewc_loss": 0.003692519385367632, "ewc_loss_diag": 2.6226043701171875e-06, "ewc_loss_parallel": 1.0680077139113564e-05, "grad_norm": 3.6321425437927246, "learning_rate": 2.0601949978804578e-07, "loss": 0.5368, "mean_token_accuracy": 0.8303638696670532, "num_tokens": 18513246.0, "step": 487 }, { "epoch": 0.06207861595216894, "ewc_loss": 0.003706165123730898, "ewc_loss_diag": 2.6226043701171875e-06, "ewc_loss_parallel": 1.0816534995683469e-05, "grad_norm": 3.1562488079071045, "learning_rate": 2.0644340822382363e-07, "loss": 0.5085, "mean_token_accuracy": 0.8386433720588684, "num_tokens": 18549125.0, "step": 488 }, { "epoch": 0.062205826230759445, "ewc_loss": 0.003683242481201887, "ewc_loss_diag": 2.6226043701171875e-06, "ewc_loss_parallel": 1.0587308679532725e-05, "grad_norm": 3.3372583389282227, "learning_rate": 2.0686731665960153e-07, "loss": 0.5588, "mean_token_accuracy": 0.8302863240242004, "num_tokens": 18585306.0, "step": 489 }, { "epoch": 0.06233303650934995, "ewc_loss": 0.0036890716291964054, "ewc_loss_diag": 2.6226043701171875e-06, "ewc_loss_parallel": 1.0645598194969352e-05, "grad_norm": 3.456542491912842, "learning_rate": 2.0729122509537937e-07, "loss": 0.518, "mean_token_accuracy": 0.8381150960922241, "num_tokens": 18624378.0, "step": 490 }, { "epoch": 0.06246024678794047, "ewc_loss": 0.003716144012287259, "ewc_loss_diag": 2.637505531311035e-06, "ewc_loss_parallel": 1.0763735190266743e-05, "grad_norm": 3.2590997219085693, "learning_rate": 2.0771513353115727e-07, "loss": 0.5608, "mean_token_accuracy": 0.8239294290542603, "num_tokens": 18660814.0, "step": 491 }, { "epoch": 0.06258745706653097, "ewc_loss": 0.0037447551731020212, "ewc_loss_diag": 2.6673078536987305e-06, "ewc_loss_parallel": 1.074467036232818e-05, "grad_norm": 3.0768227577209473, "learning_rate": 2.0813904196693512e-07, "loss": 0.5309, "mean_token_accuracy": 0.8340431451797485, "num_tokens": 18702129.0, "step": 492 }, { "epoch": 0.06271466734512149, "ewc_loss": 0.0037314919754862785, "ewc_loss_diag": 2.6673078536987305e-06, "ewc_loss_parallel": 1.0612037840473931e-05, "grad_norm": 3.888676643371582, "learning_rate": 2.0856295040271302e-07, "loss": 0.6004, "mean_token_accuracy": 0.8127939105033875, "num_tokens": 18737496.0, "step": 493 }, { "epoch": 0.06284187762371199, "ewc_loss": 0.0037872614338994026, "ewc_loss_diag": 2.682209014892578e-06, "ewc_loss_parallel": 1.1017144970537629e-05, "grad_norm": 2.969174385070801, "learning_rate": 2.0898685883849086e-07, "loss": 0.5591, "mean_token_accuracy": 0.8283412456512451, "num_tokens": 18779332.0, "step": 494 }, { "epoch": 0.0629690879023025, "ewc_loss": 0.00377003476023674, "ewc_loss_diag": 2.6971101760864258e-06, "ewc_loss_parallel": 1.0692291652958374e-05, "grad_norm": 2.6666314601898193, "learning_rate": 2.0941076727426874e-07, "loss": 0.5446, "mean_token_accuracy": 0.8287274837493896, "num_tokens": 18817976.0, "step": 495 }, { "epoch": 0.06309629818089302, "ewc_loss": 0.0037318323738873005, "ewc_loss_diag": 2.682209014892578e-06, "ewc_loss_parallel": 1.0462853424542118e-05, "grad_norm": 2.6276748180389404, "learning_rate": 2.098346757100466e-07, "loss": 0.5354, "mean_token_accuracy": 0.8313175439834595, "num_tokens": 18862860.0, "step": 496 }, { "epoch": 0.06322350845948353, "ewc_loss": 0.003751581534743309, "ewc_loss_diag": 2.6971101760864258e-06, "ewc_loss_parallel": 1.0507756996958051e-05, "grad_norm": 3.2995684146881104, "learning_rate": 2.1025858414582448e-07, "loss": 0.5757, "mean_token_accuracy": 0.8190782070159912, "num_tokens": 18899025.0, "step": 497 }, { "epoch": 0.06335071873807403, "ewc_loss": 0.0038216104730963707, "ewc_loss_diag": 2.7120113372802734e-06, "ewc_loss_parallel": 1.1055460163333919e-05, "grad_norm": 3.1601438522338867, "learning_rate": 2.1068249258160238e-07, "loss": 0.5416, "mean_token_accuracy": 0.8236838579177856, "num_tokens": 18933118.0, "step": 498 }, { "epoch": 0.06347792901666455, "ewc_loss": 0.0038458886556327343, "ewc_loss_diag": 2.726912498474121e-06, "ewc_loss_parallel": 1.1145653843414038e-05, "grad_norm": 3.410125494003296, "learning_rate": 2.1110640101738023e-07, "loss": 0.5425, "mean_token_accuracy": 0.8309837579727173, "num_tokens": 18969165.0, "step": 499 }, { "epoch": 0.06360513929525506, "ewc_loss": 0.0038589737378060818, "ewc_loss_diag": 2.726912498474121e-06, "ewc_loss_parallel": 1.1276505574642215e-05, "grad_norm": 3.0754873752593994, "learning_rate": 2.1153030945315813e-07, "loss": 0.5517, "mean_token_accuracy": 0.8257801532745361, "num_tokens": 19003882.0, "step": 500 }, { "epoch": 0.06373234957384556, "ewc_loss": 0.003868822008371353, "ewc_loss_diag": 2.7567148208618164e-06, "ewc_loss_parallel": 1.1069811989727896e-05, "grad_norm": 3.097599506378174, "learning_rate": 2.1195421788893597e-07, "loss": 0.5518, "mean_token_accuracy": 0.8287639617919922, "num_tokens": 19037540.0, "step": 501 }, { "epoch": 0.06385955985243608, "ewc_loss": 0.0038839601911604404, "ewc_loss_diag": 2.771615982055664e-06, "ewc_loss_parallel": 1.1068604180763941e-05, "grad_norm": 2.9484610557556152, "learning_rate": 2.1237812632471387e-07, "loss": 0.5631, "mean_token_accuracy": 0.8204498887062073, "num_tokens": 19077270.0, "step": 502 }, { "epoch": 0.06398677013102659, "ewc_loss": 0.0038760448805987835, "ewc_loss_diag": 2.771615982055664e-06, "ewc_loss_parallel": 1.0989451766363345e-05, "grad_norm": 3.1495280265808105, "learning_rate": 2.1280203476049172e-07, "loss": 0.6495, "mean_token_accuracy": 0.7939567565917969, "num_tokens": 19117579.0, "step": 503 }, { "epoch": 0.06411398040961709, "ewc_loss": 0.003908381797373295, "ewc_loss_diag": 2.7865171432495117e-06, "ewc_loss_parallel": 1.1160236681462266e-05, "grad_norm": 2.9667022228240967, "learning_rate": 2.1322594319626962e-07, "loss": 0.5896, "mean_token_accuracy": 0.8150712251663208, "num_tokens": 19156599.0, "step": 504 }, { "epoch": 0.0642411906882076, "ewc_loss": 0.003920430317521095, "ewc_loss_diag": 2.8014183044433594e-06, "ewc_loss_parallel": 1.1128128790005576e-05, "grad_norm": 3.1838626861572266, "learning_rate": 2.1364985163204746e-07, "loss": 0.5798, "mean_token_accuracy": 0.8175379633903503, "num_tokens": 19187367.0, "step": 505 }, { "epoch": 0.06436840096679812, "ewc_loss": 0.003966881427913904, "ewc_loss_diag": 2.8312206268310547e-06, "ewc_loss_parallel": 1.1287466804787982e-05, "grad_norm": 3.1497244834899902, "learning_rate": 2.1407376006782536e-07, "loss": 0.5161, "mean_token_accuracy": 0.8359538316726685, "num_tokens": 19223916.0, "step": 506 }, { "epoch": 0.06449561124538863, "ewc_loss": 0.0039722248911857605, "ewc_loss_diag": 2.8312206268310547e-06, "ewc_loss_parallel": 1.1340901437506545e-05, "grad_norm": 2.9760940074920654, "learning_rate": 2.144976685036032e-07, "loss": 0.5304, "mean_token_accuracy": 0.833823561668396, "num_tokens": 19260336.0, "step": 507 }, { "epoch": 0.06462282152397913, "ewc_loss": 0.003976789303123951, "ewc_loss_diag": 2.8461217880249023e-06, "ewc_loss_parallel": 1.1233959412493277e-05, "grad_norm": 2.907121181488037, "learning_rate": 2.149215769393811e-07, "loss": 0.592, "mean_token_accuracy": 0.8167400360107422, "num_tokens": 19299895.0, "step": 508 }, { "epoch": 0.06475003180256965, "ewc_loss": 0.0039898972027003765, "ewc_loss_diag": 2.86102294921875e-06, "ewc_loss_parallel": 1.1212448953301646e-05, "grad_norm": 3.053846597671509, "learning_rate": 2.1534548537515895e-07, "loss": 0.5344, "mean_token_accuracy": 0.8307197690010071, "num_tokens": 19336614.0, "step": 509 }, { "epoch": 0.06487724208116016, "ewc_loss": 0.004020295105874538, "ewc_loss_diag": 2.8908252716064453e-06, "ewc_loss_parallel": 1.1363842531864066e-05, "grad_norm": 2.95607328414917, "learning_rate": 2.1576939381093685e-07, "loss": 0.5852, "mean_token_accuracy": 0.8187041282653809, "num_tokens": 19378683.0, "step": 510 }, { "epoch": 0.06500445235975066, "ewc_loss": 0.004066110588610172, "ewc_loss_diag": 2.9355287551879883e-06, "ewc_loss_parallel": 1.1364231795596424e-05, "grad_norm": 3.0296993255615234, "learning_rate": 2.161933022467147e-07, "loss": 0.569, "mean_token_accuracy": 0.8193347454071045, "num_tokens": 19422598.0, "step": 511 }, { "epoch": 0.06513166263834118, "ewc_loss": 0.004076512064784765, "ewc_loss_diag": 2.9355287551879883e-06, "ewc_loss_parallel": 1.1468244338175282e-05, "grad_norm": 2.6795499324798584, "learning_rate": 2.166172106824926e-07, "loss": 0.5596, "mean_token_accuracy": 0.8280370831489563, "num_tokens": 19466972.0, "step": 512 }, { "epoch": 0.06525887291693169, "ewc_loss": 0.004073937423527241, "ewc_loss_diag": 2.950429916381836e-06, "ewc_loss_parallel": 1.1289908798062243e-05, "grad_norm": 3.354904890060425, "learning_rate": 2.1704111911827044e-07, "loss": 0.5495, "mean_token_accuracy": 0.8231508731842041, "num_tokens": 19503809.0, "step": 513 }, { "epoch": 0.0653860831955222, "ewc_loss": 0.004154370166361332, "ewc_loss_diag": 2.9802322387695312e-06, "ewc_loss_parallel": 1.1789063137257472e-05, "grad_norm": 2.7843871116638184, "learning_rate": 2.1746502755404831e-07, "loss": 0.6052, "mean_token_accuracy": 0.8172324299812317, "num_tokens": 19545713.0, "step": 514 }, { "epoch": 0.06551329347411271, "ewc_loss": 0.00414213165640831, "ewc_loss_diag": 2.995133399963379e-06, "ewc_loss_parallel": 1.1514092875586357e-05, "grad_norm": 2.877659797668457, "learning_rate": 2.178889359898262e-07, "loss": 0.5917, "mean_token_accuracy": 0.819057285785675, "num_tokens": 19582272.0, "step": 515 }, { "epoch": 0.06564050375270322, "ewc_loss": 0.004145100712776184, "ewc_loss_diag": 2.995133399963379e-06, "ewc_loss_parallel": 1.1543782420631032e-05, "grad_norm": 2.6953189373016357, "learning_rate": 2.1831284442560406e-07, "loss": 0.6071, "mean_token_accuracy": 0.8126010894775391, "num_tokens": 19625574.0, "step": 516 }, { "epoch": 0.06576771403129372, "ewc_loss": 0.004168424755334854, "ewc_loss_diag": 3.0249357223510742e-06, "ewc_loss_parallel": 1.1471843208710197e-05, "grad_norm": 2.9416701793670654, "learning_rate": 2.1873675286138193e-07, "loss": 0.5946, "mean_token_accuracy": 0.8123615384101868, "num_tokens": 19667791.0, "step": 517 }, { "epoch": 0.06589492430988424, "ewc_loss": 0.004164651036262512, "ewc_loss_diag": 2.995133399963379e-06, "ewc_loss_parallel": 1.1739283763745334e-05, "grad_norm": 3.116586446762085, "learning_rate": 2.191606612971598e-07, "loss": 0.6229, "mean_token_accuracy": 0.8065471053123474, "num_tokens": 19708289.0, "step": 518 }, { "epoch": 0.06602213458847475, "ewc_loss": 0.004202334210276604, "ewc_loss_diag": 3.0100345611572266e-06, "ewc_loss_parallel": 1.1963528777414467e-05, "grad_norm": 2.859858989715576, "learning_rate": 2.1958456973293768e-07, "loss": 0.6019, "mean_token_accuracy": 0.8081693649291992, "num_tokens": 19747444.0, "step": 519 }, { "epoch": 0.06614934486706527, "ewc_loss": 0.004187216050922871, "ewc_loss_diag": 3.0100345611572266e-06, "ewc_loss_parallel": 1.1812344382633455e-05, "grad_norm": 2.8844332695007324, "learning_rate": 2.2000847816871555e-07, "loss": 0.4905, "mean_token_accuracy": 0.8415383100509644, "num_tokens": 19787061.0, "step": 520 }, { "epoch": 0.06627655514565577, "ewc_loss": 0.004204038064926863, "ewc_loss_diag": 3.0249357223510742e-06, "ewc_loss_parallel": 1.182797768706223e-05, "grad_norm": 2.794785499572754, "learning_rate": 2.2043238660449342e-07, "loss": 0.5326, "mean_token_accuracy": 0.8341606855392456, "num_tokens": 19826893.0, "step": 521 }, { "epoch": 0.06640376542424628, "ewc_loss": 0.004217760171741247, "ewc_loss_diag": 3.039836883544922e-06, "ewc_loss_parallel": 1.1812611774075776e-05, "grad_norm": 2.8643486499786377, "learning_rate": 2.208562950402713e-07, "loss": 0.5257, "mean_token_accuracy": 0.8315991163253784, "num_tokens": 19864285.0, "step": 522 }, { "epoch": 0.0665309757028368, "ewc_loss": 0.004210916347801685, "ewc_loss_diag": 3.0249357223510742e-06, "ewc_loss_parallel": 1.1896762771357317e-05, "grad_norm": 2.777371644973755, "learning_rate": 2.2128020347604917e-07, "loss": 0.561, "mean_token_accuracy": 0.8250152468681335, "num_tokens": 19903806.0, "step": 523 }, { "epoch": 0.0666581859814273, "ewc_loss": 0.004251124802976847, "ewc_loss_diag": 3.069639205932617e-06, "ewc_loss_parallel": 1.1841083505714778e-05, "grad_norm": 2.7057785987854004, "learning_rate": 2.2170411191182704e-07, "loss": 0.5385, "mean_token_accuracy": 0.8292194604873657, "num_tokens": 19950382.0, "step": 524 }, { "epoch": 0.06678539626001781, "ewc_loss": 0.0042534079402685165, "ewc_loss_diag": 3.069639205932617e-06, "ewc_loss_parallel": 1.186391273222398e-05, "grad_norm": 2.8254244327545166, "learning_rate": 2.221280203476049e-07, "loss": 0.5624, "mean_token_accuracy": 0.8215856552124023, "num_tokens": 19984765.0, "step": 525 }, { "epoch": 0.06691260653860832, "ewc_loss": 0.004267833661288023, "ewc_loss_diag": 3.069639205932617e-06, "ewc_loss_parallel": 1.2008172234345693e-05, "grad_norm": 2.864499807357788, "learning_rate": 2.2255192878338279e-07, "loss": 0.602, "mean_token_accuracy": 0.8145250082015991, "num_tokens": 20021081.0, "step": 526 }, { "epoch": 0.06703981681719882, "ewc_loss": 0.004278817214071751, "ewc_loss_diag": 3.069639205932617e-06, "ewc_loss_parallel": 1.2118008271500003e-05, "grad_norm": 3.246086835861206, "learning_rate": 2.2297583721916066e-07, "loss": 0.573, "mean_token_accuracy": 0.8197579383850098, "num_tokens": 20055860.0, "step": 527 }, { "epoch": 0.06716702709578934, "ewc_loss": 0.004326630383729935, "ewc_loss_diag": 3.084540367126465e-06, "ewc_loss_parallel": 1.2443549167073797e-05, "grad_norm": 2.8487188816070557, "learning_rate": 2.2339974565493853e-07, "loss": 0.53, "mean_token_accuracy": 0.8348056674003601, "num_tokens": 20095023.0, "step": 528 }, { "epoch": 0.06729423737437985, "ewc_loss": 0.004300939850509167, "ewc_loss_diag": 3.084540367126465e-06, "ewc_loss_parallel": 1.2186646927148104e-05, "grad_norm": 3.033388376235962, "learning_rate": 2.238236540907164e-07, "loss": 0.5839, "mean_token_accuracy": 0.8179565072059631, "num_tokens": 20134620.0, "step": 529 }, { "epoch": 0.06742144765297035, "ewc_loss": 0.004307577386498451, "ewc_loss_diag": 3.084540367126465e-06, "ewc_loss_parallel": 1.225302094098879e-05, "grad_norm": 3.1545519828796387, "learning_rate": 2.2424756252649428e-07, "loss": 0.5536, "mean_token_accuracy": 0.8236511945724487, "num_tokens": 20169690.0, "step": 530 }, { "epoch": 0.06754865793156087, "ewc_loss": 0.004319524392485619, "ewc_loss_diag": 3.084540367126465e-06, "ewc_loss_parallel": 1.2372492165013682e-05, "grad_norm": 3.198810577392578, "learning_rate": 2.2467147096227215e-07, "loss": 0.5463, "mean_token_accuracy": 0.828449010848999, "num_tokens": 20203609.0, "step": 531 }, { "epoch": 0.06767586821015138, "ewc_loss": 0.004359125625342131, "ewc_loss_diag": 3.11434268951416e-06, "ewc_loss_parallel": 1.2463327948353253e-05, "grad_norm": 3.0808634757995605, "learning_rate": 2.2509537939805002e-07, "loss": 0.552, "mean_token_accuracy": 0.8242069482803345, "num_tokens": 20234616.0, "step": 532 }, { "epoch": 0.0678030784887419, "ewc_loss": 0.004352118819952011, "ewc_loss_diag": 3.11434268951416e-06, "ewc_loss_parallel": 1.239325683854986e-05, "grad_norm": 2.8505446910858154, "learning_rate": 2.2551928783382787e-07, "loss": 0.5442, "mean_token_accuracy": 0.8258719444274902, "num_tokens": 20271634.0, "step": 533 }, { "epoch": 0.0679302887673324, "ewc_loss": 0.0043379440903663635, "ewc_loss_diag": 3.11434268951416e-06, "ewc_loss_parallel": 1.2251510270289145e-05, "grad_norm": 2.9603431224823, "learning_rate": 2.2594319626960577e-07, "loss": 0.5716, "mean_token_accuracy": 0.8243416547775269, "num_tokens": 20309683.0, "step": 534 }, { "epoch": 0.06805749904592291, "ewc_loss": 0.004346305504441261, "ewc_loss_diag": 3.11434268951416e-06, "ewc_loss_parallel": 1.2335127394180745e-05, "grad_norm": 3.0033881664276123, "learning_rate": 2.263671047053836e-07, "loss": 0.4858, "mean_token_accuracy": 0.8411418795585632, "num_tokens": 20346013.0, "step": 535 }, { "epoch": 0.06818470932451343, "ewc_loss": 0.004388018511235714, "ewc_loss_diag": 3.1441450119018555e-06, "ewc_loss_parallel": 1.244707982550608e-05, "grad_norm": 3.3849146366119385, "learning_rate": 2.267910131411615e-07, "loss": 0.5427, "mean_token_accuracy": 0.8320711851119995, "num_tokens": 20387613.0, "step": 536 }, { "epoch": 0.06831191960310393, "ewc_loss": 0.004416811279952526, "ewc_loss_diag": 3.1441450119018555e-06, "ewc_loss_parallel": 1.2735009477182757e-05, "grad_norm": 3.0446419715881348, "learning_rate": 2.2721492157693936e-07, "loss": 0.5419, "mean_token_accuracy": 0.8289101123809814, "num_tokens": 20417579.0, "step": 537 }, { "epoch": 0.06843912988169444, "ewc_loss": 0.00443073408678174, "ewc_loss_diag": 3.1739473342895508e-06, "ewc_loss_parallel": 1.256905852642376e-05, "grad_norm": 2.6402692794799805, "learning_rate": 2.2763883001271726e-07, "loss": 0.5145, "mean_token_accuracy": 0.8352155089378357, "num_tokens": 20455100.0, "step": 538 }, { "epoch": 0.06856634016028496, "ewc_loss": 0.004406332038342953, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 1.2172452443337534e-05, "grad_norm": 2.632373094558716, "learning_rate": 2.280627384484951e-07, "loss": 0.576, "mean_token_accuracy": 0.8176059126853943, "num_tokens": 20496458.0, "step": 539 }, { "epoch": 0.06869355043887546, "ewc_loss": 0.0044115702621638775, "ewc_loss_diag": 3.1888484954833984e-06, "ewc_loss_parallel": 1.2224834790686145e-05, "grad_norm": 3.0737264156341553, "learning_rate": 2.28486646884273e-07, "loss": 0.5509, "mean_token_accuracy": 0.8252452611923218, "num_tokens": 20533758.0, "step": 540 }, { "epoch": 0.06882076071746597, "ewc_loss": 0.004491019994020462, "ewc_loss_diag": 3.2186508178710938e-06, "ewc_loss_parallel": 1.2714154763671104e-05, "grad_norm": 3.06501841545105, "learning_rate": 2.2891055532005085e-07, "loss": 0.5956, "mean_token_accuracy": 0.815894365310669, "num_tokens": 20567259.0, "step": 541 }, { "epoch": 0.06894797099605648, "ewc_loss": 0.004517350345849991, "ewc_loss_diag": 3.2335519790649414e-06, "ewc_loss_parallel": 1.2824872101191431e-05, "grad_norm": 3.157501459121704, "learning_rate": 2.2933446375582875e-07, "loss": 0.5768, "mean_token_accuracy": 0.8210927248001099, "num_tokens": 20599333.0, "step": 542 }, { "epoch": 0.06907518127464699, "ewc_loss": 0.004570087417960167, "ewc_loss_diag": 3.2782554626464844e-06, "ewc_loss_parallel": 1.2894474821223412e-05, "grad_norm": 2.9610400199890137, "learning_rate": 2.297583721916066e-07, "loss": 0.515, "mean_token_accuracy": 0.8379881978034973, "num_tokens": 20640195.0, "step": 543 }, { "epoch": 0.0692023915532375, "ewc_loss": 0.004570589866489172, "ewc_loss_diag": 3.293156623840332e-06, "ewc_loss_parallel": 1.2746912943839561e-05, "grad_norm": 2.6711180210113525, "learning_rate": 2.301822806273845e-07, "loss": 0.5349, "mean_token_accuracy": 0.8311406373977661, "num_tokens": 20682617.0, "step": 544 }, { "epoch": 0.06932960183182801, "ewc_loss": 0.004543165676295757, "ewc_loss_diag": 3.293156623840332e-06, "ewc_loss_parallel": 1.2472672096919268e-05, "grad_norm": 3.168767213821411, "learning_rate": 2.3060618906316234e-07, "loss": 0.5861, "mean_token_accuracy": 0.8197387456893921, "num_tokens": 20718032.0, "step": 545 }, { "epoch": 0.06945681211041853, "ewc_loss": 0.004603992681950331, "ewc_loss_diag": 3.3080577850341797e-06, "ewc_loss_parallel": 1.2928353498864453e-05, "grad_norm": 2.820096254348755, "learning_rate": 2.3103009749894024e-07, "loss": 0.5516, "mean_token_accuracy": 0.8268533945083618, "num_tokens": 20755190.0, "step": 546 }, { "epoch": 0.06958402238900903, "ewc_loss": 0.004570790566504002, "ewc_loss_diag": 3.293156623840332e-06, "ewc_loss_parallel": 1.2748919289151672e-05, "grad_norm": 2.6062512397766113, "learning_rate": 2.3145400593471808e-07, "loss": 0.5757, "mean_token_accuracy": 0.8178278207778931, "num_tokens": 20799088.0, "step": 547 }, { "epoch": 0.06971123266759954, "ewc_loss": 0.0045680515468120575, "ewc_loss_diag": 3.3080577850341797e-06, "ewc_loss_parallel": 1.256894574908074e-05, "grad_norm": 3.0717623233795166, "learning_rate": 2.3187791437049598e-07, "loss": 0.6049, "mean_token_accuracy": 0.8094191551208496, "num_tokens": 20831819.0, "step": 548 }, { "epoch": 0.06983844294619006, "ewc_loss": 0.004647730849683285, "ewc_loss_diag": 3.337860107421875e-06, "ewc_loss_parallel": 1.3060560377198271e-05, "grad_norm": 2.97390079498291, "learning_rate": 2.3230182280627383e-07, "loss": 0.534, "mean_token_accuracy": 0.8325163125991821, "num_tokens": 20861291.0, "step": 549 }, { "epoch": 0.06996565322478056, "ewc_loss": 0.004683961626142263, "ewc_loss_diag": 3.3676624298095703e-06, "ewc_loss_parallel": 1.311769119638484e-05, "grad_norm": 2.6817026138305664, "learning_rate": 2.327257312420517e-07, "loss": 0.5122, "mean_token_accuracy": 0.8395944833755493, "num_tokens": 20897446.0, "step": 550 }, { "epoch": 0.07009286350337107, "ewc_loss": 0.0046538254246115685, "ewc_loss_diag": 3.3676624298095703e-06, "ewc_loss_parallel": 1.281633012695238e-05, "grad_norm": 2.884331464767456, "learning_rate": 2.3314963967782957e-07, "loss": 0.5354, "mean_token_accuracy": 0.8275765180587769, "num_tokens": 20934423.0, "step": 551 }, { "epoch": 0.07022007378196159, "ewc_loss": 0.004670843482017517, "ewc_loss_diag": 3.3676624298095703e-06, "ewc_loss_parallel": 1.2986510228074621e-05, "grad_norm": 2.641345500946045, "learning_rate": 2.3357354811360745e-07, "loss": 0.5542, "mean_token_accuracy": 0.828999400138855, "num_tokens": 20978394.0, "step": 552 }, { "epoch": 0.07034728406055209, "ewc_loss": 0.004664319101721048, "ewc_loss_diag": 3.3676624298095703e-06, "ewc_loss_parallel": 1.2921266716148239e-05, "grad_norm": 3.4335739612579346, "learning_rate": 2.3399745654938532e-07, "loss": 0.5791, "mean_token_accuracy": 0.8197259306907654, "num_tokens": 21010669.0, "step": 553 }, { "epoch": 0.0704744943391426, "ewc_loss": 0.004752486944198608, "ewc_loss_diag": 3.382563591003418e-06, "ewc_loss_parallel": 1.3650360415340401e-05, "grad_norm": 2.743420362472534, "learning_rate": 2.344213649851632e-07, "loss": 0.5486, "mean_token_accuracy": 0.8295013904571533, "num_tokens": 21047696.0, "step": 554 }, { "epoch": 0.07060170461773312, "ewc_loss": 0.004718882497400045, "ewc_loss_diag": 3.3974647521972656e-06, "ewc_loss_parallel": 1.316172620136058e-05, "grad_norm": 2.6795005798339844, "learning_rate": 2.3484527342094106e-07, "loss": 0.6299, "mean_token_accuracy": 0.8074655532836914, "num_tokens": 21091437.0, "step": 555 }, { "epoch": 0.07072891489632362, "ewc_loss": 0.004708362277597189, "ewc_loss_diag": 3.3974647521972656e-06, "ewc_loss_parallel": 1.30565222207224e-05, "grad_norm": 2.745577573776245, "learning_rate": 2.3526918185671894e-07, "loss": 0.5906, "mean_token_accuracy": 0.8157890439033508, "num_tokens": 21128880.0, "step": 556 }, { "epoch": 0.07085612517491413, "ewc_loss": 0.0047651007771492004, "ewc_loss_diag": 3.4421682357788086e-06, "ewc_loss_parallel": 1.316614361712709e-05, "grad_norm": 2.6216940879821777, "learning_rate": 2.356930902924968e-07, "loss": 0.5414, "mean_token_accuracy": 0.8301309943199158, "num_tokens": 21168523.0, "step": 557 }, { "epoch": 0.07098333545350465, "ewc_loss": 0.004764167591929436, "ewc_loss_diag": 3.4421682357788086e-06, "ewc_loss_parallel": 1.3156813110981602e-05, "grad_norm": 2.893045663833618, "learning_rate": 2.3611699872827468e-07, "loss": 0.6273, "mean_token_accuracy": 0.8040851354598999, "num_tokens": 21211376.0, "step": 558 }, { "epoch": 0.07111054573209516, "ewc_loss": 0.004792944528162479, "ewc_loss_diag": 3.4421682357788086e-06, "ewc_loss_parallel": 1.3444581782096066e-05, "grad_norm": 2.715564489364624, "learning_rate": 2.3654090716405255e-07, "loss": 0.5613, "mean_token_accuracy": 0.8231539726257324, "num_tokens": 21249162.0, "step": 559 }, { "epoch": 0.07123775601068566, "ewc_loss": 0.004783885553479195, "ewc_loss_diag": 3.4421682357788086e-06, "ewc_loss_parallel": 1.3353994290810078e-05, "grad_norm": 2.8940815925598145, "learning_rate": 2.3696481559983043e-07, "loss": 0.5866, "mean_token_accuracy": 0.8159786462783813, "num_tokens": 21282351.0, "step": 560 }, { "epoch": 0.07136496628927617, "ewc_loss": 0.004816478118300438, "ewc_loss_diag": 3.4570693969726562e-06, "ewc_loss_parallel": 1.3527328519558068e-05, "grad_norm": 2.8536462783813477, "learning_rate": 2.373887240356083e-07, "loss": 0.5782, "mean_token_accuracy": 0.8189290761947632, "num_tokens": 21319508.0, "step": 561 }, { "epoch": 0.07149217656786669, "ewc_loss": 0.00481743598356843, "ewc_loss_diag": 3.4570693969726562e-06, "ewc_loss_parallel": 1.3536909136746544e-05, "grad_norm": 2.704585313796997, "learning_rate": 2.3781263247138617e-07, "loss": 0.543, "mean_token_accuracy": 0.83013916015625, "num_tokens": 21357111.0, "step": 562 }, { "epoch": 0.07161938684645719, "ewc_loss": 0.004803337622433901, "ewc_loss_diag": 3.4570693969726562e-06, "ewc_loss_parallel": 1.3395923815551214e-05, "grad_norm": 2.773585796356201, "learning_rate": 2.3823654090716404e-07, "loss": 0.5442, "mean_token_accuracy": 0.8262442350387573, "num_tokens": 21393222.0, "step": 563 }, { "epoch": 0.0717465971250477, "ewc_loss": 0.004850565455853939, "ewc_loss_diag": 3.4868717193603516e-06, "ewc_loss_parallel": 1.3563025277107954e-05, "grad_norm": 2.6048669815063477, "learning_rate": 2.386604493429419e-07, "loss": 0.5, "mean_token_accuracy": 0.839512825012207, "num_tokens": 21429595.0, "step": 564 }, { "epoch": 0.07187380740363822, "ewc_loss": 0.004842126742005348, "ewc_loss_diag": 3.4868717193603516e-06, "ewc_loss_parallel": 1.3478640539688058e-05, "grad_norm": 2.86651611328125, "learning_rate": 2.390843577787198e-07, "loss": 0.5944, "mean_token_accuracy": 0.8100628852844238, "num_tokens": 21467341.0, "step": 565 }, { "epoch": 0.07200101768222872, "ewc_loss": 0.004873302765190601, "ewc_loss_diag": 3.4868717193603516e-06, "ewc_loss_parallel": 1.3790402590529993e-05, "grad_norm": 2.5896496772766113, "learning_rate": 2.3950826621449766e-07, "loss": 0.4742, "mean_token_accuracy": 0.850327730178833, "num_tokens": 21510927.0, "step": 566 }, { "epoch": 0.07212822796081923, "ewc_loss": 0.004853997379541397, "ewc_loss_diag": 3.4868717193603516e-06, "ewc_loss_parallel": 1.3597345969174057e-05, "grad_norm": 2.6147730350494385, "learning_rate": 2.3993217465027556e-07, "loss": 0.5485, "mean_token_accuracy": 0.8287221193313599, "num_tokens": 21551604.0, "step": 567 }, { "epoch": 0.07225543823940975, "ewc_loss": 0.004854545462876558, "ewc_loss_diag": 3.4868717193603516e-06, "ewc_loss_parallel": 1.3602827493741643e-05, "grad_norm": 2.9562618732452393, "learning_rate": 2.403560830860534e-07, "loss": 0.5983, "mean_token_accuracy": 0.816819429397583, "num_tokens": 21591871.0, "step": 568 }, { "epoch": 0.07238264851800025, "ewc_loss": 0.00489878049120307, "ewc_loss_diag": 3.4868717193603516e-06, "ewc_loss_parallel": 1.4045176612853538e-05, "grad_norm": 2.944499969482422, "learning_rate": 2.4077999152183125e-07, "loss": 0.5967, "mean_token_accuracy": 0.8132229447364807, "num_tokens": 21629810.0, "step": 569 }, { "epoch": 0.07250985879659076, "ewc_loss": 0.0049217212945222855, "ewc_loss_diag": 3.516674041748047e-06, "ewc_loss_parallel": 1.412199799233349e-05, "grad_norm": 2.6526901721954346, "learning_rate": 2.4120389995760915e-07, "loss": 0.495, "mean_token_accuracy": 0.8458235859870911, "num_tokens": 21668706.0, "step": 570 }, { "epoch": 0.07263706907518128, "ewc_loss": 0.004902567248791456, "ewc_loss_diag": 3.5315752029418945e-06, "ewc_loss_parallel": 1.3777870663034264e-05, "grad_norm": 2.8315587043762207, "learning_rate": 2.41627808393387e-07, "loss": 0.5384, "mean_token_accuracy": 0.8281297087669373, "num_tokens": 21712725.0, "step": 571 }, { "epoch": 0.07276427935377179, "ewc_loss": 0.004920443054288626, "ewc_loss_diag": 3.5315752029418945e-06, "ewc_loss_parallel": 1.3956627299194224e-05, "grad_norm": 2.8257312774658203, "learning_rate": 2.420517168291649e-07, "loss": 0.5495, "mean_token_accuracy": 0.8307194709777832, "num_tokens": 21746086.0, "step": 572 }, { "epoch": 0.07289148963236229, "ewc_loss": 0.004944291897118092, "ewc_loss_diag": 3.546476364135742e-06, "ewc_loss_parallel": 1.4042529983271379e-05, "grad_norm": 2.9294650554656982, "learning_rate": 2.4247562526494274e-07, "loss": 0.5091, "mean_token_accuracy": 0.8364750146865845, "num_tokens": 21779573.0, "step": 573 }, { "epoch": 0.0730186999109528, "ewc_loss": 0.004971648566424847, "ewc_loss_diag": 3.56137752532959e-06, "ewc_loss_parallel": 1.4163510059006512e-05, "grad_norm": 2.685328722000122, "learning_rate": 2.4289953370072064e-07, "loss": 0.5809, "mean_token_accuracy": 0.8170228004455566, "num_tokens": 21818766.0, "step": 574 }, { "epoch": 0.07314591018954332, "ewc_loss": 0.004945903085172176, "ewc_loss_diag": 3.56137752532959e-06, "ewc_loss_parallel": 1.390605484630214e-05, "grad_norm": 2.577819347381592, "learning_rate": 2.433234421364985e-07, "loss": 0.5363, "mean_token_accuracy": 0.8342773914337158, "num_tokens": 21855539.0, "step": 575 }, { "epoch": 0.07327312046813382, "ewc_loss": 0.004931433126330376, "ewc_loss_diag": 3.56137752532959e-06, "ewc_loss_parallel": 1.376135060127126e-05, "grad_norm": 2.6554200649261475, "learning_rate": 2.437473505722764e-07, "loss": 0.5036, "mean_token_accuracy": 0.8382765650749207, "num_tokens": 21892567.0, "step": 576 }, { "epoch": 0.07340033074672433, "ewc_loss": 0.004953994881361723, "ewc_loss_diag": 3.56137752532959e-06, "ewc_loss_parallel": 1.3986970770929474e-05, "grad_norm": 2.752890110015869, "learning_rate": 2.4417125900805423e-07, "loss": 0.5212, "mean_token_accuracy": 0.8277429342269897, "num_tokens": 21928607.0, "step": 577 }, { "epoch": 0.07352754102531485, "ewc_loss": 0.004991399589926004, "ewc_loss_diag": 3.5762786865234375e-06, "ewc_loss_parallel": 1.4208430911821779e-05, "grad_norm": 2.9356980323791504, "learning_rate": 2.4459516744383213e-07, "loss": 0.568, "mean_token_accuracy": 0.822330117225647, "num_tokens": 21963781.0, "step": 578 }, { "epoch": 0.07365475130390535, "ewc_loss": 0.005009841173887253, "ewc_loss_diag": 3.5762786865234375e-06, "ewc_loss_parallel": 1.4392845514521468e-05, "grad_norm": 2.692788600921631, "learning_rate": 2.4501907587961e-07, "loss": 0.5282, "mean_token_accuracy": 0.8337565660476685, "num_tokens": 21999140.0, "step": 579 }, { "epoch": 0.07378196158249586, "ewc_loss": 0.004986846819519997, "ewc_loss_diag": 3.5762786865234375e-06, "ewc_loss_parallel": 1.4162900697556324e-05, "grad_norm": 2.547257423400879, "learning_rate": 2.454429843153879e-07, "loss": 0.6268, "mean_token_accuracy": 0.8032773733139038, "num_tokens": 22038703.0, "step": 580 }, { "epoch": 0.07390917186108638, "ewc_loss": 0.004968906287103891, "ewc_loss_diag": 3.5762786865234375e-06, "ewc_loss_parallel": 1.3983495591674e-05, "grad_norm": 2.84704327583313, "learning_rate": 2.458668927511657e-07, "loss": 0.5423, "mean_token_accuracy": 0.8225415945053101, "num_tokens": 22071052.0, "step": 581 }, { "epoch": 0.07403638213967688, "ewc_loss": 0.005026509985327721, "ewc_loss_diag": 3.591179847717285e-06, "ewc_loss_parallel": 1.4406945410883054e-05, "grad_norm": 2.681792736053467, "learning_rate": 2.462908011869436e-07, "loss": 0.507, "mean_token_accuracy": 0.8392808437347412, "num_tokens": 22109629.0, "step": 582 }, { "epoch": 0.0741635924182674, "ewc_loss": 0.005031932145357132, "ewc_loss_diag": 3.606081008911133e-06, "ewc_loss_parallel": 1.4308579920907505e-05, "grad_norm": 2.6442525386810303, "learning_rate": 2.4671470962272147e-07, "loss": 0.5194, "mean_token_accuracy": 0.8371351957321167, "num_tokens": 22151544.0, "step": 583 }, { "epoch": 0.07429080269685791, "ewc_loss": 0.005055790301412344, "ewc_loss_diag": 3.635883331298828e-06, "ewc_loss_parallel": 1.4241986718843691e-05, "grad_norm": 2.8516294956207275, "learning_rate": 2.4713861805849937e-07, "loss": 0.4968, "mean_token_accuracy": 0.840111255645752, "num_tokens": 22189241.0, "step": 584 }, { "epoch": 0.07441801297544842, "ewc_loss": 0.0050874678418040276, "ewc_loss_diag": 3.635883331298828e-06, "ewc_loss_parallel": 1.4558759175997693e-05, "grad_norm": 2.5902557373046875, "learning_rate": 2.475625264942772e-07, "loss": 0.5123, "mean_token_accuracy": 0.8368891477584839, "num_tokens": 22229138.0, "step": 585 }, { "epoch": 0.07454522325403892, "ewc_loss": 0.005059720017015934, "ewc_loss_diag": 3.635883331298828e-06, "ewc_loss_parallel": 1.4281280527939089e-05, "grad_norm": 2.703521251678467, "learning_rate": 2.479864349300551e-07, "loss": 0.5501, "mean_token_accuracy": 0.8281601667404175, "num_tokens": 22264930.0, "step": 586 }, { "epoch": 0.07467243353262944, "ewc_loss": 0.0050743501633405685, "ewc_loss_diag": 3.635883331298828e-06, "ewc_loss_parallel": 1.442758093617158e-05, "grad_norm": 2.7677619457244873, "learning_rate": 2.4841034336583296e-07, "loss": 0.4946, "mean_token_accuracy": 0.8377065658569336, "num_tokens": 22297162.0, "step": 587 }, { "epoch": 0.07479964381121995, "ewc_loss": 0.005091412924230099, "ewc_loss_diag": 3.635883331298828e-06, "ewc_loss_parallel": 1.4598208508687094e-05, "grad_norm": 2.645443916320801, "learning_rate": 2.488342518016108e-07, "loss": 0.5749, "mean_token_accuracy": 0.815924346446991, "num_tokens": 22334779.0, "step": 588 }, { "epoch": 0.07492685408981045, "ewc_loss": 0.005078314337879419, "ewc_loss_diag": 3.635883331298828e-06, "ewc_loss_parallel": 1.4467225810221862e-05, "grad_norm": 2.6998324394226074, "learning_rate": 2.492581602373887e-07, "loss": 0.5344, "mean_token_accuracy": 0.8325545787811279, "num_tokens": 22372820.0, "step": 589 }, { "epoch": 0.07505406436840097, "ewc_loss": 0.005115818697959185, "ewc_loss_diag": 3.6656856536865234e-06, "ewc_loss_parallel": 1.4537092283717357e-05, "grad_norm": 2.664719820022583, "learning_rate": 2.4968206867316655e-07, "loss": 0.5541, "mean_token_accuracy": 0.8250399231910706, "num_tokens": 22417841.0, "step": 590 }, { "epoch": 0.07518127464699148, "ewc_loss": 0.005102017428725958, "ewc_loss_diag": 3.6507844924926758e-06, "ewc_loss_parallel": 1.4551668755302671e-05, "grad_norm": 2.770956039428711, "learning_rate": 2.5010597710894445e-07, "loss": 0.4694, "mean_token_accuracy": 0.8514500260353088, "num_tokens": 22449133.0, "step": 591 }, { "epoch": 0.07530848492558198, "ewc_loss": 0.005133615806698799, "ewc_loss_diag": 3.6656856536865234e-06, "ewc_loss_parallel": 1.471506584493909e-05, "grad_norm": 2.7182579040527344, "learning_rate": 2.505298855447223e-07, "loss": 0.5341, "mean_token_accuracy": 0.8324072957038879, "num_tokens": 22491145.0, "step": 592 }, { "epoch": 0.0754356952041725, "ewc_loss": 0.0051277391612529755, "ewc_loss_diag": 3.6656856536865234e-06, "ewc_loss_parallel": 1.4656300663773436e-05, "grad_norm": 2.611748456954956, "learning_rate": 2.509537939805002e-07, "loss": 0.5463, "mean_token_accuracy": 0.8296725153923035, "num_tokens": 22531738.0, "step": 593 }, { "epoch": 0.07556290548276301, "ewc_loss": 0.005148932337760925, "ewc_loss_diag": 3.6954879760742188e-06, "ewc_loss_parallel": 1.456305653846357e-05, "grad_norm": 4.888679027557373, "learning_rate": 2.513777024162781e-07, "loss": 0.5256, "mean_token_accuracy": 0.8363882899284363, "num_tokens": 22563005.0, "step": 594 }, { "epoch": 0.07569011576135352, "ewc_loss": 0.005323400720953941, "ewc_loss_diag": 3.6656856536865234e-06, "ewc_loss_parallel": 1.6612912077107467e-05, "grad_norm": 2.6713085174560547, "learning_rate": 2.5180161085205594e-07, "loss": 0.5082, "mean_token_accuracy": 0.8401010036468506, "num_tokens": 22601333.0, "step": 595 }, { "epoch": 0.07581732603994402, "ewc_loss": 0.005158159416168928, "ewc_loss_diag": 3.6656856536865234e-06, "ewc_loss_parallel": 1.4960502085159533e-05, "grad_norm": 2.552212953567505, "learning_rate": 2.522255192878338e-07, "loss": 0.5591, "mean_token_accuracy": 0.8229911923408508, "num_tokens": 22639528.0, "step": 596 }, { "epoch": 0.07594453631853454, "ewc_loss": 0.005130899138748646, "ewc_loss_diag": 3.6954879760742188e-06, "ewc_loss_parallel": 1.4382719200511929e-05, "grad_norm": 2.6337220668792725, "learning_rate": 2.526494277236117e-07, "loss": 0.5423, "mean_token_accuracy": 0.8290376663208008, "num_tokens": 22675290.0, "step": 597 }, { "epoch": 0.07607174659712505, "ewc_loss": 0.005166454240679741, "ewc_loss_diag": 3.6954879760742188e-06, "ewc_loss_parallel": 1.473827342124423e-05, "grad_norm": 2.7496635913848877, "learning_rate": 2.530733361593896e-07, "loss": 0.5186, "mean_token_accuracy": 0.8343245983123779, "num_tokens": 22712823.0, "step": 598 }, { "epoch": 0.07619895687571555, "ewc_loss": 0.005205918103456497, "ewc_loss_diag": 3.7103891372680664e-06, "ewc_loss_parallel": 1.4980325431679375e-05, "grad_norm": 2.7020206451416016, "learning_rate": 2.5349724459516743e-07, "loss": 0.5411, "mean_token_accuracy": 0.829877495765686, "num_tokens": 22750916.0, "step": 599 }, { "epoch": 0.07632616715430607, "ewc_loss": 0.005194214638322592, "ewc_loss_diag": 3.7103891372680664e-06, "ewc_loss_parallel": 1.486328892497113e-05, "grad_norm": 2.8349246978759766, "learning_rate": 2.539211530309453e-07, "loss": 0.4921, "mean_token_accuracy": 0.8412375450134277, "num_tokens": 22782537.0, "step": 600 }, { "epoch": 0.07645337743289658, "ewc_loss": 0.005208122543990612, "ewc_loss_diag": 3.7103891372680664e-06, "ewc_loss_parallel": 1.5002368854766246e-05, "grad_norm": 2.9303090572357178, "learning_rate": 2.543450614667232e-07, "loss": 0.5821, "mean_token_accuracy": 0.818371057510376, "num_tokens": 22814805.0, "step": 601 }, { "epoch": 0.07658058771148708, "ewc_loss": 0.005264306906610727, "ewc_loss_diag": 3.7550926208496094e-06, "ewc_loss_parallel": 1.5106449609447736e-05, "grad_norm": 2.7696309089660645, "learning_rate": 2.547689699025011e-07, "loss": 0.5813, "mean_token_accuracy": 0.8165832757949829, "num_tokens": 22852654.0, "step": 602 }, { "epoch": 0.0767077979900776, "ewc_loss": 0.005242356099188328, "ewc_loss_diag": 3.7550926208496094e-06, "ewc_loss_parallel": 1.4886939425196033e-05, "grad_norm": 2.6108529567718506, "learning_rate": 2.551928783382789e-07, "loss": 0.5345, "mean_token_accuracy": 0.8285019397735596, "num_tokens": 22894205.0, "step": 603 }, { "epoch": 0.07683500826866811, "ewc_loss": 0.005238103214651346, "ewc_loss_diag": 3.769993782043457e-06, "ewc_loss_parallel": 1.4691823707835283e-05, "grad_norm": 2.735907793045044, "learning_rate": 2.5561678677405677e-07, "loss": 0.5014, "mean_token_accuracy": 0.838787317276001, "num_tokens": 22924095.0, "step": 604 }, { "epoch": 0.07696221854725861, "ewc_loss": 0.005265445914119482, "ewc_loss_diag": 3.769993782043457e-06, "ewc_loss_parallel": 1.4965249647502787e-05, "grad_norm": 2.5322139263153076, "learning_rate": 2.5604069520983467e-07, "loss": 0.5831, "mean_token_accuracy": 0.8141430616378784, "num_tokens": 22960857.0, "step": 605 }, { "epoch": 0.07708942882584913, "ewc_loss": 0.005279993638396263, "ewc_loss_diag": 3.7997961044311523e-06, "ewc_loss_parallel": 1.4805549653829075e-05, "grad_norm": 2.6850781440734863, "learning_rate": 2.564646036456125e-07, "loss": 0.5372, "mean_token_accuracy": 0.8276073932647705, "num_tokens": 23004670.0, "step": 606 }, { "epoch": 0.07721663910443964, "ewc_loss": 0.005303895100951195, "ewc_loss_diag": 3.7997961044311523e-06, "ewc_loss_parallel": 1.5044566680444404e-05, "grad_norm": 2.7681331634521484, "learning_rate": 2.568885120813904e-07, "loss": 0.5358, "mean_token_accuracy": 0.8286987543106079, "num_tokens": 23041999.0, "step": 607 }, { "epoch": 0.07734384938303016, "ewc_loss": 0.005318734794855118, "ewc_loss_diag": 3.7997961044311523e-06, "ewc_loss_parallel": 1.519296256446978e-05, "grad_norm": 2.6015608310699463, "learning_rate": 2.5731242051716826e-07, "loss": 0.5248, "mean_token_accuracy": 0.8355540633201599, "num_tokens": 23080704.0, "step": 608 }, { "epoch": 0.07747105966162066, "ewc_loss": 0.005348715465515852, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 1.5035006072139367e-05, "grad_norm": 2.6712827682495117, "learning_rate": 2.5773632895294616e-07, "loss": 0.5599, "mean_token_accuracy": 0.8250390887260437, "num_tokens": 23118662.0, "step": 609 }, { "epoch": 0.07759826994021117, "ewc_loss": 0.005359752103686333, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 1.514537234470481e-05, "grad_norm": 2.6803066730499268, "learning_rate": 2.58160237388724e-07, "loss": 0.4824, "mean_token_accuracy": 0.8434030413627625, "num_tokens": 23150367.0, "step": 610 }, { "epoch": 0.07772548021880168, "ewc_loss": 0.005368117708712816, "ewc_loss_diag": 3.844499588012695e-06, "ewc_loss_parallel": 1.5229027667373884e-05, "grad_norm": 2.6431503295898438, "learning_rate": 2.585841458245019e-07, "loss": 0.5406, "mean_token_accuracy": 0.8280950784683228, "num_tokens": 23185836.0, "step": 611 }, { "epoch": 0.07785269049739219, "ewc_loss": 0.005397531669586897, "ewc_loss_diag": 3.874301910400391e-06, "ewc_loss_parallel": 1.5217991858662572e-05, "grad_norm": 2.6211116313934326, "learning_rate": 2.5900805426027975e-07, "loss": 0.4827, "mean_token_accuracy": 0.8459687829017639, "num_tokens": 23219346.0, "step": 612 }, { "epoch": 0.0779799007759827, "ewc_loss": 0.00540164252743125, "ewc_loss_diag": 3.874301910400391e-06, "ewc_loss_parallel": 1.525910192867741e-05, "grad_norm": 2.5876760482788086, "learning_rate": 2.5943196269605765e-07, "loss": 0.5527, "mean_token_accuracy": 0.8309828042984009, "num_tokens": 23257569.0, "step": 613 }, { "epoch": 0.07810711105457321, "ewc_loss": 0.005402434151619673, "ewc_loss_diag": 3.874301910400391e-06, "ewc_loss_parallel": 1.526701817056164e-05, "grad_norm": 2.3733298778533936, "learning_rate": 2.598558711318355e-07, "loss": 0.5444, "mean_token_accuracy": 0.8303651809692383, "num_tokens": 23302944.0, "step": 614 }, { "epoch": 0.07823432133316371, "ewc_loss": 0.005413796752691269, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 1.5075469491421245e-05, "grad_norm": 2.5878467559814453, "learning_rate": 2.602797795676134e-07, "loss": 0.5508, "mean_token_accuracy": 0.8267035484313965, "num_tokens": 23341660.0, "step": 615 }, { "epoch": 0.07836153161175423, "ewc_loss": 0.005456399172544479, "ewc_loss_diag": 3.904104232788086e-06, "ewc_loss_parallel": 1.5501491361646913e-05, "grad_norm": 2.733278512954712, "learning_rate": 2.6070368800339124e-07, "loss": 0.5462, "mean_token_accuracy": 0.822334885597229, "num_tokens": 23375956.0, "step": 616 }, { "epoch": 0.07848874189034474, "ewc_loss": 0.005513847339898348, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 1.5770798199810088e-05, "grad_norm": 2.632556676864624, "learning_rate": 2.6112759643916914e-07, "loss": 0.4995, "mean_token_accuracy": 0.839727520942688, "num_tokens": 23407568.0, "step": 617 }, { "epoch": 0.07861595216893524, "ewc_loss": 0.005498790182173252, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 1.5620227713952772e-05, "grad_norm": 2.7559027671813965, "learning_rate": 2.61551504874947e-07, "loss": 0.5306, "mean_token_accuracy": 0.8322747945785522, "num_tokens": 23440261.0, "step": 618 }, { "epoch": 0.07874316244752576, "ewc_loss": 0.0055144778452813625, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 1.5777102817082778e-05, "grad_norm": 2.6015427112579346, "learning_rate": 2.619754133107249e-07, "loss": 0.5146, "mean_token_accuracy": 0.8385897278785706, "num_tokens": 23481706.0, "step": 619 }, { "epoch": 0.07887037272611627, "ewc_loss": 0.005502990446984768, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 1.5662226360291243e-05, "grad_norm": 2.7874629497528076, "learning_rate": 2.623993217465028e-07, "loss": 0.574, "mean_token_accuracy": 0.815879225730896, "num_tokens": 23515410.0, "step": 620 }, { "epoch": 0.07899758300470679, "ewc_loss": 0.005528130568563938, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 1.5913627066765912e-05, "grad_norm": 2.577106475830078, "learning_rate": 2.6282323018228063e-07, "loss": 0.5576, "mean_token_accuracy": 0.8275436162948608, "num_tokens": 23554627.0, "step": 621 }, { "epoch": 0.07912479328329729, "ewc_loss": 0.005504420958459377, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 1.567653271195013e-05, "grad_norm": 2.652231454849243, "learning_rate": 2.632471386180585e-07, "loss": 0.5814, "mean_token_accuracy": 0.8164174556732178, "num_tokens": 23593400.0, "step": 622 }, { "epoch": 0.0792520035618878, "ewc_loss": 0.005516168661415577, "ewc_loss_diag": 3.933906555175781e-06, "ewc_loss_parallel": 1.5794008504599333e-05, "grad_norm": 2.7547359466552734, "learning_rate": 2.6367104705383637e-07, "loss": 0.5785, "mean_token_accuracy": 0.8164645433425903, "num_tokens": 23628242.0, "step": 623 }, { "epoch": 0.07937921384047832, "ewc_loss": 0.005566700827330351, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 1.599415554665029e-05, "grad_norm": 2.5231447219848633, "learning_rate": 2.6409495548961427e-07, "loss": 0.5652, "mean_token_accuracy": 0.8250038623809814, "num_tokens": 23671041.0, "step": 624 }, { "epoch": 0.07950642411906882, "ewc_loss": 0.005539611913263798, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 1.5723268006695434e-05, "grad_norm": 2.760195732116699, "learning_rate": 2.6451886392539206e-07, "loss": 0.542, "mean_token_accuracy": 0.8319783210754395, "num_tokens": 23701904.0, "step": 625 }, { "epoch": 0.07963363439765933, "ewc_loss": 0.005572570953518152, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 1.6052857972681522e-05, "grad_norm": 2.6312620639801025, "learning_rate": 2.6494277236116996e-07, "loss": 0.5072, "mean_token_accuracy": 0.8383849263191223, "num_tokens": 23742374.0, "step": 626 }, { "epoch": 0.07976084467624985, "ewc_loss": 0.005561502650380135, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 1.5942176105454564e-05, "grad_norm": 2.7027251720428467, "learning_rate": 2.6536668079694786e-07, "loss": 0.6119, "mean_token_accuracy": 0.809138298034668, "num_tokens": 23780216.0, "step": 627 }, { "epoch": 0.07988805495484035, "ewc_loss": 0.00556921074166894, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 1.601925578142982e-05, "grad_norm": 2.846308708190918, "learning_rate": 2.6579058923272576e-07, "loss": 0.4669, "mean_token_accuracy": 0.8524499535560608, "num_tokens": 23813641.0, "step": 628 }, { "epoch": 0.08001526523343086, "ewc_loss": 0.00559714762493968, "ewc_loss_diag": 3.9637088775634766e-06, "ewc_loss_parallel": 1.6298623449984007e-05, "grad_norm": 2.683004856109619, "learning_rate": 2.6621449766850356e-07, "loss": 0.5511, "mean_token_accuracy": 0.8269011378288269, "num_tokens": 23850189.0, "step": 629 }, { "epoch": 0.08014247551202137, "ewc_loss": 0.005606731865555048, "ewc_loss_diag": 3.993511199951172e-06, "ewc_loss_parallel": 1.6089290511445142e-05, "grad_norm": 2.5862276554107666, "learning_rate": 2.6663840610428145e-07, "loss": 0.5065, "mean_token_accuracy": 0.8383074402809143, "num_tokens": 23884410.0, "step": 630 }, { "epoch": 0.08026968579061187, "ewc_loss": 0.00559036061167717, "ewc_loss_diag": 3.993511199951172e-06, "ewc_loss_parallel": 1.5925579646136612e-05, "grad_norm": 2.53692364692688, "learning_rate": 2.6706231454005935e-07, "loss": 0.5149, "mean_token_accuracy": 0.8367977738380432, "num_tokens": 23930139.0, "step": 631 }, { "epoch": 0.08039689606920239, "ewc_loss": 0.00561746209859848, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.5891417206148617e-05, "grad_norm": 2.4532229900360107, "learning_rate": 2.6748622297583725e-07, "loss": 0.5141, "mean_token_accuracy": 0.8371856212615967, "num_tokens": 23973124.0, "step": 632 }, { "epoch": 0.0805241063477929, "ewc_loss": 0.005614264402538538, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.5859441191423684e-05, "grad_norm": 2.558206558227539, "learning_rate": 2.6791013141161505e-07, "loss": 0.522, "mean_token_accuracy": 0.8367107510566711, "num_tokens": 24008676.0, "step": 633 }, { "epoch": 0.08065131662638342, "ewc_loss": 0.00563998706638813, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.611666812095791e-05, "grad_norm": 2.78346586227417, "learning_rate": 2.6833403984739294e-07, "loss": 0.5222, "mean_token_accuracy": 0.8343185782432556, "num_tokens": 24044880.0, "step": 634 }, { "epoch": 0.08077852690497392, "ewc_loss": 0.005676478147506714, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.6481575585203245e-05, "grad_norm": 2.452594757080078, "learning_rate": 2.6875794828317084e-07, "loss": 0.5122, "mean_token_accuracy": 0.8412503004074097, "num_tokens": 24090140.0, "step": 635 }, { "epoch": 0.08090573718356443, "ewc_loss": 0.005628817714750767, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.6004971257643774e-05, "grad_norm": 2.7385854721069336, "learning_rate": 2.6918185671894874e-07, "loss": 0.5949, "mean_token_accuracy": 0.8160176873207092, "num_tokens": 24123311.0, "step": 636 }, { "epoch": 0.08103294746215495, "ewc_loss": 0.005673862062394619, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.6455418517580256e-05, "grad_norm": 2.612761974334717, "learning_rate": 2.6960576515472654e-07, "loss": 0.6288, "mean_token_accuracy": 0.802363395690918, "num_tokens": 24164019.0, "step": 637 }, { "epoch": 0.08116015774074545, "ewc_loss": 0.005665385629981756, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.6370651792385615e-05, "grad_norm": 2.5188255310058594, "learning_rate": 2.7002967359050443e-07, "loss": 0.4964, "mean_token_accuracy": 0.8426387310028076, "num_tokens": 24205986.0, "step": 638 }, { "epoch": 0.08128736801933596, "ewc_loss": 0.005645963363349438, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.617643101781141e-05, "grad_norm": 2.45569109916687, "learning_rate": 2.7045358202628233e-07, "loss": 0.553, "mean_token_accuracy": 0.8227502107620239, "num_tokens": 24251956.0, "step": 639 }, { "epoch": 0.08141457829792648, "ewc_loss": 0.005645499564707279, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.6171794413821772e-05, "grad_norm": 2.5378239154815674, "learning_rate": 2.7087749046206023e-07, "loss": 0.5008, "mean_token_accuracy": 0.8387399911880493, "num_tokens": 24284990.0, "step": 640 }, { "epoch": 0.08154178857651698, "ewc_loss": 0.005668954458087683, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.6406340364483185e-05, "grad_norm": 2.6293883323669434, "learning_rate": 2.71301398897838e-07, "loss": 0.5565, "mean_token_accuracy": 0.824049711227417, "num_tokens": 24323305.0, "step": 641 }, { "epoch": 0.08166899885510749, "ewc_loss": 0.005683070048689842, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.654749758017715e-05, "grad_norm": 2.4457314014434814, "learning_rate": 2.717253073336159e-07, "loss": 0.4687, "mean_token_accuracy": 0.8485608696937561, "num_tokens": 24365847.0, "step": 642 }, { "epoch": 0.081796209133698, "ewc_loss": 0.0056619844399392605, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.633664214750752e-05, "grad_norm": 2.488736629486084, "learning_rate": 2.721492157693938e-07, "loss": 0.4748, "mean_token_accuracy": 0.8474642038345337, "num_tokens": 24407269.0, "step": 643 }, { "epoch": 0.0819234194122885, "ewc_loss": 0.005672031547874212, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.6437114027212374e-05, "grad_norm": 2.5543322563171387, "learning_rate": 2.7257312420517167e-07, "loss": 0.5307, "mean_token_accuracy": 0.8316729664802551, "num_tokens": 24444990.0, "step": 644 }, { "epoch": 0.08205062969087902, "ewc_loss": 0.005688471719622612, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.660151428950485e-05, "grad_norm": 2.50510573387146, "learning_rate": 2.729970326409495e-07, "loss": 0.56, "mean_token_accuracy": 0.8257230520248413, "num_tokens": 24487293.0, "step": 645 }, { "epoch": 0.08217783996946953, "ewc_loss": 0.00568226957693696, "ewc_loss_diag": 4.023313522338867e-06, "ewc_loss_parallel": 1.6539492207812145e-05, "grad_norm": 2.6508514881134033, "learning_rate": 2.734209410767274e-07, "loss": 0.5226, "mean_token_accuracy": 0.8346421122550964, "num_tokens": 24521403.0, "step": 646 }, { "epoch": 0.08230505024806005, "ewc_loss": 0.0057431841269135475, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 1.684346170804929e-05, "grad_norm": 2.5624871253967285, "learning_rate": 2.738448495125053e-07, "loss": 0.5333, "mean_token_accuracy": 0.8337758183479309, "num_tokens": 24562740.0, "step": 647 }, { "epoch": 0.08243226052665055, "ewc_loss": 0.005730268079787493, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 1.671430072747171e-05, "grad_norm": 2.677097797393799, "learning_rate": 2.7426875794828316e-07, "loss": 0.5397, "mean_token_accuracy": 0.8260064125061035, "num_tokens": 24597507.0, "step": 648 }, { "epoch": 0.08255947080524106, "ewc_loss": 0.0057470290921628475, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 1.6881913325050846e-05, "grad_norm": 2.62334942817688, "learning_rate": 2.74692666384061e-07, "loss": 0.5016, "mean_token_accuracy": 0.8377602696418762, "num_tokens": 24635629.0, "step": 649 }, { "epoch": 0.08268668108383158, "ewc_loss": 0.005744411610066891, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 1.6855734429555014e-05, "grad_norm": 2.518331289291382, "learning_rate": 2.751165748198389e-07, "loss": 0.5276, "mean_token_accuracy": 0.83098304271698, "num_tokens": 24676087.0, "step": 650 }, { "epoch": 0.08281389136242208, "ewc_loss": 0.005731450393795967, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 1.6726124158594757e-05, "grad_norm": 2.722742795944214, "learning_rate": 2.755404832556168e-07, "loss": 0.5441, "mean_token_accuracy": 0.8274711966514587, "num_tokens": 24713652.0, "step": 651 }, { "epoch": 0.0829411016410126, "ewc_loss": 0.00576750747859478, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 1.7086696971091442e-05, "grad_norm": 2.603286027908325, "learning_rate": 2.7596439169139465e-07, "loss": 0.5272, "mean_token_accuracy": 0.834417998790741, "num_tokens": 24750130.0, "step": 652 }, { "epoch": 0.08306831191960311, "ewc_loss": 0.005757593084126711, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 1.698755113466177e-05, "grad_norm": 2.6041691303253174, "learning_rate": 2.763883001271725e-07, "loss": 0.5765, "mean_token_accuracy": 0.8182723522186279, "num_tokens": 24787337.0, "step": 653 }, { "epoch": 0.08319552219819361, "ewc_loss": 0.0057524689473211765, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 1.6936310203163885e-05, "grad_norm": 2.7114417552948, "learning_rate": 2.768122085629504e-07, "loss": 0.5689, "mean_token_accuracy": 0.813666820526123, "num_tokens": 24822736.0, "step": 654 }, { "epoch": 0.08332273247678412, "ewc_loss": 0.00577936414629221, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 1.7205262338393368e-05, "grad_norm": 2.6408400535583496, "learning_rate": 2.772361169987283e-07, "loss": 0.5116, "mean_token_accuracy": 0.8405423760414124, "num_tokens": 24858743.0, "step": 655 }, { "epoch": 0.08344994275537464, "ewc_loss": 0.00576791213825345, "ewc_loss_diag": 4.0531158447265625e-06, "ewc_loss_parallel": 1.7090742403524928e-05, "grad_norm": 2.5235605239868164, "learning_rate": 2.7766002543450614e-07, "loss": 0.5194, "mean_token_accuracy": 0.8327023983001709, "num_tokens": 24901286.0, "step": 656 }, { "epoch": 0.08357715303396514, "ewc_loss": 0.0057815732434391975, "ewc_loss_diag": 4.082918167114258e-06, "ewc_loss_parallel": 1.692218029347714e-05, "grad_norm": 2.5629940032958984, "learning_rate": 2.78083933870284e-07, "loss": 0.587, "mean_token_accuracy": 0.8165576457977295, "num_tokens": 24943457.0, "step": 657 }, { "epoch": 0.08370436331255565, "ewc_loss": 0.005824710242450237, "ewc_loss_diag": 4.112720489501953e-06, "ewc_loss_parallel": 1.7048374502337538e-05, "grad_norm": 2.5855836868286133, "learning_rate": 2.785078423060619e-07, "loss": 0.5275, "mean_token_accuracy": 0.8300952315330505, "num_tokens": 24979247.0, "step": 658 }, { "epoch": 0.08383157359114617, "ewc_loss": 0.00583404116332531, "ewc_loss_diag": 4.112720489501953e-06, "ewc_loss_parallel": 1.714168320177123e-05, "grad_norm": 2.5250086784362793, "learning_rate": 2.789317507418398e-07, "loss": 0.5408, "mean_token_accuracy": 0.8281879425048828, "num_tokens": 25017456.0, "step": 659 }, { "epoch": 0.08395878386973668, "ewc_loss": 0.0058270529843866825, "ewc_loss_diag": 4.112720489501953e-06, "ewc_loss_parallel": 1.70717994478764e-05, "grad_norm": 2.57096266746521, "learning_rate": 2.7935565917761763e-07, "loss": 0.4972, "mean_token_accuracy": 0.8392358422279358, "num_tokens": 25054682.0, "step": 660 }, { "epoch": 0.08408599414832718, "ewc_loss": 0.005871735513210297, "ewc_loss_diag": 4.1425228118896484e-06, "ewc_loss_parallel": 1.721345142868813e-05, "grad_norm": 2.776066303253174, "learning_rate": 2.797795676133955e-07, "loss": 0.5693, "mean_token_accuracy": 0.8180745244026184, "num_tokens": 25092501.0, "step": 661 }, { "epoch": 0.0842132044269177, "ewc_loss": 0.005909896455705166, "ewc_loss_diag": 4.1425228118896484e-06, "ewc_loss_parallel": 1.7595060853636824e-05, "grad_norm": 2.582703113555908, "learning_rate": 2.802034760491734e-07, "loss": 0.4401, "mean_token_accuracy": 0.8589496612548828, "num_tokens": 25129253.0, "step": 662 }, { "epoch": 0.08434041470550821, "ewc_loss": 0.005907990038394928, "ewc_loss_diag": 4.172325134277344e-06, "ewc_loss_parallel": 1.7270816897507757e-05, "grad_norm": 2.3998453617095947, "learning_rate": 2.806273844849512e-07, "loss": 0.4783, "mean_token_accuracy": 0.8475316762924194, "num_tokens": 25169119.0, "step": 663 }, { "epoch": 0.08446762498409871, "ewc_loss": 0.005911884363740683, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 1.7004585970425978e-05, "grad_norm": 2.5959229469299316, "learning_rate": 2.810512929207291e-07, "loss": 0.5483, "mean_token_accuracy": 0.821247398853302, "num_tokens": 25207654.0, "step": 664 }, { "epoch": 0.08459483526268922, "ewc_loss": 0.0059600574895739555, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 1.7486314391135238e-05, "grad_norm": 2.5483691692352295, "learning_rate": 2.8147520135650697e-07, "loss": 0.4946, "mean_token_accuracy": 0.8387889862060547, "num_tokens": 25243258.0, "step": 665 }, { "epoch": 0.08472204554127974, "ewc_loss": 0.00595458410680294, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 1.7431582818971947e-05, "grad_norm": 2.480545997619629, "learning_rate": 2.8189910979228487e-07, "loss": 0.5335, "mean_token_accuracy": 0.8320662975311279, "num_tokens": 25281301.0, "step": 666 }, { "epoch": 0.08484925581987024, "ewc_loss": 0.005943442694842815, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 1.7320169718004763e-05, "grad_norm": 2.650413990020752, "learning_rate": 2.823230182280627e-07, "loss": 0.4866, "mean_token_accuracy": 0.8459231853485107, "num_tokens": 25313500.0, "step": 667 }, { "epoch": 0.08497646609846075, "ewc_loss": 0.005979639478027821, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 1.7682137695373967e-05, "grad_norm": 2.518671751022339, "learning_rate": 2.827469266638406e-07, "loss": 0.5216, "mean_token_accuracy": 0.8339242935180664, "num_tokens": 25355121.0, "step": 668 }, { "epoch": 0.08510367637705127, "ewc_loss": 0.005961972288787365, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 1.7505462892586365e-05, "grad_norm": 2.4874114990234375, "learning_rate": 2.8317083509961846e-07, "loss": 0.5376, "mean_token_accuracy": 0.8300864696502686, "num_tokens": 25398868.0, "step": 669 }, { "epoch": 0.08523088665564178, "ewc_loss": 0.005960946902632713, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 1.7495209249318577e-05, "grad_norm": 2.4844939708709717, "learning_rate": 2.8359474353539636e-07, "loss": 0.5167, "mean_token_accuracy": 0.8344746828079224, "num_tokens": 25438670.0, "step": 670 }, { "epoch": 0.08535809693423228, "ewc_loss": 0.005970980040729046, "ewc_loss_diag": 4.202127456665039e-06, "ewc_loss_parallel": 1.7595544704818167e-05, "grad_norm": 2.6187009811401367, "learning_rate": 2.840186519711742e-07, "loss": 0.5626, "mean_token_accuracy": 0.8255400657653809, "num_tokens": 25479243.0, "step": 671 }, { "epoch": 0.0854853072128228, "ewc_loss": 0.0060286978259682655, "ewc_loss_diag": 4.231929779052734e-06, "ewc_loss_parallel": 1.7867541828309186e-05, "grad_norm": 2.6827545166015625, "learning_rate": 2.844425604069521e-07, "loss": 0.5475, "mean_token_accuracy": 0.8252048492431641, "num_tokens": 25514615.0, "step": 672 }, { "epoch": 0.08561251749141331, "ewc_loss": 0.006044439040124416, "ewc_loss_diag": 4.231929779052734e-06, "ewc_loss_parallel": 1.8024958990281448e-05, "grad_norm": 2.7114756107330322, "learning_rate": 2.8486646884272995e-07, "loss": 0.5238, "mean_token_accuracy": 0.8328584432601929, "num_tokens": 25552753.0, "step": 673 }, { "epoch": 0.08573972777000381, "ewc_loss": 0.006039747036993504, "ewc_loss_diag": 4.231929779052734e-06, "ewc_loss_parallel": 1.7978038158616982e-05, "grad_norm": 2.6328959465026855, "learning_rate": 2.8529037727850785e-07, "loss": 0.4622, "mean_token_accuracy": 0.8495683670043945, "num_tokens": 25587415.0, "step": 674 }, { "epoch": 0.08586693804859433, "ewc_loss": 0.006056895013898611, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 1.7844342437456362e-05, "grad_norm": 2.5061275959014893, "learning_rate": 2.857142857142857e-07, "loss": 0.4907, "mean_token_accuracy": 0.845600426197052, "num_tokens": 25625962.0, "step": 675 }, { "epoch": 0.08599414832718484, "ewc_loss": 0.006039445288479328, "ewc_loss_diag": 4.26173210144043e-06, "ewc_loss_parallel": 1.7669844964984804e-05, "grad_norm": 2.660098075866699, "learning_rate": 2.861381941500636e-07, "loss": 0.5391, "mean_token_accuracy": 0.8287765383720398, "num_tokens": 25669680.0, "step": 676 }, { "epoch": 0.08612135860577534, "ewc_loss": 0.006132352165877819, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 1.7988561012316495e-05, "grad_norm": 2.601210117340088, "learning_rate": 2.8656210258584144e-07, "loss": 0.5732, "mean_token_accuracy": 0.8151777982711792, "num_tokens": 25709221.0, "step": 677 }, { "epoch": 0.08624856888436586, "ewc_loss": 0.0061229439452290535, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 1.7894479242386296e-05, "grad_norm": 2.6961233615875244, "learning_rate": 2.869860110216193e-07, "loss": 0.5749, "mean_token_accuracy": 0.8152078986167908, "num_tokens": 25741880.0, "step": 678 }, { "epoch": 0.08637577916295637, "ewc_loss": 0.006135952193289995, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 1.8024562450591475e-05, "grad_norm": 2.4629461765289307, "learning_rate": 2.874099194573972e-07, "loss": 0.5083, "mean_token_accuracy": 0.8381617665290833, "num_tokens": 25786282.0, "step": 679 }, { "epoch": 0.08650298944154687, "ewc_loss": 0.006103919818997383, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 1.770423841662705e-05, "grad_norm": 2.678689479827881, "learning_rate": 2.878338278931751e-07, "loss": 0.5532, "mean_token_accuracy": 0.8264942765235901, "num_tokens": 25820229.0, "step": 680 }, { "epoch": 0.08663019972013739, "ewc_loss": 0.006141891703009605, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 1.808395427360665e-05, "grad_norm": 2.56451416015625, "learning_rate": 2.8825773632895293e-07, "loss": 0.49, "mean_token_accuracy": 0.8474065065383911, "num_tokens": 25856465.0, "step": 681 }, { "epoch": 0.0867574099987279, "ewc_loss": 0.006127429660409689, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 1.793933734006714e-05, "grad_norm": 2.4538769721984863, "learning_rate": 2.886816447647308e-07, "loss": 0.487, "mean_token_accuracy": 0.8440670967102051, "num_tokens": 25897407.0, "step": 682 }, { "epoch": 0.08688462027731841, "ewc_loss": 0.006110763642936945, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 1.777267607394606e-05, "grad_norm": 2.525470733642578, "learning_rate": 2.891055532005087e-07, "loss": 0.5113, "mean_token_accuracy": 0.8369737267494202, "num_tokens": 25938825.0, "step": 683 }, { "epoch": 0.08701183055590891, "ewc_loss": 0.006132833659648895, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 1.7993374058278278e-05, "grad_norm": 2.745255947113037, "learning_rate": 2.8952946163628657e-07, "loss": 0.5409, "mean_token_accuracy": 0.8254595994949341, "num_tokens": 25974944.0, "step": 684 }, { "epoch": 0.08713904083449943, "ewc_loss": 0.006177718285471201, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 1.844222060753964e-05, "grad_norm": 2.591054677963257, "learning_rate": 2.899533700720644e-07, "loss": 0.5001, "mean_token_accuracy": 0.8388562202453613, "num_tokens": 26012864.0, "step": 685 }, { "epoch": 0.08726625111308994, "ewc_loss": 0.006145179737359285, "ewc_loss_diag": 4.32133674621582e-06, "ewc_loss_parallel": 1.811683614505455e-05, "grad_norm": 2.6052756309509277, "learning_rate": 2.9037727850784227e-07, "loss": 0.5452, "mean_token_accuracy": 0.826443076133728, "num_tokens": 26049779.0, "step": 686 }, { "epoch": 0.08739346139168044, "ewc_loss": 0.0062029543332755566, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 1.808423075999599e-05, "grad_norm": 2.4958224296569824, "learning_rate": 2.9080118694362016e-07, "loss": 0.5166, "mean_token_accuracy": 0.8361297249794006, "num_tokens": 26089095.0, "step": 687 }, { "epoch": 0.08752067167027096, "ewc_loss": 0.006188635714352131, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 1.794104537111707e-05, "grad_norm": 2.4181125164031982, "learning_rate": 2.9122509537939806e-07, "loss": 0.5151, "mean_token_accuracy": 0.8369066119194031, "num_tokens": 26132553.0, "step": 688 }, { "epoch": 0.08764788194886147, "ewc_loss": 0.006185004487633705, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 1.7904732885654084e-05, "grad_norm": 2.5542900562286377, "learning_rate": 2.916490038151759e-07, "loss": 0.567, "mean_token_accuracy": 0.8164980411529541, "num_tokens": 26171301.0, "step": 689 }, { "epoch": 0.08777509222745197, "ewc_loss": 0.0062162550166249275, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 1.8217235265183263e-05, "grad_norm": 2.572340726852417, "learning_rate": 2.9207291225095376e-07, "loss": 0.5534, "mean_token_accuracy": 0.8224550485610962, "num_tokens": 26217871.0, "step": 690 }, { "epoch": 0.08790230250604249, "ewc_loss": 0.006227448582649231, "ewc_loss_diag": 4.380941390991211e-06, "ewc_loss_parallel": 1.832917405408807e-05, "grad_norm": 2.5052692890167236, "learning_rate": 2.9249682068673166e-07, "loss": 0.5175, "mean_token_accuracy": 0.8338668346405029, "num_tokens": 26255927.0, "step": 691 }, { "epoch": 0.088029512784633, "ewc_loss": 0.006241248920559883, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 1.8161998013965786e-05, "grad_norm": 2.5082900524139404, "learning_rate": 2.9292072912250955e-07, "loss": 0.5002, "mean_token_accuracy": 0.84263014793396, "num_tokens": 26296041.0, "step": 692 }, { "epoch": 0.0881567230632235, "ewc_loss": 0.006242102477699518, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 1.817053635022603e-05, "grad_norm": 2.6719274520874023, "learning_rate": 2.933446375582874e-07, "loss": 0.5812, "mean_token_accuracy": 0.8170375227928162, "num_tokens": 26333785.0, "step": 693 }, { "epoch": 0.08828393334181402, "ewc_loss": 0.006280349567532539, "ewc_loss_diag": 4.410743713378906e-06, "ewc_loss_parallel": 1.8553009795141406e-05, "grad_norm": 2.5552289485931396, "learning_rate": 2.9376854599406525e-07, "loss": 0.4891, "mean_token_accuracy": 0.8413069844245911, "num_tokens": 26368837.0, "step": 694 }, { "epoch": 0.08841114362040453, "ewc_loss": 0.006288168020546436, "ewc_loss_diag": 4.4405460357666016e-06, "ewc_loss_parallel": 1.832601810747292e-05, "grad_norm": 2.681410074234009, "learning_rate": 2.9419245442984315e-07, "loss": 0.5631, "mean_token_accuracy": 0.8208223581314087, "num_tokens": 26402151.0, "step": 695 }, { "epoch": 0.08853835389899505, "ewc_loss": 0.0063131703063845634, "ewc_loss_diag": 4.4405460357666016e-06, "ewc_loss_parallel": 1.85760400199797e-05, "grad_norm": 2.567211151123047, "learning_rate": 2.9461636286562104e-07, "loss": 0.5172, "mean_token_accuracy": 0.8331259489059448, "num_tokens": 26435550.0, "step": 696 }, { "epoch": 0.08866556417758555, "ewc_loss": 0.006296291947364807, "ewc_loss_diag": 4.4405460357666016e-06, "ewc_loss_parallel": 1.8407254174235277e-05, "grad_norm": 2.416262626647949, "learning_rate": 2.9504027130139884e-07, "loss": 0.5073, "mean_token_accuracy": 0.8388311862945557, "num_tokens": 26479299.0, "step": 697 }, { "epoch": 0.08879277445617606, "ewc_loss": 0.006274128332734108, "ewc_loss_diag": 4.4405460357666016e-06, "ewc_loss_parallel": 1.818561759137083e-05, "grad_norm": 2.7054014205932617, "learning_rate": 2.9546417973717674e-07, "loss": 0.5437, "mean_token_accuracy": 0.8279417753219604, "num_tokens": 26519380.0, "step": 698 }, { "epoch": 0.08891998473476657, "ewc_loss": 0.006335875950753689, "ewc_loss_diag": 4.4405460357666016e-06, "ewc_loss_parallel": 1.8803097191266716e-05, "grad_norm": 2.4827773571014404, "learning_rate": 2.9588808817295464e-07, "loss": 0.4695, "mean_token_accuracy": 0.8474151492118835, "num_tokens": 26557306.0, "step": 699 }, { "epoch": 0.08904719501335707, "ewc_loss": 0.006294907536357641, "ewc_loss_diag": 4.4405460357666016e-06, "ewc_loss_parallel": 1.8393411664874293e-05, "grad_norm": 2.706632375717163, "learning_rate": 2.9631199660873253e-07, "loss": 0.5397, "mean_token_accuracy": 0.8294013738632202, "num_tokens": 26593549.0, "step": 700 }, { "epoch": 0.08917440529194759, "ewc_loss": 0.006336485035717487, "ewc_loss_diag": 4.4405460357666016e-06, "ewc_loss_parallel": 1.8809187167789787e-05, "grad_norm": 2.5413029193878174, "learning_rate": 2.9673590504451033e-07, "loss": 0.4649, "mean_token_accuracy": 0.8506419658660889, "num_tokens": 26629926.0, "step": 701 }, { "epoch": 0.0893016155705381, "ewc_loss": 0.006309008691459894, "ewc_loss_diag": 4.4405460357666016e-06, "ewc_loss_parallel": 1.8534423361415975e-05, "grad_norm": 2.5080199241638184, "learning_rate": 2.9715981348028823e-07, "loss": 0.4961, "mean_token_accuracy": 0.8430153131484985, "num_tokens": 26668652.0, "step": 702 }, { "epoch": 0.0894288258491286, "ewc_loss": 0.00630614161491394, "ewc_loss_diag": 4.4405460357666016e-06, "ewc_loss_parallel": 1.8505752450437285e-05, "grad_norm": 2.5976645946502686, "learning_rate": 2.975837219160661e-07, "loss": 0.5575, "mean_token_accuracy": 0.82222580909729, "num_tokens": 26708084.0, "step": 703 }, { "epoch": 0.08955603612771912, "ewc_loss": 0.006333509460091591, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.8779432139126584e-05, "grad_norm": 2.4856526851654053, "learning_rate": 2.98007630351844e-07, "loss": 0.5245, "mean_token_accuracy": 0.833886444568634, "num_tokens": 26750134.0, "step": 704 }, { "epoch": 0.08968324640630963, "ewc_loss": 0.006313358899205923, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.8577924493001774e-05, "grad_norm": 2.6683602333068848, "learning_rate": 2.984315387876218e-07, "loss": 0.5326, "mean_token_accuracy": 0.8301417827606201, "num_tokens": 26785410.0, "step": 705 }, { "epoch": 0.08981045668490013, "ewc_loss": 0.006354496814310551, "ewc_loss_diag": 4.470348358154297e-06, "ewc_loss_parallel": 1.8989301679539494e-05, "grad_norm": 2.662073850631714, "learning_rate": 2.988554472233997e-07, "loss": 0.5378, "mean_token_accuracy": 0.83122718334198, "num_tokens": 26825226.0, "step": 706 }, { "epoch": 0.08993766696349065, "ewc_loss": 0.006417504511773586, "ewc_loss_diag": 4.5299530029296875e-06, "ewc_loss_parallel": 1.9009030438610353e-05, "grad_norm": 2.479646682739258, "learning_rate": 2.992793556591776e-07, "loss": 0.5296, "mean_token_accuracy": 0.8312132954597473, "num_tokens": 26865401.0, "step": 707 }, { "epoch": 0.09006487724208116, "ewc_loss": 0.00638231448829174, "ewc_loss_diag": 4.5299530029296875e-06, "ewc_loss_parallel": 1.8657126929610968e-05, "grad_norm": 2.592247724533081, "learning_rate": 2.997032640949555e-07, "loss": 0.5407, "mean_token_accuracy": 0.8297523260116577, "num_tokens": 26905365.0, "step": 708 }, { "epoch": 0.09019208752067168, "ewc_loss": 0.006448186468333006, "ewc_loss_diag": 4.559755325317383e-06, "ewc_loss_parallel": 1.9010672986041754e-05, "grad_norm": 2.5537233352661133, "learning_rate": 3.001271725307333e-07, "loss": 0.4813, "mean_token_accuracy": 0.8446317911148071, "num_tokens": 26941348.0, "step": 709 }, { "epoch": 0.09031929779926218, "ewc_loss": 0.0064726523123681545, "ewc_loss_diag": 4.589557647705078e-06, "ewc_loss_parallel": 1.8950155208585784e-05, "grad_norm": 2.702298164367676, "learning_rate": 3.005510809665112e-07, "loss": 0.5177, "mean_token_accuracy": 0.8354272246360779, "num_tokens": 26975803.0, "step": 710 }, { "epoch": 0.09044650807785269, "ewc_loss": 0.006473137065768242, "ewc_loss_diag": 4.559755325317383e-06, "ewc_loss_parallel": 1.9260176486568525e-05, "grad_norm": 2.627964735031128, "learning_rate": 3.009749894022891e-07, "loss": 0.4635, "mean_token_accuracy": 0.8515307903289795, "num_tokens": 27014931.0, "step": 711 }, { "epoch": 0.0905737183564432, "ewc_loss": 0.006452987436205149, "ewc_loss_diag": 4.559755325317383e-06, "ewc_loss_parallel": 1.9058683392358944e-05, "grad_norm": 2.5451934337615967, "learning_rate": 3.01398897838067e-07, "loss": 0.5184, "mean_token_accuracy": 0.8376079797744751, "num_tokens": 27055505.0, "step": 712 }, { "epoch": 0.0907009286350337, "ewc_loss": 0.006436010356992483, "ewc_loss_diag": 4.559755325317383e-06, "ewc_loss_parallel": 1.8888913473347202e-05, "grad_norm": 2.5522427558898926, "learning_rate": 3.018228062738448e-07, "loss": 0.5468, "mean_token_accuracy": 0.8293629884719849, "num_tokens": 27092121.0, "step": 713 }, { "epoch": 0.09082813891362422, "ewc_loss": 0.006447585299611092, "ewc_loss_diag": 4.559755325317383e-06, "ewc_loss_parallel": 1.900466304505244e-05, "grad_norm": 2.5230910778045654, "learning_rate": 3.022467147096227e-07, "loss": 0.4844, "mean_token_accuracy": 0.8448600769042969, "num_tokens": 27129827.0, "step": 714 }, { "epoch": 0.09095534919221474, "ewc_loss": 0.006479104049503803, "ewc_loss_diag": 4.589557647705078e-06, "ewc_loss_parallel": 1.9014674762729555e-05, "grad_norm": 2.43766450881958, "learning_rate": 3.026706231454006e-07, "loss": 0.5005, "mean_token_accuracy": 0.8391018509864807, "num_tokens": 27174977.0, "step": 715 }, { "epoch": 0.09108255947080524, "ewc_loss": 0.006435314193367958, "ewc_loss_diag": 4.559755325317383e-06, "ewc_loss_parallel": 1.8881952200899832e-05, "grad_norm": 2.6491613388061523, "learning_rate": 3.0309453158117844e-07, "loss": 0.5062, "mean_token_accuracy": 0.8387212753295898, "num_tokens": 27211472.0, "step": 716 }, { "epoch": 0.09120976974939575, "ewc_loss": 0.006518136709928513, "ewc_loss_diag": 4.589557647705078e-06, "ewc_loss_parallel": 1.9404998965910636e-05, "grad_norm": 2.6024234294891357, "learning_rate": 3.035184400169563e-07, "loss": 0.5855, "mean_token_accuracy": 0.8177322149276733, "num_tokens": 27249160.0, "step": 717 }, { "epoch": 0.09133698002798626, "ewc_loss": 0.006506284698843956, "ewc_loss_diag": 4.589557647705078e-06, "ewc_loss_parallel": 1.9286482711322606e-05, "grad_norm": 2.692126512527466, "learning_rate": 3.039423484527342e-07, "loss": 0.498, "mean_token_accuracy": 0.8402334451675415, "num_tokens": 27287946.0, "step": 718 }, { "epoch": 0.09146419030657676, "ewc_loss": 0.006555590778589249, "ewc_loss_diag": 4.6193599700927734e-06, "ewc_loss_parallel": 1.947436248883605e-05, "grad_norm": 2.6259188652038574, "learning_rate": 3.043662568885121e-07, "loss": 0.5294, "mean_token_accuracy": 0.8277172446250916, "num_tokens": 27323388.0, "step": 719 }, { "epoch": 0.09159140058516728, "ewc_loss": 0.00654661376029253, "ewc_loss_diag": 4.6193599700927734e-06, "ewc_loss_parallel": 1.9384591723792255e-05, "grad_norm": 2.806285858154297, "learning_rate": 3.0479016532428993e-07, "loss": 0.5787, "mean_token_accuracy": 0.8155341148376465, "num_tokens": 27357524.0, "step": 720 }, { "epoch": 0.0917186108637578, "ewc_loss": 0.006639285013079643, "ewc_loss_diag": 4.678964614868164e-06, "ewc_loss_parallel": 1.9700953998835757e-05, "grad_norm": 2.610844373703003, "learning_rate": 3.052140737600678e-07, "loss": 0.4872, "mean_token_accuracy": 0.8423157930374146, "num_tokens": 27392037.0, "step": 721 }, { "epoch": 0.09184582114234831, "ewc_loss": 0.006664498709142208, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.9342740415595472e-05, "grad_norm": 2.5661520957946777, "learning_rate": 3.056379821958457e-07, "loss": 0.5007, "mean_token_accuracy": 0.8402131795883179, "num_tokens": 27429149.0, "step": 722 }, { "epoch": 0.09197303142093881, "ewc_loss": 0.006667438894510269, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.9372142560314387e-05, "grad_norm": 2.611884832382202, "learning_rate": 3.060618906316236e-07, "loss": 0.5024, "mean_token_accuracy": 0.8412947058677673, "num_tokens": 27465023.0, "step": 723 }, { "epoch": 0.09210024169952932, "ewc_loss": 0.006679484620690346, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.9492601495585404e-05, "grad_norm": 2.6247546672821045, "learning_rate": 3.064857990674014e-07, "loss": 0.4948, "mean_token_accuracy": 0.8415403366088867, "num_tokens": 27501268.0, "step": 724 }, { "epoch": 0.09222745197811984, "ewc_loss": 0.006682909559458494, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.9526849428075366e-05, "grad_norm": 2.5722033977508545, "learning_rate": 3.0690970750317927e-07, "loss": 0.5463, "mean_token_accuracy": 0.8278112411499023, "num_tokens": 27539937.0, "step": 725 }, { "epoch": 0.09235466225671034, "ewc_loss": 0.006676110439002514, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.945885560417082e-05, "grad_norm": 2.675474166870117, "learning_rate": 3.0733361593895717e-07, "loss": 0.5087, "mean_token_accuracy": 0.8381243944168091, "num_tokens": 27574576.0, "step": 726 }, { "epoch": 0.09248187253530085, "ewc_loss": 0.0066685304045677185, "ewc_loss_diag": 4.708766937255859e-06, "ewc_loss_parallel": 1.968823380593676e-05, "grad_norm": 2.5356297492980957, "learning_rate": 3.0775752437473507e-07, "loss": 0.517, "mean_token_accuracy": 0.8303155303001404, "num_tokens": 27616036.0, "step": 727 }, { "epoch": 0.09260908281389137, "ewc_loss": 0.006677546538412571, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.9473221982480027e-05, "grad_norm": 2.54005765914917, "learning_rate": 3.081814328105129e-07, "loss": 0.5396, "mean_token_accuracy": 0.8289706707000732, "num_tokens": 27662040.0, "step": 728 }, { "epoch": 0.09273629309248187, "ewc_loss": 0.006684785708785057, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.9545608665794134e-05, "grad_norm": 2.5569705963134766, "learning_rate": 3.0860534124629076e-07, "loss": 0.569, "mean_token_accuracy": 0.8198715448379517, "num_tokens": 27705136.0, "step": 729 }, { "epoch": 0.09286350337107238, "ewc_loss": 0.006699373945593834, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.969149525393732e-05, "grad_norm": 2.6365206241607666, "learning_rate": 3.0902924968206866e-07, "loss": 0.5181, "mean_token_accuracy": 0.8361090421676636, "num_tokens": 27740564.0, "step": 730 }, { "epoch": 0.0929907136496629, "ewc_loss": 0.006716819945722818, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.9865954527631402e-05, "grad_norm": 2.7307229042053223, "learning_rate": 3.0945315811784656e-07, "loss": 0.4927, "mean_token_accuracy": 0.8403840065002441, "num_tokens": 27774788.0, "step": 731 }, { "epoch": 0.0931179239282534, "ewc_loss": 0.006728760898113251, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.9985363906016573e-05, "grad_norm": 2.5371007919311523, "learning_rate": 3.098770665536244e-07, "loss": 0.6358, "mean_token_accuracy": 0.7971670627593994, "num_tokens": 27818300.0, "step": 732 }, { "epoch": 0.09324513420684391, "ewc_loss": 0.006692470982670784, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.9622464606072754e-05, "grad_norm": 2.631791830062866, "learning_rate": 3.1030097498940225e-07, "loss": 0.4817, "mean_token_accuracy": 0.8455086946487427, "num_tokens": 27857739.0, "step": 733 }, { "epoch": 0.09337234448543442, "ewc_loss": 0.0067131659016013145, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.982941284950357e-05, "grad_norm": 2.5772151947021484, "learning_rate": 3.1072488342518015e-07, "loss": 0.5172, "mean_token_accuracy": 0.8295641541481018, "num_tokens": 27893897.0, "step": 734 }, { "epoch": 0.09349955476402494, "ewc_loss": 0.006713647395372391, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.983422953344416e-05, "grad_norm": 2.57601261138916, "learning_rate": 3.11148791860958e-07, "loss": 0.5062, "mean_token_accuracy": 0.8360728025436401, "num_tokens": 27930511.0, "step": 735 }, { "epoch": 0.09362676504261544, "ewc_loss": 0.00671518687158823, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 1.9849623640766367e-05, "grad_norm": 2.7824513912200928, "learning_rate": 3.115727002967359e-07, "loss": 0.5224, "mean_token_accuracy": 0.8330632448196411, "num_tokens": 27961128.0, "step": 736 }, { "epoch": 0.09375397532120595, "ewc_loss": 0.006758221425116062, "ewc_loss_diag": 4.738569259643555e-06, "ewc_loss_parallel": 2.0279971067793667e-05, "grad_norm": 2.5308098793029785, "learning_rate": 3.1199660873251374e-07, "loss": 0.5233, "mean_token_accuracy": 0.8338878154754639, "num_tokens": 28000748.0, "step": 737 }, { "epoch": 0.09388118559979647, "ewc_loss": 0.00673828786239028, "ewc_loss_diag": 4.76837158203125e-06, "ewc_loss_parallel": 1.977545616682619e-05, "grad_norm": 2.4264190196990967, "learning_rate": 3.1242051716829164e-07, "loss": 0.5573, "mean_token_accuracy": 0.8217171430587769, "num_tokens": 28046455.0, "step": 738 }, { "epoch": 0.09400839587838697, "ewc_loss": 0.0067916931584477425, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 1.969915501831565e-05, "grad_norm": 2.6352174282073975, "learning_rate": 3.128444256040695e-07, "loss": 0.5326, "mean_token_accuracy": 0.8315836191177368, "num_tokens": 28087026.0, "step": 739 }, { "epoch": 0.09413560615697748, "ewc_loss": 0.006850873585790396, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 2.0290963220759295e-05, "grad_norm": 2.5440876483917236, "learning_rate": 3.132683340398474e-07, "loss": 0.4574, "mean_token_accuracy": 0.8500470519065857, "num_tokens": 28126382.0, "step": 740 }, { "epoch": 0.094262816435568, "ewc_loss": 0.0068293483927845955, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 2.007570947171189e-05, "grad_norm": 2.5618302822113037, "learning_rate": 3.1369224247562523e-07, "loss": 0.4829, "mean_token_accuracy": 0.8467128276824951, "num_tokens": 28161198.0, "step": 741 }, { "epoch": 0.0943900267141585, "ewc_loss": 0.006835319101810455, "ewc_loss_diag": 4.827976226806641e-06, "ewc_loss_parallel": 2.013541779888328e-05, "grad_norm": 2.44740891456604, "learning_rate": 3.1411615091140313e-07, "loss": 0.4529, "mean_token_accuracy": 0.8545899391174316, "num_tokens": 28204612.0, "step": 742 }, { "epoch": 0.09451723699274901, "ewc_loss": 0.006852450780570507, "ewc_loss_diag": 4.857778549194336e-06, "ewc_loss_parallel": 2.000156200665515e-05, "grad_norm": 2.540931224822998, "learning_rate": 3.14540059347181e-07, "loss": 0.5069, "mean_token_accuracy": 0.837501049041748, "num_tokens": 28241050.0, "step": 743 }, { "epoch": 0.09464444727133953, "ewc_loss": 0.00693717785179615, "ewc_loss_diag": 4.9173831939697266e-06, "ewc_loss_parallel": 2.0238478100509383e-05, "grad_norm": 2.549753189086914, "learning_rate": 3.149639677829589e-07, "loss": 0.4677, "mean_token_accuracy": 0.8487178087234497, "num_tokens": 28278032.0, "step": 744 }, { "epoch": 0.09477165754993004, "ewc_loss": 0.006946278735995293, "ewc_loss_diag": 4.9173831939697266e-06, "ewc_loss_parallel": 2.032948577834759e-05, "grad_norm": 2.610403537750244, "learning_rate": 3.153878762187368e-07, "loss": 0.5543, "mean_token_accuracy": 0.8282438516616821, "num_tokens": 28315599.0, "step": 745 }, { "epoch": 0.09489886782852054, "ewc_loss": 0.006960477214306593, "ewc_loss_diag": 4.9173831939697266e-06, "ewc_loss_parallel": 2.047147245320957e-05, "grad_norm": 2.576101303100586, "learning_rate": 3.158117846545146e-07, "loss": 0.5786, "mean_token_accuracy": 0.8183678388595581, "num_tokens": 28358504.0, "step": 746 }, { "epoch": 0.09502607810711106, "ewc_loss": 0.006950326729565859, "ewc_loss_diag": 4.9173831939697266e-06, "ewc_loss_parallel": 2.03699655685341e-05, "grad_norm": 2.9373831748962402, "learning_rate": 3.1623569309029247e-07, "loss": 0.496, "mean_token_accuracy": 0.8414754867553711, "num_tokens": 28389626.0, "step": 747 }, { "epoch": 0.09515328838570157, "ewc_loss": 0.0070199742913246155, "ewc_loss_diag": 4.9173831939697266e-06, "ewc_loss_parallel": 2.106644205923658e-05, "grad_norm": 2.513021230697632, "learning_rate": 3.1665960152607037e-07, "loss": 0.4763, "mean_token_accuracy": 0.8458468914031982, "num_tokens": 28430361.0, "step": 748 }, { "epoch": 0.09528049866429207, "ewc_loss": 0.006960080936551094, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 2.016233338508755e-05, "grad_norm": 2.5137674808502197, "learning_rate": 3.1708350996184826e-07, "loss": 0.5139, "mean_token_accuracy": 0.8342280387878418, "num_tokens": 28469792.0, "step": 749 }, { "epoch": 0.09540770894288259, "ewc_loss": 0.006970256567001343, "ewc_loss_diag": 4.947185516357422e-06, "ewc_loss_parallel": 2.0264091290300712e-05, "grad_norm": 2.639116048812866, "learning_rate": 3.175074183976261e-07, "loss": 0.4916, "mean_token_accuracy": 0.8447883129119873, "num_tokens": 28507569.0, "step": 750 }, { "epoch": 0.0955349192214731, "ewc_loss": 0.007039486896246672, "ewc_loss_diag": 4.976987838745117e-06, "ewc_loss_parallel": 2.065121771011036e-05, "grad_norm": 2.6736724376678467, "learning_rate": 3.1793132683340396e-07, "loss": 0.5183, "mean_token_accuracy": 0.8355587720870972, "num_tokens": 28542336.0, "step": 751 }, { "epoch": 0.0956621295000636, "ewc_loss": 0.007069820072501898, "ewc_loss_diag": 5.0067901611328125e-06, "ewc_loss_parallel": 2.0649373254855163e-05, "grad_norm": 2.6644792556762695, "learning_rate": 3.1835523526918186e-07, "loss": 0.4688, "mean_token_accuracy": 0.8486270904541016, "num_tokens": 28578587.0, "step": 752 }, { "epoch": 0.09578933977865411, "ewc_loss": 0.0070890020579099655, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 2.053601565421559e-05, "grad_norm": 2.5874547958374023, "learning_rate": 3.1877914370495975e-07, "loss": 0.5487, "mean_token_accuracy": 0.8261328935623169, "num_tokens": 28617505.0, "step": 753 }, { "epoch": 0.09591655005724463, "ewc_loss": 0.007074919994920492, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 2.039519495156128e-05, "grad_norm": 2.5736374855041504, "learning_rate": 3.1920305214073755e-07, "loss": 0.5526, "mean_token_accuracy": 0.8254085183143616, "num_tokens": 28658526.0, "step": 754 }, { "epoch": 0.09604376033583513, "ewc_loss": 0.007081124931573868, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 2.0457247956073843e-05, "grad_norm": 2.553311586380005, "learning_rate": 3.1962696057651545e-07, "loss": 0.4894, "mean_token_accuracy": 0.8429884314537048, "num_tokens": 28698825.0, "step": 755 }, { "epoch": 0.09617097061442564, "ewc_loss": 0.007082896772772074, "ewc_loss_diag": 5.036592483520508e-06, "ewc_loss_parallel": 2.047496491286438e-05, "grad_norm": 2.6416614055633545, "learning_rate": 3.2005086901229335e-07, "loss": 0.5323, "mean_token_accuracy": 0.8271284699440002, "num_tokens": 28733238.0, "step": 756 }, { "epoch": 0.09629818089301616, "ewc_loss": 0.007134614512324333, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 2.068696812784765e-05, "grad_norm": 2.5112156867980957, "learning_rate": 3.2047477744807125e-07, "loss": 0.5319, "mean_token_accuracy": 0.8330886363983154, "num_tokens": 28771552.0, "step": 757 }, { "epoch": 0.09642539117160667, "ewc_loss": 0.007111701183021069, "ewc_loss_diag": 5.066394805908203e-06, "ewc_loss_parallel": 2.0457835489651188e-05, "grad_norm": 2.5726914405822754, "learning_rate": 3.2089868588384904e-07, "loss": 0.5623, "mean_token_accuracy": 0.8191578388214111, "num_tokens": 28810827.0, "step": 758 }, { "epoch": 0.09655260145019717, "ewc_loss": 0.0071628582663834095, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 2.066422712232452e-05, "grad_norm": 2.620030641555786, "learning_rate": 3.2132259431962694e-07, "loss": 0.4793, "mean_token_accuracy": 0.8471225500106812, "num_tokens": 28849017.0, "step": 759 }, { "epoch": 0.09667981172878769, "ewc_loss": 0.0071775587275624275, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 2.0811232388950884e-05, "grad_norm": 2.6100614070892334, "learning_rate": 3.2174650275540484e-07, "loss": 0.5279, "mean_token_accuracy": 0.837108850479126, "num_tokens": 28887659.0, "step": 760 }, { "epoch": 0.0968070220073782, "ewc_loss": 0.007172838784754276, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 2.0764031432918273e-05, "grad_norm": 2.451040029525757, "learning_rate": 3.2217041119118274e-07, "loss": 0.4935, "mean_token_accuracy": 0.8421916961669922, "num_tokens": 28928983.0, "step": 761 }, { "epoch": 0.0969342322859687, "ewc_loss": 0.007146728225052357, "ewc_loss_diag": 5.0961971282958984e-06, "ewc_loss_parallel": 2.0502924598986283e-05, "grad_norm": 2.6943085193634033, "learning_rate": 3.2259431962696053e-07, "loss": 0.5256, "mean_token_accuracy": 0.8339680433273315, "num_tokens": 28962195.0, "step": 762 }, { "epoch": 0.09706144256455922, "ewc_loss": 0.007238435558974743, "ewc_loss_diag": 5.125999450683594e-06, "ewc_loss_parallel": 2.11148271773709e-05, "grad_norm": 2.6149330139160156, "learning_rate": 3.2301822806273843e-07, "loss": 0.5086, "mean_token_accuracy": 0.8353311419487, "num_tokens": 28997351.0, "step": 763 }, { "epoch": 0.09718865284314973, "ewc_loss": 0.007253971882164478, "ewc_loss_diag": 5.155801773071289e-06, "ewc_loss_parallel": 2.096500975312665e-05, "grad_norm": 2.8390564918518066, "learning_rate": 3.2344213649851633e-07, "loss": 0.5341, "mean_token_accuracy": 0.8298189640045166, "num_tokens": 29040988.0, "step": 764 }, { "epoch": 0.09731586312174023, "ewc_loss": 0.007293233647942543, "ewc_loss_diag": 5.155801773071289e-06, "ewc_loss_parallel": 2.1357627701945603e-05, "grad_norm": 2.5198304653167725, "learning_rate": 3.238660449342942e-07, "loss": 0.4981, "mean_token_accuracy": 0.8389513492584229, "num_tokens": 29081873.0, "step": 765 }, { "epoch": 0.09744307340033075, "ewc_loss": 0.007223113905638456, "ewc_loss_diag": 5.155801773071289e-06, "ewc_loss_parallel": 2.065643093374092e-05, "grad_norm": 2.6420516967773438, "learning_rate": 3.24289953370072e-07, "loss": 0.4713, "mean_token_accuracy": 0.8448607921600342, "num_tokens": 29115360.0, "step": 766 }, { "epoch": 0.09757028367892126, "ewc_loss": 0.00725949089974165, "ewc_loss_diag": 5.155801773071289e-06, "ewc_loss_parallel": 2.1020199710619636e-05, "grad_norm": 2.4943244457244873, "learning_rate": 3.247138618058499e-07, "loss": 0.5152, "mean_token_accuracy": 0.8310728669166565, "num_tokens": 29160139.0, "step": 767 }, { "epoch": 0.09769749395751176, "ewc_loss": 0.007264708634465933, "ewc_loss_diag": 5.185604095458984e-06, "ewc_loss_parallel": 2.0767203750438057e-05, "grad_norm": 2.4920854568481445, "learning_rate": 3.251377702416278e-07, "loss": 0.5439, "mean_token_accuracy": 0.8287402391433716, "num_tokens": 29202872.0, "step": 768 }, { "epoch": 0.09782470423610228, "ewc_loss": 0.0072783720679581165, "ewc_loss_diag": 5.185604095458984e-06, "ewc_loss_parallel": 2.0903837139485404e-05, "grad_norm": 2.50516414642334, "learning_rate": 3.255616786774057e-07, "loss": 0.4918, "mean_token_accuracy": 0.8426914215087891, "num_tokens": 29246965.0, "step": 769 }, { "epoch": 0.09795191451469279, "ewc_loss": 0.007285265251994133, "ewc_loss_diag": 5.185604095458984e-06, "ewc_loss_parallel": 2.0972769561922178e-05, "grad_norm": 2.45876145362854, "learning_rate": 3.259855871131835e-07, "loss": 0.4317, "mean_token_accuracy": 0.859696626663208, "num_tokens": 29289531.0, "step": 770 }, { "epoch": 0.0980791247932833, "ewc_loss": 0.007275445386767387, "ewc_loss_diag": 5.185604095458984e-06, "ewc_loss_parallel": 2.0874573237961158e-05, "grad_norm": 2.5060129165649414, "learning_rate": 3.264094955489614e-07, "loss": 0.4876, "mean_token_accuracy": 0.8429261445999146, "num_tokens": 29330484.0, "step": 771 }, { "epoch": 0.0982063350718738, "ewc_loss": 0.0072957538068294525, "ewc_loss_diag": 5.185604095458984e-06, "ewc_loss_parallel": 2.1077652490930632e-05, "grad_norm": 2.523472547531128, "learning_rate": 3.268334039847393e-07, "loss": 0.5136, "mean_token_accuracy": 0.8346185684204102, "num_tokens": 29369595.0, "step": 772 }, { "epoch": 0.09833354535046432, "ewc_loss": 0.007297405507415533, "ewc_loss_diag": 5.185604095458984e-06, "ewc_loss_parallel": 2.1094172552693635e-05, "grad_norm": 2.5473344326019287, "learning_rate": 3.2725731242051715e-07, "loss": 0.5022, "mean_token_accuracy": 0.8419893383979797, "num_tokens": 29411856.0, "step": 773 }, { "epoch": 0.09846075562905483, "ewc_loss": 0.007300263270735741, "ewc_loss_diag": 5.185604095458984e-06, "ewc_loss_parallel": 2.1122747057233937e-05, "grad_norm": 2.524137496948242, "learning_rate": 3.27681220856295e-07, "loss": 0.4853, "mean_token_accuracy": 0.8453277945518494, "num_tokens": 29454754.0, "step": 774 }, { "epoch": 0.09858796590764533, "ewc_loss": 0.007330210879445076, "ewc_loss_diag": 5.21540641784668e-06, "ewc_loss_parallel": 2.1117048163432628e-05, "grad_norm": 2.626742362976074, "learning_rate": 3.281051292920729e-07, "loss": 0.4818, "mean_token_accuracy": 0.8429331183433533, "num_tokens": 29489131.0, "step": 775 }, { "epoch": 0.09871517618623585, "ewc_loss": 0.0073559703305363655, "ewc_loss_diag": 5.21540641784668e-06, "ewc_loss_parallel": 2.137464798579458e-05, "grad_norm": 2.559177875518799, "learning_rate": 3.285290377278508e-07, "loss": 0.5356, "mean_token_accuracy": 0.827725887298584, "num_tokens": 29528292.0, "step": 776 }, { "epoch": 0.09884238646482636, "ewc_loss": 0.007342998869717121, "ewc_loss_diag": 5.21540641784668e-06, "ewc_loss_parallel": 2.1244928575470112e-05, "grad_norm": 2.628762722015381, "learning_rate": 3.2895294616362864e-07, "loss": 0.4704, "mean_token_accuracy": 0.8510427474975586, "num_tokens": 29564878.0, "step": 777 }, { "epoch": 0.09896959674341686, "ewc_loss": 0.007362179458141327, "ewc_loss_diag": 5.21540641784668e-06, "ewc_loss_parallel": 2.1436735551105812e-05, "grad_norm": 2.619621515274048, "learning_rate": 3.293768545994065e-07, "loss": 0.4888, "mean_token_accuracy": 0.8402643203735352, "num_tokens": 29602826.0, "step": 778 }, { "epoch": 0.09909680702200738, "ewc_loss": 0.007386884652078152, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 2.1378609744715504e-05, "grad_norm": 2.6223301887512207, "learning_rate": 3.298007630351844e-07, "loss": 0.5572, "mean_token_accuracy": 0.8252106308937073, "num_tokens": 29640323.0, "step": 779 }, { "epoch": 0.09922401730059789, "ewc_loss": 0.0073850117623806, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 2.1359881429816596e-05, "grad_norm": 2.4766557216644287, "learning_rate": 3.302246714709623e-07, "loss": 0.4873, "mean_token_accuracy": 0.8426212072372437, "num_tokens": 29683302.0, "step": 780 }, { "epoch": 0.09935122757918839, "ewc_loss": 0.00736096641048789, "ewc_loss_diag": 5.245208740234375e-06, "ewc_loss_parallel": 2.111942922056187e-05, "grad_norm": 2.567488670349121, "learning_rate": 3.3064857990674013e-07, "loss": 0.5118, "mean_token_accuracy": 0.8391557335853577, "num_tokens": 29721085.0, "step": 781 }, { "epoch": 0.0994784378577789, "ewc_loss": 0.007423303090035915, "ewc_loss_diag": 5.27501106262207e-06, "ewc_loss_parallel": 2.1437619579955935e-05, "grad_norm": 2.646468162536621, "learning_rate": 3.31072488342518e-07, "loss": 0.4637, "mean_token_accuracy": 0.8479676246643066, "num_tokens": 29753810.0, "step": 782 }, { "epoch": 0.09960564813636942, "ewc_loss": 0.007446131203323603, "ewc_loss_diag": 5.27501106262207e-06, "ewc_loss_parallel": 2.1665900931111537e-05, "grad_norm": 2.5333423614501953, "learning_rate": 3.314963967782959e-07, "loss": 0.491, "mean_token_accuracy": 0.8441413044929504, "num_tokens": 29796488.0, "step": 783 }, { "epoch": 0.09973285841495994, "ewc_loss": 0.007414449006319046, "ewc_loss_diag": 5.27501106262207e-06, "ewc_loss_parallel": 2.1349078451748937e-05, "grad_norm": 2.5067601203918457, "learning_rate": 3.319203052140738e-07, "loss": 0.5018, "mean_token_accuracy": 0.8386211395263672, "num_tokens": 29838181.0, "step": 784 }, { "epoch": 0.09986006869355044, "ewc_loss": 0.007418986409902573, "ewc_loss_diag": 5.27501106262207e-06, "ewc_loss_parallel": 2.1394456780399196e-05, "grad_norm": 3.115612745285034, "learning_rate": 3.323442136498516e-07, "loss": 0.4847, "mean_token_accuracy": 0.8452507853507996, "num_tokens": 29868893.0, "step": 785 }, { "epoch": 0.09998727897214095, "ewc_loss": 0.007557244971394539, "ewc_loss_diag": 5.27501106262207e-06, "ewc_loss_parallel": 2.2777037884225138e-05, "grad_norm": 2.722111225128174, "learning_rate": 3.3276812208562947e-07, "loss": 0.4996, "mean_token_accuracy": 0.838238537311554, "num_tokens": 29903571.0, "step": 786 }, { "epoch": 0.10011448925073146, "ewc_loss": 0.007484286092221737, "ewc_loss_diag": 5.304813385009766e-06, "ewc_loss_parallel": 2.1742274839198217e-05, "grad_norm": 2.6258902549743652, "learning_rate": 3.3319203052140737e-07, "loss": 0.5561, "mean_token_accuracy": 0.8298778533935547, "num_tokens": 29940438.0, "step": 787 }, { "epoch": 0.10024169952932196, "ewc_loss": 0.007492159493267536, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 2.1515836124308407e-05, "grad_norm": 2.6039483547210693, "learning_rate": 3.336159389571852e-07, "loss": 0.4786, "mean_token_accuracy": 0.850387692451477, "num_tokens": 29976538.0, "step": 788 }, { "epoch": 0.10036890980791248, "ewc_loss": 0.007511330768465996, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 2.1707543055526912e-05, "grad_norm": 2.4912445545196533, "learning_rate": 3.340398473929631e-07, "loss": 0.5486, "mean_token_accuracy": 0.8256936073303223, "num_tokens": 30019814.0, "step": 789 }, { "epoch": 0.100496120086503, "ewc_loss": 0.00749247008934617, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 2.1518939320230857e-05, "grad_norm": 2.6523923873901367, "learning_rate": 3.3446375582874096e-07, "loss": 0.4988, "mean_token_accuracy": 0.8413243293762207, "num_tokens": 30057578.0, "step": 790 }, { "epoch": 0.1006233303650935, "ewc_loss": 0.007531048264354467, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 2.1904721506871283e-05, "grad_norm": 2.6373744010925293, "learning_rate": 3.3488766426451886e-07, "loss": 0.5696, "mean_token_accuracy": 0.8263439536094666, "num_tokens": 30094869.0, "step": 791 }, { "epoch": 0.10075054064368401, "ewc_loss": 0.007531541399657726, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 2.19096546061337e-05, "grad_norm": 2.682257652282715, "learning_rate": 3.353115727002967e-07, "loss": 0.5318, "mean_token_accuracy": 0.8311360478401184, "num_tokens": 30136446.0, "step": 792 }, { "epoch": 0.10087775092227452, "ewc_loss": 0.007537125609815121, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 2.1965495761833154e-05, "grad_norm": 2.4752209186553955, "learning_rate": 3.357354811360746e-07, "loss": 0.5252, "mean_token_accuracy": 0.8290590047836304, "num_tokens": 30179226.0, "step": 793 }, { "epoch": 0.10100496120086502, "ewc_loss": 0.00749592762440443, "ewc_loss_diag": 5.334615707397461e-06, "ewc_loss_parallel": 2.1553514670813456e-05, "grad_norm": 2.6044459342956543, "learning_rate": 3.3615938957185245e-07, "loss": 0.5161, "mean_token_accuracy": 0.8345811367034912, "num_tokens": 30213890.0, "step": 794 }, { "epoch": 0.10113217147945554, "ewc_loss": 0.0075785135850310326, "ewc_loss_diag": 5.364418029785156e-06, "ewc_loss_parallel": 2.2074194930610247e-05, "grad_norm": 2.7050604820251465, "learning_rate": 3.3658329800763035e-07, "loss": 0.5665, "mean_token_accuracy": 0.816851019859314, "num_tokens": 30245691.0, "step": 795 }, { "epoch": 0.10125938175804605, "ewc_loss": 0.007637164555490017, "ewc_loss_diag": 5.3942203521728516e-06, "ewc_loss_parallel": 2.2355532564688474e-05, "grad_norm": 2.5928478240966797, "learning_rate": 3.370072064434082e-07, "loss": 0.522, "mean_token_accuracy": 0.8266403079032898, "num_tokens": 30281738.0, "step": 796 }, { "epoch": 0.10138659203663657, "ewc_loss": 0.007605440448969603, "ewc_loss_diag": 5.3942203521728516e-06, "ewc_loss_parallel": 2.203829171776306e-05, "grad_norm": 2.5098788738250732, "learning_rate": 3.374311148791861e-07, "loss": 0.487, "mean_token_accuracy": 0.841673731803894, "num_tokens": 30320707.0, "step": 797 }, { "epoch": 0.10151380231522707, "ewc_loss": 0.007598831783980131, "ewc_loss_diag": 5.3942203521728516e-06, "ewc_loss_parallel": 2.197220419475343e-05, "grad_norm": 2.7897958755493164, "learning_rate": 3.3785502331496394e-07, "loss": 0.4936, "mean_token_accuracy": 0.8391207456588745, "num_tokens": 30351379.0, "step": 798 }, { "epoch": 0.10164101259381758, "ewc_loss": 0.007741494569927454, "ewc_loss_diag": 5.453824996948242e-06, "ewc_loss_parallel": 2.2788481146562845e-05, "grad_norm": 2.859153985977173, "learning_rate": 3.3827893175074184e-07, "loss": 0.5185, "mean_token_accuracy": 0.8325222730636597, "num_tokens": 30382395.0, "step": 799 }, { "epoch": 0.1017682228724081, "ewc_loss": 0.0077429600059986115, "ewc_loss_diag": 5.453824996948242e-06, "ewc_loss_parallel": 2.2803136744187213e-05, "grad_norm": 2.636140823364258, "learning_rate": 3.387028401865197e-07, "loss": 0.525, "mean_token_accuracy": 0.8365160226821899, "num_tokens": 30419758.0, "step": 800 }, { "epoch": 0.1018954331509986, "ewc_loss": 0.007745775394141674, "ewc_loss_diag": 5.513429641723633e-06, "ewc_loss_parallel": 2.2220940081751905e-05, "grad_norm": 2.5750181674957275, "learning_rate": 3.391267486222976e-07, "loss": 0.5092, "mean_token_accuracy": 0.8344549536705017, "num_tokens": 30458115.0, "step": 801 }, { "epoch": 0.10202264342958911, "ewc_loss": 0.007751544006168842, "ewc_loss_diag": 5.513429641723633e-06, "ewc_loss_parallel": 2.227862387371715e-05, "grad_norm": 2.747591018676758, "learning_rate": 3.3955065705807543e-07, "loss": 0.5429, "mean_token_accuracy": 0.8245487213134766, "num_tokens": 30489679.0, "step": 802 }, { "epoch": 0.10214985370817962, "ewc_loss": 0.007802936248481274, "ewc_loss_diag": 5.513429641723633e-06, "ewc_loss_parallel": 2.2792544768890366e-05, "grad_norm": 2.678887128829956, "learning_rate": 3.3997456549385333e-07, "loss": 0.4808, "mean_token_accuracy": 0.8462518453598022, "num_tokens": 30524874.0, "step": 803 }, { "epoch": 0.10227706398677013, "ewc_loss": 0.00781523808836937, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 2.261038389406167e-05, "grad_norm": 2.726694107055664, "learning_rate": 3.403984739296312e-07, "loss": 0.5521, "mean_token_accuracy": 0.825045645236969, "num_tokens": 30558354.0, "step": 804 }, { "epoch": 0.10240427426536064, "ewc_loss": 0.007822558283805847, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 2.268359276058618e-05, "grad_norm": 2.610072135925293, "learning_rate": 3.408223823654091e-07, "loss": 0.4838, "mean_token_accuracy": 0.8441628813743591, "num_tokens": 30594827.0, "step": 805 }, { "epoch": 0.10253148454395115, "ewc_loss": 0.007799302227795124, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 2.2451031327364035e-05, "grad_norm": 2.643659830093384, "learning_rate": 3.412462908011869e-07, "loss": 0.5264, "mean_token_accuracy": 0.8376451730728149, "num_tokens": 30635259.0, "step": 806 }, { "epoch": 0.10265869482254165, "ewc_loss": 0.007814193144440651, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 2.259994471387472e-05, "grad_norm": 2.5083868503570557, "learning_rate": 3.4167019923696477e-07, "loss": 0.5182, "mean_token_accuracy": 0.8398522138595581, "num_tokens": 30681850.0, "step": 807 }, { "epoch": 0.10278590510113217, "ewc_loss": 0.007791244890540838, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 2.2370455553755164e-05, "grad_norm": 2.5747711658477783, "learning_rate": 3.4209410767274267e-07, "loss": 0.484, "mean_token_accuracy": 0.8475829362869263, "num_tokens": 30721302.0, "step": 808 }, { "epoch": 0.10291311537972268, "ewc_loss": 0.007817784324288368, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 2.2635846107732505e-05, "grad_norm": 2.677157402038574, "learning_rate": 3.4251801610852057e-07, "loss": 0.5599, "mean_token_accuracy": 0.8193396925926208, "num_tokens": 30761975.0, "step": 809 }, { "epoch": 0.1030403256583132, "ewc_loss": 0.007844027131795883, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 2.289827898493968e-05, "grad_norm": 2.7262208461761475, "learning_rate": 3.429419245442984e-07, "loss": 0.511, "mean_token_accuracy": 0.8352863788604736, "num_tokens": 30797754.0, "step": 810 }, { "epoch": 0.1031675359369037, "ewc_loss": 0.00784495659172535, "ewc_loss_diag": 5.543231964111328e-06, "ewc_loss_parallel": 2.29075740207918e-05, "grad_norm": 2.594665765762329, "learning_rate": 3.4336583298007626e-07, "loss": 0.4864, "mean_token_accuracy": 0.8451777696609497, "num_tokens": 30834183.0, "step": 811 }, { "epoch": 0.10329474621549421, "ewc_loss": 0.007849115878343582, "ewc_loss_diag": 5.5730342864990234e-06, "ewc_loss_parallel": 2.264399518026039e-05, "grad_norm": 2.482161045074463, "learning_rate": 3.4378974141585416e-07, "loss": 0.5142, "mean_token_accuracy": 0.8366168737411499, "num_tokens": 30874160.0, "step": 812 }, { "epoch": 0.10342195649408473, "ewc_loss": 0.007842360064387321, "ewc_loss_diag": 5.5730342864990234e-06, "ewc_loss_parallel": 2.2576436094823293e-05, "grad_norm": 2.473433494567871, "learning_rate": 3.4421364985163206e-07, "loss": 0.4529, "mean_token_accuracy": 0.8531734943389893, "num_tokens": 30916827.0, "step": 813 }, { "epoch": 0.10354916677267523, "ewc_loss": 0.007851717993617058, "ewc_loss_diag": 5.5730342864990234e-06, "ewc_loss_parallel": 2.2670015823678114e-05, "grad_norm": 2.5529274940490723, "learning_rate": 3.446375582874099e-07, "loss": 0.5228, "mean_token_accuracy": 0.8346151113510132, "num_tokens": 30960063.0, "step": 814 }, { "epoch": 0.10367637705126574, "ewc_loss": 0.007910219952464104, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 2.2949852791498415e-05, "grad_norm": 2.6436426639556885, "learning_rate": 3.4506146672318775e-07, "loss": 0.4714, "mean_token_accuracy": 0.8465076088905334, "num_tokens": 30995172.0, "step": 815 }, { "epoch": 0.10380358732985626, "ewc_loss": 0.007926846854388714, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 2.311612479388714e-05, "grad_norm": 2.660780668258667, "learning_rate": 3.4548537515896565e-07, "loss": 0.5283, "mean_token_accuracy": 0.8313417434692383, "num_tokens": 31035089.0, "step": 816 }, { "epoch": 0.10393079760844676, "ewc_loss": 0.007922002114355564, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 2.30676796491025e-05, "grad_norm": 2.5936717987060547, "learning_rate": 3.4590928359474355e-07, "loss": 0.5135, "mean_token_accuracy": 0.8351047039031982, "num_tokens": 31074836.0, "step": 817 }, { "epoch": 0.10405800788703727, "ewc_loss": 0.007908555679023266, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 2.2933210857445374e-05, "grad_norm": 2.5632269382476807, "learning_rate": 3.463331920305214e-07, "loss": 0.5316, "mean_token_accuracy": 0.830707311630249, "num_tokens": 31114700.0, "step": 818 }, { "epoch": 0.10418521816562779, "ewc_loss": 0.007910392247140408, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 2.2951577193452977e-05, "grad_norm": 2.7247533798217773, "learning_rate": 3.4675710046629924e-07, "loss": 0.48, "mean_token_accuracy": 0.8450851440429688, "num_tokens": 31148258.0, "step": 819 }, { "epoch": 0.1043124284442183, "ewc_loss": 0.007953026331961155, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 2.3377922843792476e-05, "grad_norm": 2.69441556930542, "learning_rate": 3.4718100890207714e-07, "loss": 0.5183, "mean_token_accuracy": 0.8362867832183838, "num_tokens": 31185720.0, "step": 820 }, { "epoch": 0.1044396387228088, "ewc_loss": 0.0079403817653656, "ewc_loss_diag": 5.602836608886719e-06, "ewc_loss_parallel": 2.325147761439439e-05, "grad_norm": 2.6764755249023438, "learning_rate": 3.4760491733785504e-07, "loss": 0.5286, "mean_token_accuracy": 0.829131543636322, "num_tokens": 31220353.0, "step": 821 }, { "epoch": 0.10456684900139931, "ewc_loss": 0.007994645275175571, "ewc_loss_diag": 5.662441253662109e-06, "ewc_loss_parallel": 2.318375845788978e-05, "grad_norm": 2.6350345611572266, "learning_rate": 3.480288257736329e-07, "loss": 0.5064, "mean_token_accuracy": 0.8346699476242065, "num_tokens": 31256477.0, "step": 822 }, { "epoch": 0.10469405927998983, "ewc_loss": 0.007990222424268723, "ewc_loss_diag": 5.662441253662109e-06, "ewc_loss_parallel": 2.3139526092563756e-05, "grad_norm": 2.6476964950561523, "learning_rate": 3.4845273420941073e-07, "loss": 0.506, "mean_token_accuracy": 0.840211808681488, "num_tokens": 31294365.0, "step": 823 }, { "epoch": 0.10482126955858033, "ewc_loss": 0.008028696291148663, "ewc_loss_diag": 5.692243576049805e-06, "ewc_loss_parallel": 2.321908868907485e-05, "grad_norm": 2.7523341178894043, "learning_rate": 3.4887664264518863e-07, "loss": 0.5026, "mean_token_accuracy": 0.8362373113632202, "num_tokens": 31329357.0, "step": 824 }, { "epoch": 0.10494847983717084, "ewc_loss": 0.008089011535048485, "ewc_loss_diag": 5.7220458984375e-06, "ewc_loss_parallel": 2.351707371417433e-05, "grad_norm": 2.4979543685913086, "learning_rate": 3.4930055108096653e-07, "loss": 0.507, "mean_token_accuracy": 0.8384748697280884, "num_tokens": 31372180.0, "step": 825 }, { "epoch": 0.10507569011576136, "ewc_loss": 0.008025597780942917, "ewc_loss_diag": 5.7220458984375e-06, "ewc_loss_parallel": 2.288293580932077e-05, "grad_norm": 2.59142804145813, "learning_rate": 3.497244595167443e-07, "loss": 0.4894, "mean_token_accuracy": 0.8421218395233154, "num_tokens": 31414866.0, "step": 826 }, { "epoch": 0.10520290039435186, "ewc_loss": 0.008070090785622597, "ewc_loss_diag": 5.7220458984375e-06, "ewc_loss_parallel": 2.332785697944928e-05, "grad_norm": 2.784759759902954, "learning_rate": 3.501483679525222e-07, "loss": 0.5569, "mean_token_accuracy": 0.8223255276679993, "num_tokens": 31450583.0, "step": 827 }, { "epoch": 0.10533011067294237, "ewc_loss": 0.008115878328680992, "ewc_loss_diag": 5.7220458984375e-06, "ewc_loss_parallel": 2.3785734811099246e-05, "grad_norm": 2.604229211807251, "learning_rate": 3.505722763883001e-07, "loss": 0.4895, "mean_token_accuracy": 0.8414586782455444, "num_tokens": 31490371.0, "step": 828 }, { "epoch": 0.10545732095153289, "ewc_loss": 0.008060067892074585, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 2.322763430129271e-05, "grad_norm": 2.679100751876831, "learning_rate": 3.50996184824078e-07, "loss": 0.5299, "mean_token_accuracy": 0.829298198223114, "num_tokens": 31525951.0, "step": 829 }, { "epoch": 0.10558453123012339, "ewc_loss": 0.008089852519333363, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 2.352547562622931e-05, "grad_norm": 2.58978271484375, "learning_rate": 3.514200932598558e-07, "loss": 0.5703, "mean_token_accuracy": 0.8200325965881348, "num_tokens": 31570109.0, "step": 830 }, { "epoch": 0.1057117415087139, "ewc_loss": 0.008101352490484715, "ewc_loss_diag": 5.781650543212891e-06, "ewc_loss_parallel": 2.333530028408859e-05, "grad_norm": 2.6925876140594482, "learning_rate": 3.518440016956337e-07, "loss": 0.5136, "mean_token_accuracy": 0.8356155157089233, "num_tokens": 31610569.0, "step": 831 }, { "epoch": 0.10583895178730442, "ewc_loss": 0.008137189783155918, "ewc_loss_diag": 5.781650543212891e-06, "ewc_loss_parallel": 2.3693672119406983e-05, "grad_norm": 2.6199615001678467, "learning_rate": 3.522679101314116e-07, "loss": 0.5255, "mean_token_accuracy": 0.8351683616638184, "num_tokens": 31653585.0, "step": 832 }, { "epoch": 0.10596616206589493, "ewc_loss": 0.008083609864115715, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 2.3463047909899615e-05, "grad_norm": 2.606457233428955, "learning_rate": 3.526918185671895e-07, "loss": 0.5348, "mean_token_accuracy": 0.8300431370735168, "num_tokens": 31696353.0, "step": 833 }, { "epoch": 0.10609337234448543, "ewc_loss": 0.008116689510643482, "ewc_loss_diag": 5.781650543212891e-06, "ewc_loss_parallel": 2.348867019463796e-05, "grad_norm": 2.6442344188690186, "learning_rate": 3.531157270029673e-07, "loss": 0.4667, "mean_token_accuracy": 0.8506896495819092, "num_tokens": 31730620.0, "step": 834 }, { "epoch": 0.10622058262307595, "ewc_loss": 0.008137278258800507, "ewc_loss_diag": 5.781650543212891e-06, "ewc_loss_parallel": 2.3694565243204124e-05, "grad_norm": 2.904507637023926, "learning_rate": 3.535396354387452e-07, "loss": 0.4832, "mean_token_accuracy": 0.8439652919769287, "num_tokens": 31770538.0, "step": 835 }, { "epoch": 0.10634779290166646, "ewc_loss": 0.008184922859072685, "ewc_loss_diag": 5.781650543212891e-06, "ewc_loss_parallel": 2.4171009499696083e-05, "grad_norm": 2.7242696285247803, "learning_rate": 3.539635438745231e-07, "loss": 0.5245, "mean_token_accuracy": 0.8318747282028198, "num_tokens": 31808687.0, "step": 836 }, { "epoch": 0.10647500318025696, "ewc_loss": 0.008126618340611458, "ewc_loss_diag": 5.781650543212891e-06, "ewc_loss_parallel": 2.3587961550219916e-05, "grad_norm": 2.4751532077789307, "learning_rate": 3.54387452310301e-07, "loss": 0.5037, "mean_token_accuracy": 0.8390007019042969, "num_tokens": 31852310.0, "step": 837 }, { "epoch": 0.10660221345884748, "ewc_loss": 0.008049090392887592, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 2.3117858290788718e-05, "grad_norm": 2.562528610229492, "learning_rate": 3.548113607460788e-07, "loss": 0.472, "mean_token_accuracy": 0.8512476086616516, "num_tokens": 31887897.0, "step": 838 }, { "epoch": 0.10672942373743799, "ewc_loss": 0.008105052635073662, "ewc_loss_diag": 5.751848220825195e-06, "ewc_loss_parallel": 2.3677483113715425e-05, "grad_norm": 2.689006805419922, "learning_rate": 3.552352691818567e-07, "loss": 0.4973, "mean_token_accuracy": 0.8425222635269165, "num_tokens": 31926159.0, "step": 839 }, { "epoch": 0.10685663401602849, "ewc_loss": 0.008188681676983833, "ewc_loss_diag": 5.811452865600586e-06, "ewc_loss_parallel": 2.3903423425508663e-05, "grad_norm": 2.542809247970581, "learning_rate": 3.556591776176346e-07, "loss": 0.4729, "mean_token_accuracy": 0.8486871719360352, "num_tokens": 31968432.0, "step": 840 }, { "epoch": 0.106983844294619, "ewc_loss": 0.00815359316766262, "ewc_loss_diag": 5.811452865600586e-06, "ewc_loss_parallel": 2.3552529455628246e-05, "grad_norm": 2.813624382019043, "learning_rate": 3.560830860534125e-07, "loss": 0.5042, "mean_token_accuracy": 0.8415011167526245, "num_tokens": 32000949.0, "step": 841 }, { "epoch": 0.10711105457320952, "ewc_loss": 0.008230967447161674, "ewc_loss_diag": 5.811452865600586e-06, "ewc_loss_parallel": 2.4326276616193354e-05, "grad_norm": 2.760148048400879, "learning_rate": 3.565069944891903e-07, "loss": 0.5457, "mean_token_accuracy": 0.8255365490913391, "num_tokens": 32033862.0, "step": 842 }, { "epoch": 0.10723826485180002, "ewc_loss": 0.00824190117418766, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 2.41304405790288e-05, "grad_norm": 2.7111353874206543, "learning_rate": 3.569309029249682e-07, "loss": 0.5098, "mean_token_accuracy": 0.8385661244392395, "num_tokens": 32068623.0, "step": 843 }, { "epoch": 0.10736547513039053, "ewc_loss": 0.008223959244787693, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 2.395101728325244e-05, "grad_norm": 2.6525354385375977, "learning_rate": 3.573548113607461e-07, "loss": 0.497, "mean_token_accuracy": 0.8424380421638489, "num_tokens": 32106658.0, "step": 844 }, { "epoch": 0.10749268540898105, "ewc_loss": 0.008221441879868507, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 2.3925840650917962e-05, "grad_norm": 2.6322479248046875, "learning_rate": 3.577787197965239e-07, "loss": 0.537, "mean_token_accuracy": 0.8277496099472046, "num_tokens": 32142031.0, "step": 845 }, { "epoch": 0.10761989568757156, "ewc_loss": 0.008226429112255573, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 2.3975713702384382e-05, "grad_norm": 2.5214054584503174, "learning_rate": 3.5820262823230177e-07, "loss": 0.4723, "mean_token_accuracy": 0.8531352281570435, "num_tokens": 32182686.0, "step": 846 }, { "epoch": 0.10774710596616206, "ewc_loss": 0.008206777274608612, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 2.3779197363182902e-05, "grad_norm": 2.7395718097686768, "learning_rate": 3.5862653666807967e-07, "loss": 0.5652, "mean_token_accuracy": 0.8193662166595459, "num_tokens": 32218027.0, "step": 847 }, { "epoch": 0.10787431624475258, "ewc_loss": 0.008271549828350544, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 2.4426926756859757e-05, "grad_norm": 2.643683671951294, "learning_rate": 3.5905044510385757e-07, "loss": 0.5224, "mean_token_accuracy": 0.836445689201355, "num_tokens": 32257854.0, "step": 848 }, { "epoch": 0.10800152652334309, "ewc_loss": 0.00824480876326561, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 2.4159515305655077e-05, "grad_norm": 2.611510992050171, "learning_rate": 3.594743535396354e-07, "loss": 0.5049, "mean_token_accuracy": 0.839055061340332, "num_tokens": 32295019.0, "step": 849 }, { "epoch": 0.10812873680193359, "ewc_loss": 0.008237403817474842, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 2.4085464247036725e-05, "grad_norm": 2.6412136554718018, "learning_rate": 3.5989826197541326e-07, "loss": 0.5533, "mean_token_accuracy": 0.8238421678543091, "num_tokens": 32335652.0, "step": 850 }, { "epoch": 0.1082559470805241, "ewc_loss": 0.008255653083324432, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 2.4267956177936867e-05, "grad_norm": 2.5690348148345947, "learning_rate": 3.6032217041119116e-07, "loss": 0.502, "mean_token_accuracy": 0.8357293009757996, "num_tokens": 32377249.0, "step": 851 }, { "epoch": 0.10838315735911462, "ewc_loss": 0.008246492594480515, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 2.4176357328542508e-05, "grad_norm": 2.4986162185668945, "learning_rate": 3.6074607884696906e-07, "loss": 0.4726, "mean_token_accuracy": 0.8505368232727051, "num_tokens": 32422467.0, "step": 852 }, { "epoch": 0.10851036763770512, "ewc_loss": 0.008245520293712616, "ewc_loss_diag": 5.841255187988281e-06, "ewc_loss_parallel": 2.416662755422294e-05, "grad_norm": 2.544811248779297, "learning_rate": 3.611699872827469e-07, "loss": 0.4616, "mean_token_accuracy": 0.8496775031089783, "num_tokens": 32462120.0, "step": 853 }, { "epoch": 0.10863757791629564, "ewc_loss": 0.00829037744551897, "ewc_loss_diag": 5.8710575103759766e-06, "ewc_loss_parallel": 2.4310025764862075e-05, "grad_norm": 2.785691261291504, "learning_rate": 3.6159389571852475e-07, "loss": 0.5811, "mean_token_accuracy": 0.8174507021903992, "num_tokens": 32498484.0, "step": 854 }, { "epoch": 0.10876478819488615, "ewc_loss": 0.008358996361494064, "ewc_loss_diag": 5.8710575103759766e-06, "ewc_loss_parallel": 2.4996217689476907e-05, "grad_norm": 2.5828611850738525, "learning_rate": 3.6201780415430265e-07, "loss": 0.5199, "mean_token_accuracy": 0.8344151973724365, "num_tokens": 32541536.0, "step": 855 }, { "epoch": 0.10889199847347665, "ewc_loss": 0.008285528980195522, "ewc_loss_diag": 5.8710575103759766e-06, "ewc_loss_parallel": 2.4261538783321157e-05, "grad_norm": 2.6479251384735107, "learning_rate": 3.6244171259008055e-07, "loss": 0.5401, "mean_token_accuracy": 0.8319866061210632, "num_tokens": 32579750.0, "step": 856 }, { "epoch": 0.10901920875206716, "ewc_loss": 0.008376827463507652, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.4564176783314906e-05, "grad_norm": 2.6594173908233643, "learning_rate": 3.628656210258584e-07, "loss": 0.5497, "mean_token_accuracy": 0.8228476047515869, "num_tokens": 32617518.0, "step": 857 }, { "epoch": 0.10914641903065768, "ewc_loss": 0.00838412344455719, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.4637134629301727e-05, "grad_norm": 2.5954222679138184, "learning_rate": 3.6328952946163624e-07, "loss": 0.4913, "mean_token_accuracy": 0.8427399396896362, "num_tokens": 32653931.0, "step": 858 }, { "epoch": 0.1092736293092482, "ewc_loss": 0.008373094722628593, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.4526842025807127e-05, "grad_norm": 2.633709192276001, "learning_rate": 3.6371343789741414e-07, "loss": 0.4797, "mean_token_accuracy": 0.8464917540550232, "num_tokens": 32693797.0, "step": 859 }, { "epoch": 0.1094008395878387, "ewc_loss": 0.008382314816117287, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.4619052055641077e-05, "grad_norm": 2.6276445388793945, "learning_rate": 3.6413734633319204e-07, "loss": 0.526, "mean_token_accuracy": 0.8337068557739258, "num_tokens": 32732432.0, "step": 860 }, { "epoch": 0.10952804986642921, "ewc_loss": 0.008389837108552456, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.4694269086467102e-05, "grad_norm": 2.651967763900757, "learning_rate": 3.645612547689699e-07, "loss": 0.4801, "mean_token_accuracy": 0.8410699963569641, "num_tokens": 32769225.0, "step": 861 }, { "epoch": 0.10965526014501972, "ewc_loss": 0.008392427116632462, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.4720173314563e-05, "grad_norm": 2.5647032260894775, "learning_rate": 3.6498516320474773e-07, "loss": 0.4736, "mean_token_accuracy": 0.8486274480819702, "num_tokens": 32812027.0, "step": 862 }, { "epoch": 0.10978247042361022, "ewc_loss": 0.008375204168260098, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.454794230288826e-05, "grad_norm": 2.5929007530212402, "learning_rate": 3.6540907164052563e-07, "loss": 0.4529, "mean_token_accuracy": 0.8547348976135254, "num_tokens": 32849334.0, "step": 863 }, { "epoch": 0.10990968070220074, "ewc_loss": 0.00839146412909031, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.4710543584660627e-05, "grad_norm": 2.6628758907318115, "learning_rate": 3.658329800763035e-07, "loss": 0.5466, "mean_token_accuracy": 0.8246856927871704, "num_tokens": 32888422.0, "step": 864 }, { "epoch": 0.11003689098079125, "ewc_loss": 0.008414147421717644, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.4937367925303988e-05, "grad_norm": 3.0508432388305664, "learning_rate": 3.662568885120814e-07, "loss": 0.506, "mean_token_accuracy": 0.8378000855445862, "num_tokens": 32922672.0, "step": 865 }, { "epoch": 0.11016410125938175, "ewc_loss": 0.008503515273332596, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.58310537901707e-05, "grad_norm": 2.745973825454712, "learning_rate": 3.666807969478592e-07, "loss": 0.5369, "mean_token_accuracy": 0.8266507983207703, "num_tokens": 32961694.0, "step": 866 }, { "epoch": 0.11029131153797227, "ewc_loss": 0.008389657363295555, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.4692468286957592e-05, "grad_norm": 2.5864884853363037, "learning_rate": 3.671047053836371e-07, "loss": 0.5245, "mean_token_accuracy": 0.8298596143722534, "num_tokens": 32998595.0, "step": 867 }, { "epoch": 0.11041852181656278, "ewc_loss": 0.00836616288870573, "ewc_loss_diag": 5.930662155151367e-06, "ewc_loss_parallel": 2.4457529434585012e-05, "grad_norm": 2.5970065593719482, "learning_rate": 3.6752861381941497e-07, "loss": 0.4871, "mean_token_accuracy": 0.8426974415779114, "num_tokens": 33035884.0, "step": 868 }, { "epoch": 0.11054573209515328, "ewc_loss": 0.00843097548931837, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 2.4800476239761338e-05, "grad_norm": 2.694978713989258, "learning_rate": 3.6795252225519287e-07, "loss": 0.5465, "mean_token_accuracy": 0.8293219804763794, "num_tokens": 33073335.0, "step": 869 }, { "epoch": 0.1106729423737438, "ewc_loss": 0.008451486937701702, "ewc_loss_diag": 5.9604644775390625e-06, "ewc_loss_parallel": 2.5005589122883976e-05, "grad_norm": 2.5586910247802734, "learning_rate": 3.6837643069097077e-07, "loss": 0.4482, "mean_token_accuracy": 0.8543821573257446, "num_tokens": 33111443.0, "step": 870 }, { "epoch": 0.11080015265233431, "ewc_loss": 0.008447316475212574, "ewc_loss_diag": 5.990266799926758e-06, "ewc_loss_parallel": 2.465871330059599e-05, "grad_norm": 2.5672335624694824, "learning_rate": 3.688003391267486e-07, "loss": 0.5073, "mean_token_accuracy": 0.8410723209381104, "num_tokens": 33156362.0, "step": 871 }, { "epoch": 0.11092736293092482, "ewc_loss": 0.008465304970741272, "ewc_loss_diag": 5.990266799926758e-06, "ewc_loss_parallel": 2.4838594981702045e-05, "grad_norm": 2.505228042602539, "learning_rate": 3.6922424756252646e-07, "loss": 0.4767, "mean_token_accuracy": 0.8466544151306152, "num_tokens": 33204968.0, "step": 872 }, { "epoch": 0.11105457320951533, "ewc_loss": 0.008463406004011631, "ewc_loss_diag": 5.990266799926758e-06, "ewc_loss_parallel": 2.4819604732329026e-05, "grad_norm": 2.7318172454833984, "learning_rate": 3.6964815599830436e-07, "loss": 0.5222, "mean_token_accuracy": 0.8304930925369263, "num_tokens": 33239265.0, "step": 873 }, { "epoch": 0.11118178348810584, "ewc_loss": 0.008586475625634193, "ewc_loss_diag": 6.0498714447021484e-06, "ewc_loss_parallel": 2.5439956516493112e-05, "grad_norm": 2.721379280090332, "learning_rate": 3.7007206443408226e-07, "loss": 0.4576, "mean_token_accuracy": 0.8535588979721069, "num_tokens": 33275897.0, "step": 874 }, { "epoch": 0.11130899376669635, "ewc_loss": 0.00857173465192318, "ewc_loss_diag": 6.0498714447021484e-06, "ewc_loss_parallel": 2.529254743421916e-05, "grad_norm": 2.661797285079956, "learning_rate": 3.704959728698601e-07, "loss": 0.5774, "mean_token_accuracy": 0.8169320821762085, "num_tokens": 33313966.0, "step": 875 }, { "epoch": 0.11143620404528685, "ewc_loss": 0.008585823699831963, "ewc_loss_diag": 6.079673767089844e-06, "ewc_loss_parallel": 2.51282581302803e-05, "grad_norm": 2.675851821899414, "learning_rate": 3.7091988130563795e-07, "loss": 0.5254, "mean_token_accuracy": 0.8336167335510254, "num_tokens": 33352909.0, "step": 876 }, { "epoch": 0.11156341432387737, "ewc_loss": 0.00860285572707653, "ewc_loss_diag": 6.079673767089844e-06, "ewc_loss_parallel": 2.5298571927123703e-05, "grad_norm": 2.548577308654785, "learning_rate": 3.7134378974141585e-07, "loss": 0.5082, "mean_token_accuracy": 0.8397669792175293, "num_tokens": 33400120.0, "step": 877 }, { "epoch": 0.11169062460246788, "ewc_loss": 0.00856524333357811, "ewc_loss_diag": 6.079673767089844e-06, "ewc_loss_parallel": 2.4922448574216105e-05, "grad_norm": 2.6996278762817383, "learning_rate": 3.7176769817719375e-07, "loss": 0.5331, "mean_token_accuracy": 0.8313151597976685, "num_tokens": 33436703.0, "step": 878 }, { "epoch": 0.11181783488105838, "ewc_loss": 0.008658538572490215, "ewc_loss_diag": 6.109476089477539e-06, "ewc_loss_parallel": 2.5550230930093676e-05, "grad_norm": 2.6764936447143555, "learning_rate": 3.7219160661297154e-07, "loss": 0.4442, "mean_token_accuracy": 0.8581384420394897, "num_tokens": 33473515.0, "step": 879 }, { "epoch": 0.1119450451596489, "ewc_loss": 0.008670004084706306, "ewc_loss_diag": 6.139278411865234e-06, "ewc_loss_parallel": 2.535971361794509e-05, "grad_norm": 2.6880478858947754, "learning_rate": 3.7261551504874944e-07, "loss": 0.5047, "mean_token_accuracy": 0.8408285975456238, "num_tokens": 33507659.0, "step": 880 }, { "epoch": 0.11207225543823941, "ewc_loss": 0.008675990626215935, "ewc_loss_diag": 6.139278411865234e-06, "ewc_loss_parallel": 2.541956928325817e-05, "grad_norm": 2.629108190536499, "learning_rate": 3.7303942348452734e-07, "loss": 0.4516, "mean_token_accuracy": 0.852902889251709, "num_tokens": 33542430.0, "step": 881 }, { "epoch": 0.11219946571682991, "ewc_loss": 0.00866539403796196, "ewc_loss_diag": 6.139278411865234e-06, "ewc_loss_parallel": 2.5313607693533413e-05, "grad_norm": 2.565403938293457, "learning_rate": 3.7346333192030524e-07, "loss": 0.4607, "mean_token_accuracy": 0.8555833101272583, "num_tokens": 33582579.0, "step": 882 }, { "epoch": 0.11232667599542043, "ewc_loss": 0.00865989737212658, "ewc_loss_diag": 6.139278411865234e-06, "ewc_loss_parallel": 2.525863965274766e-05, "grad_norm": 2.537247657775879, "learning_rate": 3.7388724035608303e-07, "loss": 0.484, "mean_token_accuracy": 0.8482049107551575, "num_tokens": 33625928.0, "step": 883 }, { "epoch": 0.11245388627401094, "ewc_loss": 0.008669368922710419, "ewc_loss_diag": 6.139278411865234e-06, "ewc_loss_parallel": 2.5353361706947908e-05, "grad_norm": 2.7141952514648438, "learning_rate": 3.7431114879186093e-07, "loss": 0.5236, "mean_token_accuracy": 0.8339378833770752, "num_tokens": 33661325.0, "step": 884 }, { "epoch": 0.11258109655260146, "ewc_loss": 0.008716482669115067, "ewc_loss_diag": 6.139278411865234e-06, "ewc_loss_parallel": 2.5824499971349724e-05, "grad_norm": 2.6733126640319824, "learning_rate": 3.7473505722763883e-07, "loss": 0.545, "mean_token_accuracy": 0.826892077922821, "num_tokens": 33696472.0, "step": 885 }, { "epoch": 0.11270830683119196, "ewc_loss": 0.0087295426055789, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.5649915187386796e-05, "grad_norm": 2.6726973056793213, "learning_rate": 3.7515896566341673e-07, "loss": 0.499, "mean_token_accuracy": 0.8378880023956299, "num_tokens": 33732637.0, "step": 886 }, { "epoch": 0.11283551710978247, "ewc_loss": 0.00873008742928505, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.5655363060650416e-05, "grad_norm": 2.7337982654571533, "learning_rate": 3.755828740991945e-07, "loss": 0.5238, "mean_token_accuracy": 0.8330315351486206, "num_tokens": 33775106.0, "step": 887 }, { "epoch": 0.11296272738837299, "ewc_loss": 0.008747330866754055, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.582780143711716e-05, "grad_norm": 2.6724796295166016, "learning_rate": 3.760067825349724e-07, "loss": 0.4861, "mean_token_accuracy": 0.8454728126525879, "num_tokens": 33811297.0, "step": 888 }, { "epoch": 0.11308993766696349, "ewc_loss": 0.008734544739127159, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.5699941033963114e-05, "grad_norm": 2.5527966022491455, "learning_rate": 3.764306909707503e-07, "loss": 0.4494, "mean_token_accuracy": 0.854580283164978, "num_tokens": 33854038.0, "step": 889 }, { "epoch": 0.113217147945554, "ewc_loss": 0.008708933368325233, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.5443820049986243e-05, "grad_norm": 2.799612522125244, "learning_rate": 3.768545994065282e-07, "loss": 0.5052, "mean_token_accuracy": 0.8370116949081421, "num_tokens": 33884929.0, "step": 890 }, { "epoch": 0.11334435822414451, "ewc_loss": 0.008785076439380646, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.6205258109257556e-05, "grad_norm": 2.6644034385681152, "learning_rate": 3.77278507842306e-07, "loss": 0.5139, "mean_token_accuracy": 0.8373295068740845, "num_tokens": 33926065.0, "step": 891 }, { "epoch": 0.11347156850273502, "ewc_loss": 0.008740143850445747, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.575592588982545e-05, "grad_norm": 2.615631341934204, "learning_rate": 3.777024162780839e-07, "loss": 0.4664, "mean_token_accuracy": 0.8476994037628174, "num_tokens": 33965557.0, "step": 892 }, { "epoch": 0.11359877878132553, "ewc_loss": 0.008731726557016373, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.5671753974165767e-05, "grad_norm": 2.649386405944824, "learning_rate": 3.781263247138618e-07, "loss": 0.4732, "mean_token_accuracy": 0.847329318523407, "num_tokens": 34004132.0, "step": 893 }, { "epoch": 0.11372598905991604, "ewc_loss": 0.008755641989409924, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.5910912881954573e-05, "grad_norm": 2.709527015686035, "learning_rate": 3.785502331496397e-07, "loss": 0.5006, "mean_token_accuracy": 0.8335323333740234, "num_tokens": 34039781.0, "step": 894 }, { "epoch": 0.11385319933850654, "ewc_loss": 0.008759398013353348, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.594846773718018e-05, "grad_norm": 2.659273624420166, "learning_rate": 3.789741415854175e-07, "loss": 0.5246, "mean_token_accuracy": 0.8314871788024902, "num_tokens": 34080487.0, "step": 895 }, { "epoch": 0.11398040961709706, "ewc_loss": 0.008752420544624329, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.5878698579617776e-05, "grad_norm": 2.6394569873809814, "learning_rate": 3.793980500211954e-07, "loss": 0.5035, "mean_token_accuracy": 0.8379460573196411, "num_tokens": 34119913.0, "step": 896 }, { "epoch": 0.11410761989568757, "ewc_loss": 0.008757096715271473, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.592545934021473e-05, "grad_norm": 2.6151278018951416, "learning_rate": 3.798219584569733e-07, "loss": 0.4375, "mean_token_accuracy": 0.8583243489265442, "num_tokens": 34155811.0, "step": 897 }, { "epoch": 0.11423483017427809, "ewc_loss": 0.00875354278832674, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.588991992524825e-05, "grad_norm": 2.7573201656341553, "learning_rate": 3.8024586689275115e-07, "loss": 0.5586, "mean_token_accuracy": 0.8264648914337158, "num_tokens": 34195022.0, "step": 898 }, { "epoch": 0.11436204045286859, "ewc_loss": 0.008797590620815754, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.6330400942242704e-05, "grad_norm": 2.674506425857544, "learning_rate": 3.80669775328529e-07, "loss": 0.4911, "mean_token_accuracy": 0.8398380875587463, "num_tokens": 34229182.0, "step": 899 }, { "epoch": 0.1144892507314591, "ewc_loss": 0.008772672154009342, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.6081215764861554e-05, "grad_norm": 2.6282477378845215, "learning_rate": 3.810936837643069e-07, "loss": 0.4557, "mean_token_accuracy": 0.8521338105201721, "num_tokens": 34266931.0, "step": 900 }, { "epoch": 0.11461646101004962, "ewc_loss": 0.008772650733590126, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.608100294310134e-05, "grad_norm": 2.6669654846191406, "learning_rate": 3.815175922000848e-07, "loss": 0.4459, "mean_token_accuracy": 0.8539533615112305, "num_tokens": 34301705.0, "step": 901 }, { "epoch": 0.11474367128864012, "ewc_loss": 0.008790279738605022, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.6257290301145986e-05, "grad_norm": 2.6107640266418457, "learning_rate": 3.8194150063586264e-07, "loss": 0.5276, "mean_token_accuracy": 0.835059404373169, "num_tokens": 34340975.0, "step": 902 }, { "epoch": 0.11487088156723063, "ewc_loss": 0.008784307166934013, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.6197558327112347e-05, "grad_norm": 2.6576356887817383, "learning_rate": 3.823654090716405e-07, "loss": 0.4969, "mean_token_accuracy": 0.8382624983787537, "num_tokens": 34380024.0, "step": 903 }, { "epoch": 0.11499809184582115, "ewc_loss": 0.008798649534583092, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.6340983822592534e-05, "grad_norm": 2.7265806198120117, "learning_rate": 3.827893175074184e-07, "loss": 0.519, "mean_token_accuracy": 0.840974748134613, "num_tokens": 34419654.0, "step": 904 }, { "epoch": 0.11512530212441165, "ewc_loss": 0.008821956813335419, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.6574060029815882e-05, "grad_norm": 2.7171742916107178, "learning_rate": 3.832132259431963e-07, "loss": 0.4496, "mean_token_accuracy": 0.8567836880683899, "num_tokens": 34453034.0, "step": 905 }, { "epoch": 0.11525251240300216, "ewc_loss": 0.008812254294753075, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.6477040591998957e-05, "grad_norm": 2.6643049716949463, "learning_rate": 3.8363713437897413e-07, "loss": 0.553, "mean_token_accuracy": 0.8228039741516113, "num_tokens": 34493694.0, "step": 906 }, { "epoch": 0.11537972268159268, "ewc_loss": 0.008801054209470749, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.636503086250741e-05, "grad_norm": 2.6971206665039062, "learning_rate": 3.8406104281475197e-07, "loss": 0.5446, "mean_token_accuracy": 0.8281097412109375, "num_tokens": 34531636.0, "step": 907 }, { "epoch": 0.11550693296018319, "ewc_loss": 0.008818958885967731, "ewc_loss_diag": 6.16908073425293e-06, "ewc_loss_parallel": 2.6544077627477236e-05, "grad_norm": 2.69187068939209, "learning_rate": 3.8448495125052987e-07, "loss": 0.5298, "mean_token_accuracy": 0.8285337686538696, "num_tokens": 34570851.0, "step": 908 }, { "epoch": 0.11563414323877369, "ewc_loss": 0.008852523751556873, "ewc_loss_diag": 6.198883056640625e-06, "ewc_loss_parallel": 2.657455661392305e-05, "grad_norm": 2.629732608795166, "learning_rate": 3.8490885968630777e-07, "loss": 0.4823, "mean_token_accuracy": 0.8462541699409485, "num_tokens": 34613314.0, "step": 909 }, { "epoch": 0.1157613535173642, "ewc_loss": 0.008841287344694138, "ewc_loss_diag": 6.198883056640625e-06, "ewc_loss_parallel": 2.6462184905540198e-05, "grad_norm": 2.5755462646484375, "learning_rate": 3.853327681220856e-07, "loss": 0.4983, "mean_token_accuracy": 0.8389682173728943, "num_tokens": 34656701.0, "step": 910 }, { "epoch": 0.11588856379595472, "ewc_loss": 0.008840315043926239, "ewc_loss_diag": 6.198883056640625e-06, "ewc_loss_parallel": 2.6452467864146456e-05, "grad_norm": 2.807924270629883, "learning_rate": 3.8575667655786346e-07, "loss": 0.4727, "mean_token_accuracy": 0.8492637872695923, "num_tokens": 34688595.0, "step": 911 }, { "epoch": 0.11601577407454522, "ewc_loss": 0.008911270648241043, "ewc_loss_diag": 6.198883056640625e-06, "ewc_loss_parallel": 2.716201743169222e-05, "grad_norm": 2.654977560043335, "learning_rate": 3.8618058499364136e-07, "loss": 0.4916, "mean_token_accuracy": 0.8438462615013123, "num_tokens": 34726609.0, "step": 912 }, { "epoch": 0.11614298435313573, "ewc_loss": 0.008857334963977337, "ewc_loss_diag": 6.198883056640625e-06, "ewc_loss_parallel": 2.662266888364684e-05, "grad_norm": 2.6530680656433105, "learning_rate": 3.8660449342941926e-07, "loss": 0.5466, "mean_token_accuracy": 0.829376757144928, "num_tokens": 34769213.0, "step": 913 }, { "epoch": 0.11627019463172625, "ewc_loss": 0.00886385329067707, "ewc_loss_diag": 6.198883056640625e-06, "ewc_loss_parallel": 2.6687852368922904e-05, "grad_norm": 2.689899444580078, "learning_rate": 3.870284018651971e-07, "loss": 0.5209, "mean_token_accuracy": 0.834631085395813, "num_tokens": 34806722.0, "step": 914 }, { "epoch": 0.11639740491031675, "ewc_loss": 0.00894465483725071, "ewc_loss_diag": 6.258487701416016e-06, "ewc_loss_parallel": 2.6885509214480408e-05, "grad_norm": 2.662112236022949, "learning_rate": 3.8745231030097495e-07, "loss": 0.4766, "mean_token_accuracy": 0.849381685256958, "num_tokens": 34846408.0, "step": 915 }, { "epoch": 0.11652461518890726, "ewc_loss": 0.008901109918951988, "ewc_loss_diag": 6.22868537902832e-06, "ewc_loss_parallel": 2.6755240469356067e-05, "grad_norm": 2.699350357055664, "learning_rate": 3.8787621873675285e-07, "loss": 0.4649, "mean_token_accuracy": 0.849439263343811, "num_tokens": 34883791.0, "step": 916 }, { "epoch": 0.11665182546749778, "ewc_loss": 0.008942672051489353, "ewc_loss_diag": 6.258487701416016e-06, "ewc_loss_parallel": 2.6865682229981758e-05, "grad_norm": 2.6197476387023926, "learning_rate": 3.883001271725307e-07, "loss": 0.4603, "mean_token_accuracy": 0.8519896864891052, "num_tokens": 34922768.0, "step": 917 }, { "epoch": 0.11677903574608828, "ewc_loss": 0.008929586037993431, "ewc_loss_diag": 6.258487701416016e-06, "ewc_loss_parallel": 2.6734822313301265e-05, "grad_norm": 2.611485242843628, "learning_rate": 3.887240356083086e-07, "loss": 0.4401, "mean_token_accuracy": 0.8569316864013672, "num_tokens": 34962184.0, "step": 918 }, { "epoch": 0.11690624602467879, "ewc_loss": 0.008945386856794357, "ewc_loss_diag": 6.258487701416016e-06, "ewc_loss_parallel": 2.6892837922787294e-05, "grad_norm": 2.885234832763672, "learning_rate": 3.8914794404408644e-07, "loss": 0.4838, "mean_token_accuracy": 0.8438383936882019, "num_tokens": 34991455.0, "step": 919 }, { "epoch": 0.1170334563032693, "ewc_loss": 0.009028137661516666, "ewc_loss_diag": 6.258487701416016e-06, "ewc_loss_parallel": 2.772033985820599e-05, "grad_norm": 2.8192594051361084, "learning_rate": 3.8957185247986434e-07, "loss": 0.5138, "mean_token_accuracy": 0.8352036476135254, "num_tokens": 35025044.0, "step": 920 }, { "epoch": 0.11716066658185982, "ewc_loss": 0.00898030400276184, "ewc_loss_diag": 6.258487701416016e-06, "ewc_loss_parallel": 2.7242009309702553e-05, "grad_norm": 2.667686939239502, "learning_rate": 3.899957609156422e-07, "loss": 0.4863, "mean_token_accuracy": 0.8430836200714111, "num_tokens": 35064564.0, "step": 921 }, { "epoch": 0.11728787686045032, "ewc_loss": 0.008937426842749119, "ewc_loss_diag": 6.258487701416016e-06, "ewc_loss_parallel": 2.681323530850932e-05, "grad_norm": 2.637101173400879, "learning_rate": 3.904196693514201e-07, "loss": 0.5034, "mean_token_accuracy": 0.8395875692367554, "num_tokens": 35109786.0, "step": 922 }, { "epoch": 0.11741508713904084, "ewc_loss": 0.008959568105638027, "ewc_loss_diag": 6.258487701416016e-06, "ewc_loss_parallel": 2.703464633668773e-05, "grad_norm": 2.7620091438293457, "learning_rate": 3.9084357778719793e-07, "loss": 0.5665, "mean_token_accuracy": 0.8203613758087158, "num_tokens": 35148351.0, "step": 923 }, { "epoch": 0.11754229741763135, "ewc_loss": 0.00900045596063137, "ewc_loss_diag": 6.258487701416016e-06, "ewc_loss_parallel": 2.7443524231784977e-05, "grad_norm": 2.7616045475006104, "learning_rate": 3.9126748622297583e-07, "loss": 0.5295, "mean_token_accuracy": 0.8321439623832703, "num_tokens": 35183464.0, "step": 924 }, { "epoch": 0.11766950769622185, "ewc_loss": 0.008989688009023666, "ewc_loss_diag": 6.258487701416016e-06, "ewc_loss_parallel": 2.7335841878084466e-05, "grad_norm": 2.7142326831817627, "learning_rate": 3.916913946587537e-07, "loss": 0.5127, "mean_token_accuracy": 0.8337528705596924, "num_tokens": 35218561.0, "step": 925 }, { "epoch": 0.11779671797481236, "ewc_loss": 0.009005200117826462, "ewc_loss_diag": 6.288290023803711e-06, "ewc_loss_parallel": 2.718578980420716e-05, "grad_norm": 2.6845359802246094, "learning_rate": 3.921153030945316e-07, "loss": 0.5274, "mean_token_accuracy": 0.8308120965957642, "num_tokens": 35257740.0, "step": 926 }, { "epoch": 0.11792392825340288, "ewc_loss": 0.009047705680131912, "ewc_loss_diag": 6.318092346191406e-06, "ewc_loss_parallel": 2.730566484387964e-05, "grad_norm": 2.8314037322998047, "learning_rate": 3.925392115303094e-07, "loss": 0.4856, "mean_token_accuracy": 0.8451127409934998, "num_tokens": 35291632.0, "step": 927 }, { "epoch": 0.11805113853199338, "ewc_loss": 0.009085709229111671, "ewc_loss_diag": 6.318092346191406e-06, "ewc_loss_parallel": 2.7685700842994265e-05, "grad_norm": 2.751159191131592, "learning_rate": 3.929631199660873e-07, "loss": 0.4884, "mean_token_accuracy": 0.8399805426597595, "num_tokens": 35327686.0, "step": 928 }, { "epoch": 0.1181783488105839, "ewc_loss": 0.009053755551576614, "ewc_loss_diag": 6.318092346191406e-06, "ewc_loss_parallel": 2.736616625043098e-05, "grad_norm": 2.672165632247925, "learning_rate": 3.9338702840186517e-07, "loss": 0.4893, "mean_token_accuracy": 0.8410525918006897, "num_tokens": 35364480.0, "step": 929 }, { "epoch": 0.11830555908917441, "ewc_loss": 0.009042078629136086, "ewc_loss_diag": 6.318092346191406e-06, "ewc_loss_parallel": 2.724939440668095e-05, "grad_norm": 2.613797187805176, "learning_rate": 3.9381093683764307e-07, "loss": 0.4951, "mean_token_accuracy": 0.840817391872406, "num_tokens": 35406017.0, "step": 930 }, { "epoch": 0.11843276936776491, "ewc_loss": 0.009073883295059204, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 2.726227103266865e-05, "grad_norm": 2.6599044799804688, "learning_rate": 3.942348452734209e-07, "loss": 0.5532, "mean_token_accuracy": 0.8223599195480347, "num_tokens": 35448447.0, "step": 931 }, { "epoch": 0.11855997964635542, "ewc_loss": 0.009119348600506783, "ewc_loss_diag": 6.377696990966797e-06, "ewc_loss_parallel": 2.741174648690503e-05, "grad_norm": 2.8219797611236572, "learning_rate": 3.946587537091988e-07, "loss": 0.5025, "mean_token_accuracy": 0.8372882604598999, "num_tokens": 35488120.0, "step": 932 }, { "epoch": 0.11868718992494594, "ewc_loss": 0.009098397567868233, "ewc_loss_diag": 6.318092346191406e-06, "ewc_loss_parallel": 2.7812588086817414e-05, "grad_norm": 2.6248226165771484, "learning_rate": 3.9508266214497666e-07, "loss": 0.4785, "mean_token_accuracy": 0.8458737134933472, "num_tokens": 35531759.0, "step": 933 }, { "epoch": 0.11881440020353645, "ewc_loss": 0.009063360281288624, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 2.7157042495673522e-05, "grad_norm": 2.637586832046509, "learning_rate": 3.9550657058075456e-07, "loss": 0.4622, "mean_token_accuracy": 0.8529610633850098, "num_tokens": 35571476.0, "step": 934 }, { "epoch": 0.11894161048212695, "ewc_loss": 0.009079317562282085, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 2.7316609703120776e-05, "grad_norm": 2.7665205001831055, "learning_rate": 3.959304790165324e-07, "loss": 0.5022, "mean_token_accuracy": 0.8401128053665161, "num_tokens": 35608440.0, "step": 935 }, { "epoch": 0.11906882076071747, "ewc_loss": 0.009156412445008755, "ewc_loss_diag": 6.377696990966797e-06, "ewc_loss_parallel": 2.7782385586760938e-05, "grad_norm": 2.803290367126465, "learning_rate": 3.9635438745231025e-07, "loss": 0.4974, "mean_token_accuracy": 0.8416202664375305, "num_tokens": 35641861.0, "step": 936 }, { "epoch": 0.11919603103930798, "ewc_loss": 0.009148428216576576, "ewc_loss_diag": 6.377696990966797e-06, "ewc_loss_parallel": 2.7702541046892293e-05, "grad_norm": 2.684018850326538, "learning_rate": 3.9677829588808815e-07, "loss": 0.5179, "mean_token_accuracy": 0.8329676389694214, "num_tokens": 35684626.0, "step": 937 }, { "epoch": 0.11932324131789848, "ewc_loss": 0.009117970243096352, "ewc_loss_diag": 6.377696990966797e-06, "ewc_loss_parallel": 2.7397965823183767e-05, "grad_norm": 2.7579023838043213, "learning_rate": 3.9720220432386605e-07, "loss": 0.5255, "mean_token_accuracy": 0.8308215141296387, "num_tokens": 35718125.0, "step": 938 }, { "epoch": 0.119450451596489, "ewc_loss": 0.009157368913292885, "ewc_loss_diag": 6.377696990966797e-06, "ewc_loss_parallel": 2.7791951652034186e-05, "grad_norm": 2.7163162231445312, "learning_rate": 3.976261127596439e-07, "loss": 0.4687, "mean_token_accuracy": 0.8509811162948608, "num_tokens": 35752818.0, "step": 939 }, { "epoch": 0.11957766187507951, "ewc_loss": 0.00917266495525837, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 2.7639733161777258e-05, "grad_norm": 7.796298503875732, "learning_rate": 3.9805002119542174e-07, "loss": 0.4498, "mean_token_accuracy": 0.8553124070167542, "num_tokens": 35787097.0, "step": 940 }, { "epoch": 0.11970487215367001, "ewc_loss": 0.010008007287979126, "ewc_loss_diag": 6.377696990966797e-06, "ewc_loss_parallel": 3.629833736340515e-05, "grad_norm": 3.2231638431549072, "learning_rate": 3.9847392963119964e-07, "loss": 0.546, "mean_token_accuracy": 0.8294215202331543, "num_tokens": 35825887.0, "step": 941 }, { "epoch": 0.11983208243226053, "ewc_loss": 0.009357871487736702, "ewc_loss_diag": 6.377696990966797e-06, "ewc_loss_parallel": 2.9796974558848888e-05, "grad_norm": 2.5942041873931885, "learning_rate": 3.9889783806697754e-07, "loss": 0.4761, "mean_token_accuracy": 0.8491823673248291, "num_tokens": 35863592.0, "step": 942 }, { "epoch": 0.11995929271085104, "ewc_loss": 0.00904373824596405, "ewc_loss_diag": 6.3478946685791016e-06, "ewc_loss_parallel": 2.696081719477661e-05, "grad_norm": 2.9620587825775146, "learning_rate": 3.993217465027554e-07, "loss": 0.5497, "mean_token_accuracy": 0.8276445865631104, "num_tokens": 35904483.0, "step": 943 }, { "epoch": 0.12008650298944154, "ewc_loss": 0.009341170080006123, "ewc_loss_diag": 6.377696990966797e-06, "ewc_loss_parallel": 2.962996586575173e-05, "grad_norm": 2.835679769515991, "learning_rate": 3.9974565493853323e-07, "loss": 0.4379, "mean_token_accuracy": 0.8590672016143799, "num_tokens": 35938662.0, "step": 944 }, { "epoch": 0.12021371326803205, "ewc_loss": 0.009259159676730633, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 2.8504680813057348e-05, "grad_norm": 2.7656309604644775, "learning_rate": 4.0016956337431113e-07, "loss": 0.5012, "mean_token_accuracy": 0.8389582633972168, "num_tokens": 35975176.0, "step": 945 }, { "epoch": 0.12034092354662257, "ewc_loss": 0.009202173911035061, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 2.793482235574629e-05, "grad_norm": 2.8585853576660156, "learning_rate": 4.0059347181008903e-07, "loss": 0.4882, "mean_token_accuracy": 0.8426868915557861, "num_tokens": 36008893.0, "step": 946 }, { "epoch": 0.12046813382521308, "ewc_loss": 0.00924115814268589, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 2.8324669983703643e-05, "grad_norm": 2.616022825241089, "learning_rate": 4.010173802458669e-07, "loss": 0.4468, "mean_token_accuracy": 0.8577855229377747, "num_tokens": 36049882.0, "step": 947 }, { "epoch": 0.12059534410380358, "ewc_loss": 0.009160308167338371, "ewc_loss_diag": 6.407499313354492e-06, "ewc_loss_parallel": 2.7516169211594388e-05, "grad_norm": 2.737576484680176, "learning_rate": 4.014412886816447e-07, "loss": 0.5588, "mean_token_accuracy": 0.8248805999755859, "num_tokens": 36090615.0, "step": 948 }, { "epoch": 0.1207225543823941, "ewc_loss": 0.009247335605323315, "ewc_loss_diag": 6.4373016357421875e-06, "ewc_loss_parallel": 2.8081267373636365e-05, "grad_norm": 2.7415735721588135, "learning_rate": 4.018651971174226e-07, "loss": 0.4904, "mean_token_accuracy": 0.8416094183921814, "num_tokens": 36128159.0, "step": 949 }, { "epoch": 0.12084976466098461, "ewc_loss": 0.009240146726369858, "ewc_loss_diag": 6.4373016357421875e-06, "ewc_loss_parallel": 2.8009382731397636e-05, "grad_norm": 2.6735050678253174, "learning_rate": 4.022891055532005e-07, "loss": 0.5048, "mean_token_accuracy": 0.8384681940078735, "num_tokens": 36172851.0, "step": 950 }, { "epoch": 0.12097697493957511, "ewc_loss": 0.009214493446052074, "ewc_loss_diag": 6.4373016357421875e-06, "ewc_loss_parallel": 2.775284337985795e-05, "grad_norm": 2.7309019565582275, "learning_rate": 4.0271301398897837e-07, "loss": 0.5543, "mean_token_accuracy": 0.826984167098999, "num_tokens": 36214282.0, "step": 951 }, { "epoch": 0.12110418521816563, "ewc_loss": 0.009244684129953384, "ewc_loss_diag": 6.4373016357421875e-06, "ewc_loss_parallel": 2.8054746508132666e-05, "grad_norm": 2.8100099563598633, "learning_rate": 4.031369224247562e-07, "loss": 0.476, "mean_token_accuracy": 0.8422320485115051, "num_tokens": 36248619.0, "step": 952 }, { "epoch": 0.12123139549675614, "ewc_loss": 0.009296538308262825, "ewc_loss_diag": 6.467103958129883e-06, "ewc_loss_parallel": 2.8268117603147402e-05, "grad_norm": 2.675133466720581, "learning_rate": 4.035608308605341e-07, "loss": 0.5113, "mean_token_accuracy": 0.8342809081077576, "num_tokens": 36289070.0, "step": 953 }, { "epoch": 0.12135860577534664, "ewc_loss": 0.009279755875468254, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 2.7795122150564566e-05, "grad_norm": 2.6997416019439697, "learning_rate": 4.03984739296312e-07, "loss": 0.5118, "mean_token_accuracy": 0.8382750749588013, "num_tokens": 36330883.0, "step": 954 }, { "epoch": 0.12148581605393716, "ewc_loss": 0.009312102571129799, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 2.8118578484281898e-05, "grad_norm": 2.6850011348724365, "learning_rate": 4.044086477320898e-07, "loss": 0.4929, "mean_token_accuracy": 0.8433763980865479, "num_tokens": 36371507.0, "step": 955 }, { "epoch": 0.12161302633252767, "ewc_loss": 0.009307527914643288, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 2.8072838176740333e-05, "grad_norm": 2.803579092025757, "learning_rate": 4.048325561678677e-07, "loss": 0.5146, "mean_token_accuracy": 0.8354583382606506, "num_tokens": 36407262.0, "step": 956 }, { "epoch": 0.12174023661111817, "ewc_loss": 0.009345057420432568, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 2.8448133889469318e-05, "grad_norm": 2.7070631980895996, "learning_rate": 4.052564646036456e-07, "loss": 0.5121, "mean_token_accuracy": 0.8365318775177002, "num_tokens": 36448089.0, "step": 957 }, { "epoch": 0.12186744688970869, "ewc_loss": 0.009313476271927357, "ewc_loss_diag": 6.496906280517578e-06, "ewc_loss_parallel": 2.813232276821509e-05, "grad_norm": 2.779404401779175, "learning_rate": 4.056803730394235e-07, "loss": 0.5236, "mean_token_accuracy": 0.8308674693107605, "num_tokens": 36484022.0, "step": 958 }, { "epoch": 0.1219946571682992, "ewc_loss": 0.009400466457009315, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 2.839187072822824e-05, "grad_norm": 2.730410575866699, "learning_rate": 4.061042814752013e-07, "loss": 0.5005, "mean_token_accuracy": 0.8415969610214233, "num_tokens": 36519385.0, "step": 959 }, { "epoch": 0.12212186744688971, "ewc_loss": 0.009393103420734406, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 2.8318241675151512e-05, "grad_norm": 2.8853189945220947, "learning_rate": 4.065281899109792e-07, "loss": 0.5043, "mean_token_accuracy": 0.8376374244689941, "num_tokens": 36551943.0, "step": 960 }, { "epoch": 0.12224907772548022, "ewc_loss": 0.009437236934900284, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 2.8759572160197422e-05, "grad_norm": 2.658064603805542, "learning_rate": 4.069520983467571e-07, "loss": 0.4519, "mean_token_accuracy": 0.8551056385040283, "num_tokens": 36590927.0, "step": 961 }, { "epoch": 0.12237628800407073, "ewc_loss": 0.009365295991301537, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 2.8040172765031457e-05, "grad_norm": 2.7573273181915283, "learning_rate": 4.07376006782535e-07, "loss": 0.4822, "mean_token_accuracy": 0.843437910079956, "num_tokens": 36625199.0, "step": 962 }, { "epoch": 0.12250349828266124, "ewc_loss": 0.009419641457498074, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 2.8583619496203028e-05, "grad_norm": 2.792631149291992, "learning_rate": 4.077999152183128e-07, "loss": 0.4451, "mean_token_accuracy": 0.855415940284729, "num_tokens": 36658981.0, "step": 963 }, { "epoch": 0.12263070856125174, "ewc_loss": 0.009432494640350342, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 2.871215110644698e-05, "grad_norm": 2.6199381351470947, "learning_rate": 4.082238236540907e-07, "loss": 0.5066, "mean_token_accuracy": 0.8414355516433716, "num_tokens": 36704214.0, "step": 964 }, { "epoch": 0.12275791883984226, "ewc_loss": 0.009375560097396374, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 2.8142809242126532e-05, "grad_norm": 2.7096774578094482, "learning_rate": 4.086477320898686e-07, "loss": 0.4752, "mean_token_accuracy": 0.8458912372589111, "num_tokens": 36744782.0, "step": 965 }, { "epoch": 0.12288512911843277, "ewc_loss": 0.009428729303181171, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 2.867450166377239e-05, "grad_norm": 2.733293056488037, "learning_rate": 4.090716405256465e-07, "loss": 0.5094, "mean_token_accuracy": 0.8353418707847595, "num_tokens": 36783645.0, "step": 966 }, { "epoch": 0.12301233939702327, "ewc_loss": 0.009431058540940285, "ewc_loss_diag": 6.556510925292969e-06, "ewc_loss_parallel": 2.8697793823084794e-05, "grad_norm": 2.8489229679107666, "learning_rate": 4.094955489614243e-07, "loss": 0.4779, "mean_token_accuracy": 0.8452758193016052, "num_tokens": 36817539.0, "step": 967 }, { "epoch": 0.12313954967561379, "ewc_loss": 0.009488752111792564, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 2.8969547201995738e-05, "grad_norm": 2.7203009128570557, "learning_rate": 4.099194573972022e-07, "loss": 0.4806, "mean_token_accuracy": 0.8444734811782837, "num_tokens": 36859144.0, "step": 968 }, { "epoch": 0.1232667599542043, "ewc_loss": 0.009447245858609676, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 2.8554490199894644e-05, "grad_norm": 2.7011404037475586, "learning_rate": 4.1034336583298007e-07, "loss": 0.5235, "mean_token_accuracy": 0.8335199356079102, "num_tokens": 36900833.0, "step": 969 }, { "epoch": 0.1233939702327948, "ewc_loss": 0.009463980793952942, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 2.872183540603146e-05, "grad_norm": 2.705080270767212, "learning_rate": 4.1076727426875797e-07, "loss": 0.5117, "mean_token_accuracy": 0.8356146812438965, "num_tokens": 36940935.0, "step": 970 }, { "epoch": 0.12352118051138532, "ewc_loss": 0.009473083540797234, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 2.881286491174251e-05, "grad_norm": 2.777891159057617, "learning_rate": 4.1119118270453577e-07, "loss": 0.5732, "mean_token_accuracy": 0.8159596920013428, "num_tokens": 36979874.0, "step": 971 }, { "epoch": 0.12364839078997583, "ewc_loss": 0.009488224983215332, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 2.8964275770704262e-05, "grad_norm": 2.700631618499756, "learning_rate": 4.1161509114031366e-07, "loss": 0.4557, "mean_token_accuracy": 0.8551607131958008, "num_tokens": 37017917.0, "step": 972 }, { "epoch": 0.12377560106856635, "ewc_loss": 0.009496812708675861, "ewc_loss_diag": 6.616115570068359e-06, "ewc_loss_parallel": 2.8744980227202177e-05, "grad_norm": 2.7321786880493164, "learning_rate": 4.1203899957609156e-07, "loss": 0.5204, "mean_token_accuracy": 0.8345097303390503, "num_tokens": 37054914.0, "step": 973 }, { "epoch": 0.12390281134715685, "ewc_loss": 0.009520826861262321, "ewc_loss_diag": 6.616115570068359e-06, "ewc_loss_parallel": 2.8985126846237108e-05, "grad_norm": 2.8295860290527344, "learning_rate": 4.124629080118694e-07, "loss": 0.4706, "mean_token_accuracy": 0.8454521298408508, "num_tokens": 37088277.0, "step": 974 }, { "epoch": 0.12403002162574736, "ewc_loss": 0.009544627740979195, "ewc_loss_diag": 6.616115570068359e-06, "ewc_loss_parallel": 2.9223130695754662e-05, "grad_norm": 2.7235395908355713, "learning_rate": 4.1288681644764726e-07, "loss": 0.5153, "mean_token_accuracy": 0.8346558213233948, "num_tokens": 37126963.0, "step": 975 }, { "epoch": 0.12415723190433788, "ewc_loss": 0.009505916386842728, "ewc_loss_diag": 6.616115570068359e-06, "ewc_loss_parallel": 2.883602246583905e-05, "grad_norm": 2.7914552688598633, "learning_rate": 4.1331072488342515e-07, "loss": 0.5269, "mean_token_accuracy": 0.8330521583557129, "num_tokens": 37161948.0, "step": 976 }, { "epoch": 0.12428444218292838, "ewc_loss": 0.00951466802507639, "ewc_loss_diag": 6.586313247680664e-06, "ewc_loss_parallel": 2.9228713174234144e-05, "grad_norm": 2.68047833442688, "learning_rate": 4.1373463331920305e-07, "loss": 0.4441, "mean_token_accuracy": 0.857585608959198, "num_tokens": 37198552.0, "step": 977 }, { "epoch": 0.12441165246151889, "ewc_loss": 0.009508994407951832, "ewc_loss_diag": 6.616115570068359e-06, "ewc_loss_parallel": 2.8866797947557643e-05, "grad_norm": 2.7349443435668945, "learning_rate": 4.141585417549809e-07, "loss": 0.5132, "mean_token_accuracy": 0.8336026072502136, "num_tokens": 37235990.0, "step": 978 }, { "epoch": 0.1245388627401094, "ewc_loss": 0.009540756233036518, "ewc_loss_diag": 6.616115570068359e-06, "ewc_loss_parallel": 2.9184417144278996e-05, "grad_norm": 2.8158764839172363, "learning_rate": 4.1458245019075875e-07, "loss": 0.4732, "mean_token_accuracy": 0.846500039100647, "num_tokens": 37269664.0, "step": 979 }, { "epoch": 0.1246660730186999, "ewc_loss": 0.009575865231454372, "ewc_loss_diag": 6.616115570068359e-06, "ewc_loss_parallel": 2.9535505746025592e-05, "grad_norm": 2.693403720855713, "learning_rate": 4.1500635862653664e-07, "loss": 0.5045, "mean_token_accuracy": 0.839519739151001, "num_tokens": 37308647.0, "step": 980 }, { "epoch": 0.12479328329729042, "ewc_loss": 0.0095218475908041, "ewc_loss_diag": 6.616115570068359e-06, "ewc_loss_parallel": 2.899533683375921e-05, "grad_norm": 2.698573589324951, "learning_rate": 4.1543026706231454e-07, "loss": 0.4539, "mean_token_accuracy": 0.8540900945663452, "num_tokens": 37345953.0, "step": 981 }, { "epoch": 0.12492049357588093, "ewc_loss": 0.009611062705516815, "ewc_loss_diag": 6.67572021484375e-06, "ewc_loss_parallel": 2.9277134672156535e-05, "grad_norm": 2.754544258117676, "learning_rate": 4.158541754980924e-07, "loss": 0.4496, "mean_token_accuracy": 0.856855034828186, "num_tokens": 37382735.0, "step": 982 }, { "epoch": 0.12504770385447145, "ewc_loss": 0.009630737826228142, "ewc_loss_diag": 6.67572021484375e-06, "ewc_loss_parallel": 2.9473887479980476e-05, "grad_norm": 2.7692506313323975, "learning_rate": 4.1627808393387024e-07, "loss": 0.5159, "mean_token_accuracy": 0.8317545652389526, "num_tokens": 37419605.0, "step": 983 }, { "epoch": 0.12517491413306195, "ewc_loss": 0.009635943919420242, "ewc_loss_diag": 6.67572021484375e-06, "ewc_loss_parallel": 2.952594331873115e-05, "grad_norm": 2.835294723510742, "learning_rate": 4.1670199236964813e-07, "loss": 0.5093, "mean_token_accuracy": 0.8328959941864014, "num_tokens": 37455395.0, "step": 984 }, { "epoch": 0.12530212441165245, "ewc_loss": 0.009657586924731731, "ewc_loss_diag": 6.67572021484375e-06, "ewc_loss_parallel": 2.9742373953922652e-05, "grad_norm": 2.729376792907715, "learning_rate": 4.1712590080542603e-07, "loss": 0.463, "mean_token_accuracy": 0.8506730794906616, "num_tokens": 37492208.0, "step": 985 }, { "epoch": 0.12542933469024298, "ewc_loss": 0.009685808792710304, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 2.9414235541480593e-05, "grad_norm": 2.6777377128601074, "learning_rate": 4.175498092412039e-07, "loss": 0.5077, "mean_token_accuracy": 0.8397579193115234, "num_tokens": 37533675.0, "step": 986 }, { "epoch": 0.12555654496883348, "ewc_loss": 0.009690036997199059, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 2.9456523407134227e-05, "grad_norm": 2.7503714561462402, "learning_rate": 4.179737176769817e-07, "loss": 0.4721, "mean_token_accuracy": 0.8490372896194458, "num_tokens": 37572792.0, "step": 987 }, { "epoch": 0.12568375524742398, "ewc_loss": 0.009728146716952324, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 2.9837616239092313e-05, "grad_norm": 2.715486526489258, "learning_rate": 4.183976261127596e-07, "loss": 0.469, "mean_token_accuracy": 0.8476163744926453, "num_tokens": 37611481.0, "step": 988 }, { "epoch": 0.1258109655260145, "ewc_loss": 0.00971085112541914, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 2.966466490761377e-05, "grad_norm": 2.8473494052886963, "learning_rate": 4.1882153454853747e-07, "loss": 0.5469, "mean_token_accuracy": 0.8248893022537231, "num_tokens": 37648253.0, "step": 989 }, { "epoch": 0.125938175804605, "ewc_loss": 0.00975793320685625, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 3.0135484848869964e-05, "grad_norm": 2.7362194061279297, "learning_rate": 4.1924544298431537e-07, "loss": 0.4821, "mean_token_accuracy": 0.8465344905853271, "num_tokens": 37687086.0, "step": 990 }, { "epoch": 0.12606538608319554, "ewc_loss": 0.009712355211377144, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 2.9679707949981093e-05, "grad_norm": 2.6578915119171143, "learning_rate": 4.196693514200932e-07, "loss": 0.4754, "mean_token_accuracy": 0.8477119207382202, "num_tokens": 37730928.0, "step": 991 }, { "epoch": 0.12619259636178604, "ewc_loss": 0.009698469191789627, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 2.954084993689321e-05, "grad_norm": 2.7156007289886475, "learning_rate": 4.200932598558711e-07, "loss": 0.4983, "mean_token_accuracy": 0.8373979330062866, "num_tokens": 37771857.0, "step": 992 }, { "epoch": 0.12631980664037654, "ewc_loss": 0.009737648069858551, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 2.9932627512607723e-05, "grad_norm": 2.676361560821533, "learning_rate": 4.2051716829164896e-07, "loss": 0.5017, "mean_token_accuracy": 0.8422586917877197, "num_tokens": 37817517.0, "step": 993 }, { "epoch": 0.12644701691896706, "ewc_loss": 0.009720058180391788, "ewc_loss_diag": 6.735324859619141e-06, "ewc_loss_parallel": 2.9756733056274243e-05, "grad_norm": 2.7912938594818115, "learning_rate": 4.2094107672742686e-07, "loss": 0.5445, "mean_token_accuracy": 0.8321349024772644, "num_tokens": 37855891.0, "step": 994 }, { "epoch": 0.12657422719755757, "ewc_loss": 0.009799172170460224, "ewc_loss_diag": 6.765127182006836e-06, "ewc_loss_parallel": 3.0242696084314957e-05, "grad_norm": 2.8243327140808105, "learning_rate": 4.2136498516320476e-07, "loss": 0.4631, "mean_token_accuracy": 0.853584885597229, "num_tokens": 37888719.0, "step": 995 }, { "epoch": 0.12670143747614807, "ewc_loss": 0.009794053621590137, "ewc_loss_diag": 6.765127182006836e-06, "ewc_loss_parallel": 3.0191513360477984e-05, "grad_norm": 2.6545886993408203, "learning_rate": 4.217888935989826e-07, "loss": 0.5163, "mean_token_accuracy": 0.8337497711181641, "num_tokens": 37932968.0, "step": 996 }, { "epoch": 0.1268286477547386, "ewc_loss": 0.009741850197315216, "ewc_loss_diag": 6.765127182006836e-06, "ewc_loss_parallel": 2.9669476134586148e-05, "grad_norm": 2.7627766132354736, "learning_rate": 4.2221280203476045e-07, "loss": 0.5096, "mean_token_accuracy": 0.8364244103431702, "num_tokens": 37974297.0, "step": 997 }, { "epoch": 0.1269558580333291, "ewc_loss": 0.009810227900743484, "ewc_loss_diag": 6.765127182006836e-06, "ewc_loss_parallel": 3.0353250622283667e-05, "grad_norm": 2.7055602073669434, "learning_rate": 4.2263671047053835e-07, "loss": 0.524, "mean_token_accuracy": 0.833274245262146, "num_tokens": 38020294.0, "step": 998 }, { "epoch": 0.1270830683119196, "ewc_loss": 0.00977754034101963, "ewc_loss_diag": 6.765127182006836e-06, "ewc_loss_parallel": 3.002638004545588e-05, "grad_norm": 2.6948914527893066, "learning_rate": 4.2306061890631625e-07, "loss": 0.5045, "mean_token_accuracy": 0.8398310542106628, "num_tokens": 38063322.0, "step": 999 }, { "epoch": 0.12721027859051012, "ewc_loss": 0.009777686558663845, "ewc_loss_diag": 6.765127182006836e-06, "ewc_loss_parallel": 3.0027844331925735e-05, "grad_norm": 2.7894365787506104, "learning_rate": 4.234845273420941e-07, "loss": 0.5909, "mean_token_accuracy": 0.8200715780258179, "num_tokens": 38101052.0, "step": 1000 }, { "epoch": 0.12733748886910062, "ewc_loss": 0.009820058941841125, "ewc_loss_diag": 6.765127182006836e-06, "ewc_loss_parallel": 3.0451568818534724e-05, "grad_norm": 2.695632219314575, "learning_rate": 4.2390843577787194e-07, "loss": 0.5125, "mean_token_accuracy": 0.834611177444458, "num_tokens": 38145472.0, "step": 1001 }, { "epoch": 0.12746469914769112, "ewc_loss": 0.009810291230678558, "ewc_loss_diag": 6.794929504394531e-06, "ewc_loss_parallel": 3.0048711778363213e-05, "grad_norm": 2.7338430881500244, "learning_rate": 4.2433234421364984e-07, "loss": 0.4909, "mean_token_accuracy": 0.8432080745697021, "num_tokens": 38182662.0, "step": 1002 }, { "epoch": 0.12759190942628165, "ewc_loss": 0.009835196658968925, "ewc_loss_diag": 6.794929504394531e-06, "ewc_loss_parallel": 3.029776780749671e-05, "grad_norm": 2.77937650680542, "learning_rate": 4.2475625264942774e-07, "loss": 0.4762, "mean_token_accuracy": 0.8481234312057495, "num_tokens": 38224735.0, "step": 1003 }, { "epoch": 0.12771911970487215, "ewc_loss": 0.009847918525338173, "ewc_loss_diag": 6.794929504394531e-06, "ewc_loss_parallel": 3.0424986107391305e-05, "grad_norm": 2.955463171005249, "learning_rate": 4.251801610852056e-07, "loss": 0.5193, "mean_token_accuracy": 0.8343040943145752, "num_tokens": 38255847.0, "step": 1004 }, { "epoch": 0.12784632998346265, "ewc_loss": 0.00989498384296894, "ewc_loss_diag": 6.794929504394531e-06, "ewc_loss_parallel": 3.089564052061178e-05, "grad_norm": 2.7469842433929443, "learning_rate": 4.2560406952098343e-07, "loss": 0.532, "mean_token_accuracy": 0.832493007183075, "num_tokens": 38296921.0, "step": 1005 }, { "epoch": 0.12797354026205318, "ewc_loss": 0.009852100163698196, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 3.0161625545588322e-05, "grad_norm": 2.775838613510132, "learning_rate": 4.2602797795676133e-07, "loss": 0.5428, "mean_token_accuracy": 0.8286973834037781, "num_tokens": 38336408.0, "step": 1006 }, { "epoch": 0.12810075054064368, "ewc_loss": 0.009879200719296932, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 3.043263131985441e-05, "grad_norm": 2.705247163772583, "learning_rate": 4.2645188639253923e-07, "loss": 0.4543, "mean_token_accuracy": 0.8522331118583679, "num_tokens": 38372809.0, "step": 1007 }, { "epoch": 0.12822796081923418, "ewc_loss": 0.009865553118288517, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 3.029615618288517e-05, "grad_norm": 2.7612836360931396, "learning_rate": 4.26875794828317e-07, "loss": 0.5108, "mean_token_accuracy": 0.8360990285873413, "num_tokens": 38413312.0, "step": 1008 }, { "epoch": 0.1283551710978247, "ewc_loss": 0.009888483211398125, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 3.052546162507497e-05, "grad_norm": 2.708505392074585, "learning_rate": 4.272997032640949e-07, "loss": 0.5297, "mean_token_accuracy": 0.8307788372039795, "num_tokens": 38457710.0, "step": 1009 }, { "epoch": 0.1284823813764152, "ewc_loss": 0.00986852403730154, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 3.032586391782388e-05, "grad_norm": 2.7472574710845947, "learning_rate": 4.277236116998728e-07, "loss": 0.4909, "mean_token_accuracy": 0.8440231084823608, "num_tokens": 38499630.0, "step": 1010 }, { "epoch": 0.1286095916550057, "ewc_loss": 0.009895715862512589, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 3.059778464375995e-05, "grad_norm": 2.7878196239471436, "learning_rate": 4.281475201356507e-07, "loss": 0.4979, "mean_token_accuracy": 0.8415863513946533, "num_tokens": 38538133.0, "step": 1011 }, { "epoch": 0.12873680193359624, "ewc_loss": 0.00990373082458973, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 3.0677936592837796e-05, "grad_norm": 2.7426974773406982, "learning_rate": 4.285714285714285e-07, "loss": 0.5291, "mean_token_accuracy": 0.8310766220092773, "num_tokens": 38576816.0, "step": 1012 }, { "epoch": 0.12886401221218674, "ewc_loss": 0.009891262277960777, "ewc_loss_diag": 6.8247318267822266e-06, "ewc_loss_parallel": 3.055325214518234e-05, "grad_norm": 2.7384586334228516, "learning_rate": 4.289953370072064e-07, "loss": 0.4716, "mean_token_accuracy": 0.8477297425270081, "num_tokens": 38615270.0, "step": 1013 }, { "epoch": 0.12899122249077727, "ewc_loss": 0.009930558502674103, "ewc_loss_diag": 6.854534149169922e-06, "ewc_loss_parallel": 3.064102929783985e-05, "grad_norm": 2.803295373916626, "learning_rate": 4.294192454429843e-07, "loss": 0.5214, "mean_token_accuracy": 0.8320724964141846, "num_tokens": 38654173.0, "step": 1014 }, { "epoch": 0.12911843276936777, "ewc_loss": 0.010026857256889343, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 3.0993665859568864e-05, "grad_norm": 2.6547296047210693, "learning_rate": 4.298431538787622e-07, "loss": 0.4086, "mean_token_accuracy": 0.8698122501373291, "num_tokens": 38695758.0, "step": 1015 }, { "epoch": 0.12924564304795827, "ewc_loss": 0.00996592827141285, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 3.0384384444914758e-05, "grad_norm": 2.8421027660369873, "learning_rate": 4.3026706231454e-07, "loss": 0.552, "mean_token_accuracy": 0.8234506249427795, "num_tokens": 38729524.0, "step": 1016 }, { "epoch": 0.1293728533265488, "ewc_loss": 0.01005413569509983, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 3.126645242446102e-05, "grad_norm": 2.8093245029449463, "learning_rate": 4.306909707503179e-07, "loss": 0.5005, "mean_token_accuracy": 0.8431424498558044, "num_tokens": 38767500.0, "step": 1017 }, { "epoch": 0.1295000636051393, "ewc_loss": 0.01002551056444645, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 3.098019806202501e-05, "grad_norm": 2.846035957336426, "learning_rate": 4.311148791860958e-07, "loss": 0.4889, "mean_token_accuracy": 0.8412218689918518, "num_tokens": 38798683.0, "step": 1018 }, { "epoch": 0.1296272738837298, "ewc_loss": 0.010037827305495739, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 3.1103369110496715e-05, "grad_norm": 2.774711847305298, "learning_rate": 4.315387876218737e-07, "loss": 0.5371, "mean_token_accuracy": 0.8315473794937134, "num_tokens": 38833653.0, "step": 1019 }, { "epoch": 0.12975448416232033, "ewc_loss": 0.010020308196544647, "ewc_loss_diag": 6.9141387939453125e-06, "ewc_loss_parallel": 3.092818224104121e-05, "grad_norm": 2.955190896987915, "learning_rate": 4.319626960576515e-07, "loss": 0.5353, "mean_token_accuracy": 0.8273683786392212, "num_tokens": 38869428.0, "step": 1020 }, { "epoch": 0.12988169444091083, "ewc_loss": 0.010106603614985943, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 3.1485957151744515e-05, "grad_norm": 2.7193734645843506, "learning_rate": 4.323866044934294e-07, "loss": 0.4729, "mean_token_accuracy": 0.8446860313415527, "num_tokens": 38909502.0, "step": 1021 }, { "epoch": 0.13000890471950133, "ewc_loss": 0.010027742013335228, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 3.069734157179482e-05, "grad_norm": 2.6962358951568604, "learning_rate": 4.328105129292073e-07, "loss": 0.4959, "mean_token_accuracy": 0.843307614326477, "num_tokens": 38951730.0, "step": 1022 }, { "epoch": 0.13013611499809186, "ewc_loss": 0.01004722248762846, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 3.089214442297816e-05, "grad_norm": 2.8046975135803223, "learning_rate": 4.332344213649852e-07, "loss": 0.4726, "mean_token_accuracy": 0.8457092046737671, "num_tokens": 38988981.0, "step": 1023 }, { "epoch": 0.13026332527668236, "ewc_loss": 0.01009148359298706, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 3.1334759114542976e-05, "grad_norm": 2.758781909942627, "learning_rate": 4.33658329800763e-07, "loss": 0.4838, "mean_token_accuracy": 0.8457952737808228, "num_tokens": 39026289.0, "step": 1024 }, { "epoch": 0.13039053555527286, "ewc_loss": 0.010058990679681301, "ewc_loss_diag": 6.943941116333008e-06, "ewc_loss_parallel": 3.100982939940877e-05, "grad_norm": 2.680454730987549, "learning_rate": 4.340822382365409e-07, "loss": 0.473, "mean_token_accuracy": 0.8465577363967896, "num_tokens": 39069113.0, "step": 1025 }, { "epoch": 0.13051774583386339, "ewc_loss": 0.010075447149574757, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 3.086921788053587e-05, "grad_norm": 2.7232773303985596, "learning_rate": 4.345061466723188e-07, "loss": 0.4576, "mean_token_accuracy": 0.8531917929649353, "num_tokens": 39112991.0, "step": 1026 }, { "epoch": 0.13064495611245389, "ewc_loss": 0.010109513998031616, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 3.1209881854010746e-05, "grad_norm": 2.8157780170440674, "learning_rate": 4.3493005510809663e-07, "loss": 0.5208, "mean_token_accuracy": 0.8357421159744263, "num_tokens": 39149593.0, "step": 1027 }, { "epoch": 0.1307721663910444, "ewc_loss": 0.010138842277228832, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 3.150317206745967e-05, "grad_norm": 2.7869460582733154, "learning_rate": 4.353539635438745e-07, "loss": 0.4847, "mean_token_accuracy": 0.8430265784263611, "num_tokens": 39186570.0, "step": 1028 }, { "epoch": 0.13089937666963491, "ewc_loss": 0.010119719430804253, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 3.13119417114649e-05, "grad_norm": 2.8858237266540527, "learning_rate": 4.357778719796524e-07, "loss": 0.5474, "mean_token_accuracy": 0.8231263756752014, "num_tokens": 39220666.0, "step": 1029 }, { "epoch": 0.13102658694822542, "ewc_loss": 0.01015937514603138, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 3.170849595335312e-05, "grad_norm": 2.7322001457214355, "learning_rate": 4.362017804154303e-07, "loss": 0.4664, "mean_token_accuracy": 0.8520877957344055, "num_tokens": 39258372.0, "step": 1030 }, { "epoch": 0.13115379722681592, "ewc_loss": 0.010106983594596386, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 3.118458334938623e-05, "grad_norm": 2.8127994537353516, "learning_rate": 4.366256888512081e-07, "loss": 0.4623, "mean_token_accuracy": 0.853087306022644, "num_tokens": 39293647.0, "step": 1031 }, { "epoch": 0.13128100750540644, "ewc_loss": 0.010145188309252262, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 3.1566629331791773e-05, "grad_norm": 2.7437424659729004, "learning_rate": 4.3704959728698597e-07, "loss": 0.4886, "mean_token_accuracy": 0.8412870168685913, "num_tokens": 39332811.0, "step": 1032 }, { "epoch": 0.13140821778399694, "ewc_loss": 0.010126161389052868, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 3.137636304018088e-05, "grad_norm": 3.0079896450042725, "learning_rate": 4.3747350572276386e-07, "loss": 0.4914, "mean_token_accuracy": 0.8412495851516724, "num_tokens": 39365971.0, "step": 1033 }, { "epoch": 0.13153542806258745, "ewc_loss": 0.010215118527412415, "ewc_loss_diag": 6.973743438720703e-06, "ewc_loss_parallel": 3.226592889404856e-05, "grad_norm": 2.760970115661621, "learning_rate": 4.3789741415854176e-07, "loss": 0.5005, "mean_token_accuracy": 0.8397865295410156, "num_tokens": 39403178.0, "step": 1034 }, { "epoch": 0.13166263834117797, "ewc_loss": 0.010101565159857273, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 3.1130399293033406e-05, "grad_norm": 2.742581605911255, "learning_rate": 4.383213225943196e-07, "loss": 0.4961, "mean_token_accuracy": 0.8400710821151733, "num_tokens": 39441303.0, "step": 1035 }, { "epoch": 0.13178984861976847, "ewc_loss": 0.010135555639863014, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 3.14703029289376e-05, "grad_norm": 2.7939906120300293, "learning_rate": 4.3874523103009746e-07, "loss": 0.44, "mean_token_accuracy": 0.8588087558746338, "num_tokens": 39477359.0, "step": 1036 }, { "epoch": 0.13191705889835897, "ewc_loss": 0.010182265192270279, "ewc_loss_diag": 7.033348083496094e-06, "ewc_loss_parallel": 3.163221845170483e-05, "grad_norm": 2.8046226501464844, "learning_rate": 4.3916913946587536e-07, "loss": 0.4507, "mean_token_accuracy": 0.8514119386672974, "num_tokens": 39511687.0, "step": 1037 }, { "epoch": 0.1320442691769495, "ewc_loss": 0.010143529623746872, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 3.155004742438905e-05, "grad_norm": 2.7800564765930176, "learning_rate": 4.3959304790165325e-07, "loss": 0.4655, "mean_token_accuracy": 0.8470703363418579, "num_tokens": 39547453.0, "step": 1038 }, { "epoch": 0.13217147945554, "ewc_loss": 0.010144342668354511, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 3.1558171031065285e-05, "grad_norm": 2.701491355895996, "learning_rate": 4.400169563374311e-07, "loss": 0.4831, "mean_token_accuracy": 0.8437517285346985, "num_tokens": 39588934.0, "step": 1039 }, { "epoch": 0.13229868973413053, "ewc_loss": 0.010123682208359241, "ewc_loss_diag": 7.0035457611083984e-06, "ewc_loss_parallel": 3.135156657663174e-05, "grad_norm": 2.829664945602417, "learning_rate": 4.4044086477320895e-07, "loss": 0.5707, "mean_token_accuracy": 0.8221626281738281, "num_tokens": 39627638.0, "step": 1040 }, { "epoch": 0.13242590001272103, "ewc_loss": 0.010215086862444878, "ewc_loss_diag": 7.033348083496094e-06, "ewc_loss_parallel": 3.196043326170184e-05, "grad_norm": 2.8235983848571777, "learning_rate": 4.4086477320898685e-07, "loss": 0.4809, "mean_token_accuracy": 0.8424687385559082, "num_tokens": 39661696.0, "step": 1041 }, { "epoch": 0.13255311029131153, "ewc_loss": 0.010198617354035378, "ewc_loss_diag": 7.033348083496094e-06, "ewc_loss_parallel": 3.179573832312599e-05, "grad_norm": 2.72503399848938, "learning_rate": 4.4128868164476474e-07, "loss": 0.4907, "mean_token_accuracy": 0.8458896279335022, "num_tokens": 39702357.0, "step": 1042 }, { "epoch": 0.13268032056990206, "ewc_loss": 0.010178564116358757, "ewc_loss_diag": 7.033348083496094e-06, "ewc_loss_parallel": 3.159521293127909e-05, "grad_norm": 2.735889196395874, "learning_rate": 4.417125900805426e-07, "loss": 0.4749, "mean_token_accuracy": 0.84908127784729, "num_tokens": 39744960.0, "step": 1043 }, { "epoch": 0.13280753084849256, "ewc_loss": 0.010198388248682022, "ewc_loss_diag": 7.033348083496094e-06, "ewc_loss_parallel": 3.179345731041394e-05, "grad_norm": 2.8418538570404053, "learning_rate": 4.4213649851632044e-07, "loss": 0.5318, "mean_token_accuracy": 0.8344321250915527, "num_tokens": 39785394.0, "step": 1044 }, { "epoch": 0.13293474112708306, "ewc_loss": 0.010294269770383835, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 3.214192111045122e-05, "grad_norm": 2.8411343097686768, "learning_rate": 4.4256040695209834e-07, "loss": 0.515, "mean_token_accuracy": 0.8390580415725708, "num_tokens": 39822698.0, "step": 1045 }, { "epoch": 0.1330619514056736, "ewc_loss": 0.010250184684991837, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 3.200624632881954e-05, "grad_norm": 2.87628173828125, "learning_rate": 4.429843153878762e-07, "loss": 0.5096, "mean_token_accuracy": 0.836023211479187, "num_tokens": 39856874.0, "step": 1046 }, { "epoch": 0.1331891616842641, "ewc_loss": 0.010259412229061127, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 3.209851638530381e-05, "grad_norm": 2.760716438293457, "learning_rate": 4.434082238236541e-07, "loss": 0.5409, "mean_token_accuracy": 0.827116847038269, "num_tokens": 39899683.0, "step": 1047 }, { "epoch": 0.1333163719628546, "ewc_loss": 0.010229157283902168, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 3.179597115376964e-05, "grad_norm": 2.7459020614624023, "learning_rate": 4.4383213225943193e-07, "loss": 0.519, "mean_token_accuracy": 0.836172878742218, "num_tokens": 39940565.0, "step": 1048 }, { "epoch": 0.13344358224144512, "ewc_loss": 0.01024229172617197, "ewc_loss_diag": 7.063150405883789e-06, "ewc_loss_parallel": 3.1927313102642074e-05, "grad_norm": 2.830594539642334, "learning_rate": 4.442560406952098e-07, "loss": 0.5202, "mean_token_accuracy": 0.8319714665412903, "num_tokens": 39979010.0, "step": 1049 }, { "epoch": 0.13357079252003562, "ewc_loss": 0.010312506929039955, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 3.232428935007192e-05, "grad_norm": 2.7535433769226074, "learning_rate": 4.4467994913098767e-07, "loss": 0.4683, "mean_token_accuracy": 0.8498674631118774, "num_tokens": 40021151.0, "step": 1050 }, { "epoch": 0.13369800279862612, "ewc_loss": 0.010280006565153599, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 3.199928323738277e-05, "grad_norm": 2.8068556785583496, "learning_rate": 4.4510385756676557e-07, "loss": 0.5283, "mean_token_accuracy": 0.8295880556106567, "num_tokens": 40059324.0, "step": 1051 }, { "epoch": 0.13382521307721665, "ewc_loss": 0.01031515933573246, "ewc_loss_diag": 7.092952728271484e-06, "ewc_loss_parallel": 3.235081385355443e-05, "grad_norm": 2.886104106903076, "learning_rate": 4.455277660025434e-07, "loss": 0.4841, "mean_token_accuracy": 0.8427503108978271, "num_tokens": 40104577.0, "step": 1052 }, { "epoch": 0.13395242335580715, "ewc_loss": 0.010399412363767624, "ewc_loss_diag": 7.152557373046875e-06, "ewc_loss_parallel": 3.258299329900183e-05, "grad_norm": 2.8390591144561768, "learning_rate": 4.459516744383213e-07, "loss": 0.5074, "mean_token_accuracy": 0.8341370224952698, "num_tokens": 40139734.0, "step": 1053 }, { "epoch": 0.13407963363439765, "ewc_loss": 0.010423405095934868, "ewc_loss_diag": 7.212162017822266e-06, "ewc_loss_parallel": 3.2212567020906135e-05, "grad_norm": 2.784754991531372, "learning_rate": 4.4637558287409916e-07, "loss": 0.5063, "mean_token_accuracy": 0.8374922275543213, "num_tokens": 40180724.0, "step": 1054 }, { "epoch": 0.13420684391298818, "ewc_loss": 0.010424616746604443, "ewc_loss_diag": 7.212162017822266e-06, "ewc_loss_parallel": 3.222468512831256e-05, "grad_norm": 2.7818989753723145, "learning_rate": 4.4679949130987706e-07, "loss": 0.5065, "mean_token_accuracy": 0.8427385091781616, "num_tokens": 40220475.0, "step": 1055 }, { "epoch": 0.13433405419157868, "ewc_loss": 0.010466126725077629, "ewc_loss_diag": 7.241964340209961e-06, "ewc_loss_parallel": 3.2334606657968834e-05, "grad_norm": 2.825636625289917, "learning_rate": 4.472233997456549e-07, "loss": 0.5134, "mean_token_accuracy": 0.836721658706665, "num_tokens": 40258208.0, "step": 1056 }, { "epoch": 0.13446126447016918, "ewc_loss": 0.010476035997271538, "ewc_loss_diag": 7.241964340209961e-06, "ewc_loss_parallel": 3.243369428673759e-05, "grad_norm": 2.755317449569702, "learning_rate": 4.476473081814328e-07, "loss": 0.5006, "mean_token_accuracy": 0.838975191116333, "num_tokens": 40299127.0, "step": 1057 }, { "epoch": 0.1345884747487597, "ewc_loss": 0.010464349761605263, "ewc_loss_diag": 7.241964340209961e-06, "ewc_loss_parallel": 3.231683513149619e-05, "grad_norm": 2.753054141998291, "learning_rate": 4.4807121661721065e-07, "loss": 0.4605, "mean_token_accuracy": 0.8497807383537292, "num_tokens": 40339331.0, "step": 1058 }, { "epoch": 0.1347156850273502, "ewc_loss": 0.01044394075870514, "ewc_loss_diag": 7.212162017822266e-06, "ewc_loss_parallel": 3.2417927286587656e-05, "grad_norm": 2.7824859619140625, "learning_rate": 4.4849512505298855e-07, "loss": 0.5027, "mean_token_accuracy": 0.8386939167976379, "num_tokens": 40379386.0, "step": 1059 }, { "epoch": 0.1348428953059407, "ewc_loss": 0.010455331765115261, "ewc_loss_diag": 7.212162017822266e-06, "ewc_loss_parallel": 3.25318324030377e-05, "grad_norm": 2.7681097984313965, "learning_rate": 4.489190334887664e-07, "loss": 0.5377, "mean_token_accuracy": 0.8348113894462585, "num_tokens": 40419868.0, "step": 1060 }, { "epoch": 0.13497010558453124, "ewc_loss": 0.010457424446940422, "ewc_loss_diag": 7.212162017822266e-06, "ewc_loss_parallel": 3.255275441915728e-05, "grad_norm": 2.8700668811798096, "learning_rate": 4.493429419245443e-07, "loss": 0.5075, "mean_token_accuracy": 0.839641809463501, "num_tokens": 40455346.0, "step": 1061 }, { "epoch": 0.13509731586312174, "ewc_loss": 0.010498283430933952, "ewc_loss_diag": 7.212162017822266e-06, "ewc_loss_parallel": 3.296135400887579e-05, "grad_norm": 2.8672561645507812, "learning_rate": 4.4976685036032214e-07, "loss": 0.4659, "mean_token_accuracy": 0.8483824133872986, "num_tokens": 40490087.0, "step": 1062 }, { "epoch": 0.13522452614171224, "ewc_loss": 0.010540923103690147, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 3.2777399610495195e-05, "grad_norm": 2.849586009979248, "learning_rate": 4.5019075879610004e-07, "loss": 0.5237, "mean_token_accuracy": 0.829728364944458, "num_tokens": 40524799.0, "step": 1063 }, { "epoch": 0.13535173642030277, "ewc_loss": 0.010515272617340088, "ewc_loss_diag": 7.241964340209961e-06, "ewc_loss_parallel": 3.282606849097647e-05, "grad_norm": 2.8212296962738037, "learning_rate": 4.506146672318779e-07, "loss": 0.4962, "mean_token_accuracy": 0.8408151865005493, "num_tokens": 40563740.0, "step": 1064 }, { "epoch": 0.13547894669889327, "ewc_loss": 0.010543497279286385, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 3.280313831055537e-05, "grad_norm": 2.818089485168457, "learning_rate": 4.5103857566765573e-07, "loss": 0.5507, "mean_token_accuracy": 0.8296461701393127, "num_tokens": 40604743.0, "step": 1065 }, { "epoch": 0.1356061569774838, "ewc_loss": 0.010547332465648651, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 3.284148624516092e-05, "grad_norm": 2.8397281169891357, "learning_rate": 4.5146248410343363e-07, "loss": 0.4552, "mean_token_accuracy": 0.8534508943557739, "num_tokens": 40640909.0, "step": 1066 }, { "epoch": 0.1357333672560743, "ewc_loss": 0.010562865063548088, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 3.299681702628732e-05, "grad_norm": 2.7731616497039795, "learning_rate": 4.5188639253921153e-07, "loss": 0.465, "mean_token_accuracy": 0.8510478138923645, "num_tokens": 40678930.0, "step": 1067 }, { "epoch": 0.1358605775346648, "ewc_loss": 0.010538955219089985, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 3.275771814514883e-05, "grad_norm": 2.8105452060699463, "learning_rate": 4.523103009749894e-07, "loss": 0.4583, "mean_token_accuracy": 0.8518348932266235, "num_tokens": 40715609.0, "step": 1068 }, { "epoch": 0.13598778781325532, "ewc_loss": 0.010624335147440434, "ewc_loss_diag": 7.331371307373047e-06, "ewc_loss_parallel": 3.300116441096179e-05, "grad_norm": 2.778526544570923, "learning_rate": 4.527342094107672e-07, "loss": 0.4838, "mean_token_accuracy": 0.8445850014686584, "num_tokens": 40760533.0, "step": 1069 }, { "epoch": 0.13611499809184582, "ewc_loss": 0.010553881525993347, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 3.290697713964619e-05, "grad_norm": 2.7720701694488525, "learning_rate": 4.531581178465451e-07, "loss": 0.5112, "mean_token_accuracy": 0.8375043272972107, "num_tokens": 40800039.0, "step": 1070 }, { "epoch": 0.13624220837043632, "ewc_loss": 0.010552567429840565, "ewc_loss_diag": 7.271766662597656e-06, "ewc_loss_parallel": 3.289384039817378e-05, "grad_norm": 2.7999329566955566, "learning_rate": 4.53582026282323e-07, "loss": 0.473, "mean_token_accuracy": 0.8476527333259583, "num_tokens": 40840207.0, "step": 1071 }, { "epoch": 0.13636941864902685, "ewc_loss": 0.010657474398612976, "ewc_loss_diag": 7.361173629760742e-06, "ewc_loss_parallel": 3.3027379686245695e-05, "grad_norm": 2.8873291015625, "learning_rate": 4.5400593471810087e-07, "loss": 0.5555, "mean_token_accuracy": 0.8227742314338684, "num_tokens": 40879559.0, "step": 1072 }, { "epoch": 0.13649662892761735, "ewc_loss": 0.010744202882051468, "ewc_loss_diag": 7.420778274536133e-06, "ewc_loss_parallel": 3.3284315577475354e-05, "grad_norm": 2.813058614730835, "learning_rate": 4.544298431538787e-07, "loss": 0.4831, "mean_token_accuracy": 0.844782829284668, "num_tokens": 40919608.0, "step": 1073 }, { "epoch": 0.13662383920620785, "ewc_loss": 0.010733420960605145, "ewc_loss_diag": 7.450580596923828e-06, "ewc_loss_parallel": 3.287132130935788e-05, "grad_norm": 2.846219778060913, "learning_rate": 4.548537515896566e-07, "loss": 0.5426, "mean_token_accuracy": 0.8264223337173462, "num_tokens": 40953648.0, "step": 1074 }, { "epoch": 0.13675104948479838, "ewc_loss": 0.010779823176562786, "ewc_loss_diag": 7.450580596923828e-06, "ewc_loss_parallel": 3.3335338230244815e-05, "grad_norm": 2.890052080154419, "learning_rate": 4.552776600254345e-07, "loss": 0.5153, "mean_token_accuracy": 0.8360652327537537, "num_tokens": 40989675.0, "step": 1075 }, { "epoch": 0.13687825976338888, "ewc_loss": 0.010778329335153103, "ewc_loss_diag": 7.450580596923828e-06, "ewc_loss_parallel": 3.3320404327241704e-05, "grad_norm": 2.798485040664673, "learning_rate": 4.5570156846121236e-07, "loss": 0.4669, "mean_token_accuracy": 0.849279522895813, "num_tokens": 41027992.0, "step": 1076 }, { "epoch": 0.13700547004197938, "ewc_loss": 0.010745169594883919, "ewc_loss_diag": 7.450580596923828e-06, "ewc_loss_parallel": 3.298880983493291e-05, "grad_norm": 2.8047659397125244, "learning_rate": 4.561254768969902e-07, "loss": 0.4659, "mean_token_accuracy": 0.8477283716201782, "num_tokens": 41066925.0, "step": 1077 }, { "epoch": 0.1371326803205699, "ewc_loss": 0.010832197964191437, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 3.324873250676319e-05, "grad_norm": 2.8898730278015137, "learning_rate": 4.565493853327681e-07, "loss": 0.4746, "mean_token_accuracy": 0.8478909730911255, "num_tokens": 41105090.0, "step": 1078 }, { "epoch": 0.1372598905991604, "ewc_loss": 0.010798676870763302, "ewc_loss_diag": 7.450580596923828e-06, "ewc_loss_parallel": 3.352387648192234e-05, "grad_norm": 2.872528076171875, "learning_rate": 4.56973293768546e-07, "loss": 0.5177, "mean_token_accuracy": 0.8378874659538269, "num_tokens": 41141924.0, "step": 1079 }, { "epoch": 0.1373871008777509, "ewc_loss": 0.010785604827105999, "ewc_loss_diag": 7.450580596923828e-06, "ewc_loss_parallel": 3.339316026540473e-05, "grad_norm": 2.8594424724578857, "learning_rate": 4.573972022043238e-07, "loss": 0.5775, "mean_token_accuracy": 0.8166481256484985, "num_tokens": 41181512.0, "step": 1080 }, { "epoch": 0.13751431115634144, "ewc_loss": 0.01084220688790083, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 3.334882785566151e-05, "grad_norm": 2.7704153060913086, "learning_rate": 4.578211106401017e-07, "loss": 0.526, "mean_token_accuracy": 0.8319541215896606, "num_tokens": 41224015.0, "step": 1081 }, { "epoch": 0.13764152143493194, "ewc_loss": 0.010823939926922321, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 3.316615766379982e-05, "grad_norm": 2.811575174331665, "learning_rate": 4.582450190758796e-07, "loss": 0.5282, "mean_token_accuracy": 0.8338175415992737, "num_tokens": 41265568.0, "step": 1082 }, { "epoch": 0.13776873171352244, "ewc_loss": 0.010862632654607296, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 3.355308581376448e-05, "grad_norm": 2.7949588298797607, "learning_rate": 4.586689275116575e-07, "loss": 0.4434, "mean_token_accuracy": 0.8579854965209961, "num_tokens": 41303595.0, "step": 1083 }, { "epoch": 0.13789594199211297, "ewc_loss": 0.010842974297702312, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 3.3356503990944475e-05, "grad_norm": 2.8289899826049805, "learning_rate": 4.590928359474353e-07, "loss": 0.5442, "mean_token_accuracy": 0.8278954029083252, "num_tokens": 41340119.0, "step": 1084 }, { "epoch": 0.13802315227070347, "ewc_loss": 0.01087142899632454, "ewc_loss_diag": 7.510185241699219e-06, "ewc_loss_parallel": 3.3641044865362346e-05, "grad_norm": 2.7807443141937256, "learning_rate": 4.595167443832132e-07, "loss": 0.4966, "mean_token_accuracy": 0.8445606231689453, "num_tokens": 41380184.0, "step": 1085 }, { "epoch": 0.13815036254929397, "ewc_loss": 0.010912097990512848, "ewc_loss_diag": 7.569789886474609e-06, "ewc_loss_parallel": 3.3437387173762545e-05, "grad_norm": 2.7836363315582275, "learning_rate": 4.599406528189911e-07, "loss": 0.4408, "mean_token_accuracy": 0.856994092464447, "num_tokens": 41419038.0, "step": 1086 }, { "epoch": 0.1382775728278845, "ewc_loss": 0.010935280472040176, "ewc_loss_diag": 7.569789886474609e-06, "ewc_loss_parallel": 3.366921009728685e-05, "grad_norm": 2.8199009895324707, "learning_rate": 4.60364561254769e-07, "loss": 0.483, "mean_token_accuracy": 0.8422247171401978, "num_tokens": 41455767.0, "step": 1087 }, { "epoch": 0.138404783106475, "ewc_loss": 0.010949844494462013, "ewc_loss_diag": 7.569789886474609e-06, "ewc_loss_parallel": 3.381484930287115e-05, "grad_norm": 2.807910680770874, "learning_rate": 4.607884696905468e-07, "loss": 0.4803, "mean_token_accuracy": 0.8445624113082886, "num_tokens": 41497493.0, "step": 1088 }, { "epoch": 0.1385319933850655, "ewc_loss": 0.010944368317723274, "ewc_loss_diag": 7.569789886474609e-06, "ewc_loss_parallel": 3.376009044586681e-05, "grad_norm": 2.7994208335876465, "learning_rate": 4.612123781263247e-07, "loss": 0.5134, "mean_token_accuracy": 0.8352262377738953, "num_tokens": 41542940.0, "step": 1089 }, { "epoch": 0.13865920366365603, "ewc_loss": 0.011008969508111477, "ewc_loss_diag": 7.62939453125e-06, "ewc_loss_parallel": 3.379574991413392e-05, "grad_norm": 2.7749011516571045, "learning_rate": 4.616362865621026e-07, "loss": 0.413, "mean_token_accuracy": 0.864457368850708, "num_tokens": 41579302.0, "step": 1090 }, { "epoch": 0.13878641394224653, "ewc_loss": 0.011007883585989475, "ewc_loss_diag": 7.62939453125e-06, "ewc_loss_parallel": 3.378489054739475e-05, "grad_norm": 2.8756954669952393, "learning_rate": 4.620601949978805e-07, "loss": 0.5108, "mean_token_accuracy": 0.8385982513427734, "num_tokens": 41617555.0, "step": 1091 }, { "epoch": 0.13891362422083706, "ewc_loss": 0.011045274324715137, "ewc_loss_diag": 7.62939453125e-06, "ewc_loss_parallel": 3.415879837120883e-05, "grad_norm": 2.9492664337158203, "learning_rate": 4.6248410343365827e-07, "loss": 0.5632, "mean_token_accuracy": 0.8206551671028137, "num_tokens": 41654660.0, "step": 1092 }, { "epoch": 0.13904083449942756, "ewc_loss": 0.01105257123708725, "ewc_loss_diag": 7.62939453125e-06, "ewc_loss_parallel": 3.4231772588100284e-05, "grad_norm": 2.84788179397583, "learning_rate": 4.6290801186943617e-07, "loss": 0.4705, "mean_token_accuracy": 0.8479393720626831, "num_tokens": 41693773.0, "step": 1093 }, { "epoch": 0.13916804477801806, "ewc_loss": 0.011007895693182945, "ewc_loss_diag": 7.62939453125e-06, "ewc_loss_parallel": 3.3785014238674194e-05, "grad_norm": 2.874286651611328, "learning_rate": 4.6333192030521407e-07, "loss": 0.4846, "mean_token_accuracy": 0.8458726406097412, "num_tokens": 41728864.0, "step": 1094 }, { "epoch": 0.13929525505660859, "ewc_loss": 0.011032444424927235, "ewc_loss_diag": 7.62939453125e-06, "ewc_loss_parallel": 3.403049777261913e-05, "grad_norm": 2.7545275688171387, "learning_rate": 4.6375582874099196e-07, "loss": 0.4588, "mean_token_accuracy": 0.8524327278137207, "num_tokens": 41770596.0, "step": 1095 }, { "epoch": 0.1394224653351991, "ewc_loss": 0.011004672385752201, "ewc_loss_diag": 7.62939453125e-06, "ewc_loss_parallel": 3.3752778108464554e-05, "grad_norm": 2.8756933212280273, "learning_rate": 4.6417973717676976e-07, "loss": 0.4749, "mean_token_accuracy": 0.849896252155304, "num_tokens": 41806585.0, "step": 1096 }, { "epoch": 0.1395496756137896, "ewc_loss": 0.011052707210183144, "ewc_loss_diag": 7.62939453125e-06, "ewc_loss_parallel": 3.423312955419533e-05, "grad_norm": 2.8347136974334717, "learning_rate": 4.6460364561254766e-07, "loss": 0.5047, "mean_token_accuracy": 0.8375682830810547, "num_tokens": 41845488.0, "step": 1097 }, { "epoch": 0.13967688589238011, "ewc_loss": 0.011036662384867668, "ewc_loss_diag": 7.62939453125e-06, "ewc_loss_parallel": 3.407267286092974e-05, "grad_norm": 2.931445598602295, "learning_rate": 4.6502755404832556e-07, "loss": 0.4717, "mean_token_accuracy": 0.847161054611206, "num_tokens": 41880674.0, "step": 1098 }, { "epoch": 0.13980409617097062, "ewc_loss": 0.011065821163356304, "ewc_loss_diag": 7.62939453125e-06, "ewc_loss_parallel": 3.4364267776254565e-05, "grad_norm": 2.8978183269500732, "learning_rate": 4.654514624841034e-07, "loss": 0.4864, "mean_token_accuracy": 0.8426761031150818, "num_tokens": 41913020.0, "step": 1099 }, { "epoch": 0.13993130644956112, "ewc_loss": 0.011048516258597374, "ewc_loss_diag": 7.62939453125e-06, "ewc_loss_parallel": 3.419121276238002e-05, "grad_norm": 2.838285446166992, "learning_rate": 4.6587537091988125e-07, "loss": 0.4915, "mean_token_accuracy": 0.843258261680603, "num_tokens": 41948483.0, "step": 1100 }, { "epoch": 0.14005851672815164, "ewc_loss": 0.011163750663399696, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 3.412285877857357e-05, "grad_norm": 2.8848795890808105, "learning_rate": 4.6629927935565915e-07, "loss": 0.4644, "mean_token_accuracy": 0.851396381855011, "num_tokens": 41985659.0, "step": 1101 }, { "epoch": 0.14018572700674214, "ewc_loss": 0.011181443929672241, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 3.429978823987767e-05, "grad_norm": 2.8618476390838623, "learning_rate": 4.6672318779143705e-07, "loss": 0.5581, "mean_token_accuracy": 0.8218181729316711, "num_tokens": 42027746.0, "step": 1102 }, { "epoch": 0.14031293728533265, "ewc_loss": 0.011179545894265175, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 3.428081618039869e-05, "grad_norm": 2.914062738418579, "learning_rate": 4.671470962272149e-07, "loss": 0.5134, "mean_token_accuracy": 0.8352162837982178, "num_tokens": 42060903.0, "step": 1103 }, { "epoch": 0.14044014756392317, "ewc_loss": 0.011265084147453308, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 3.452583769103512e-05, "grad_norm": 2.8017690181732178, "learning_rate": 4.6757100466299274e-07, "loss": 0.5109, "mean_token_accuracy": 0.8407275676727295, "num_tokens": 42102722.0, "step": 1104 }, { "epoch": 0.14056735784251367, "ewc_loss": 0.01117025688290596, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 3.4187916753580794e-05, "grad_norm": 2.8363702297210693, "learning_rate": 4.6799491309877064e-07, "loss": 0.519, "mean_token_accuracy": 0.8319332003593445, "num_tokens": 42143418.0, "step": 1105 }, { "epoch": 0.14069456812110417, "ewc_loss": 0.011211424134671688, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 3.45995940733701e-05, "grad_norm": 2.9045469760894775, "learning_rate": 4.6841882153454854e-07, "loss": 0.4582, "mean_token_accuracy": 0.8495604395866394, "num_tokens": 42177313.0, "step": 1106 }, { "epoch": 0.1408217783996947, "ewc_loss": 0.011343885213136673, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.4703498386079445e-05, "grad_norm": 2.8608803749084473, "learning_rate": 4.688427299703264e-07, "loss": 0.4407, "mean_token_accuracy": 0.8553547859191895, "num_tokens": 42214158.0, "step": 1107 }, { "epoch": 0.1409489886782852, "ewc_loss": 0.011200200766324997, "ewc_loss_diag": 7.748603820800781e-06, "ewc_loss_parallel": 3.448735878919251e-05, "grad_norm": 2.817317247390747, "learning_rate": 4.6926663840610423e-07, "loss": 0.498, "mean_token_accuracy": 0.8415623903274536, "num_tokens": 42254189.0, "step": 1108 }, { "epoch": 0.1410761989568757, "ewc_loss": 0.011331569403409958, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.458034188952297e-05, "grad_norm": 2.873459577560425, "learning_rate": 4.6969054684188213e-07, "loss": 0.5182, "mean_token_accuracy": 0.8319113850593567, "num_tokens": 42296853.0, "step": 1109 }, { "epoch": 0.14120340923546623, "ewc_loss": 0.011355876922607422, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.482342071947642e-05, "grad_norm": 2.881847620010376, "learning_rate": 4.7011445527766003e-07, "loss": 0.4906, "mean_token_accuracy": 0.8401001691818237, "num_tokens": 42331983.0, "step": 1110 }, { "epoch": 0.14133061951405673, "ewc_loss": 0.01129523478448391, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 3.4827349736588076e-05, "grad_norm": 2.8909432888031006, "learning_rate": 4.7053836371343787e-07, "loss": 0.4999, "mean_token_accuracy": 0.8409044146537781, "num_tokens": 42370496.0, "step": 1111 }, { "epoch": 0.14145782979264723, "ewc_loss": 0.011362016201019287, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.4884815249824896e-05, "grad_norm": 2.855613946914673, "learning_rate": 4.709622721492157e-07, "loss": 0.4769, "mean_token_accuracy": 0.8480035066604614, "num_tokens": 42406489.0, "step": 1112 }, { "epoch": 0.14158504007123776, "ewc_loss": 0.011343738995492458, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.47020395565778e-05, "grad_norm": 2.9009335041046143, "learning_rate": 4.713861805849936e-07, "loss": 0.4779, "mean_token_accuracy": 0.8451392650604248, "num_tokens": 42440679.0, "step": 1113 }, { "epoch": 0.14171225034982826, "ewc_loss": 0.011376798152923584, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.503262632875703e-05, "grad_norm": 2.9590415954589844, "learning_rate": 4.718100890207715e-07, "loss": 0.5496, "mean_token_accuracy": 0.8247377872467041, "num_tokens": 42476699.0, "step": 1114 }, { "epoch": 0.1418394606284188, "ewc_loss": 0.011382991448044777, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.5094566555926576e-05, "grad_norm": 2.896632194519043, "learning_rate": 4.7223399745654936e-07, "loss": 0.5052, "mean_token_accuracy": 0.8383938670158386, "num_tokens": 42513348.0, "step": 1115 }, { "epoch": 0.1419666709070093, "ewc_loss": 0.011361257173120975, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.487721915007569e-05, "grad_norm": 2.870563507080078, "learning_rate": 4.726579058923272e-07, "loss": 0.4993, "mean_token_accuracy": 0.8418893218040466, "num_tokens": 42550633.0, "step": 1116 }, { "epoch": 0.1420938811855998, "ewc_loss": 0.011362310498952866, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.4887758374679834e-05, "grad_norm": 2.9042770862579346, "learning_rate": 4.730818143281051e-07, "loss": 0.5241, "mean_token_accuracy": 0.8351523876190186, "num_tokens": 42593649.0, "step": 1117 }, { "epoch": 0.14222109146419032, "ewc_loss": 0.011323437094688416, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 3.5109365853713825e-05, "grad_norm": 2.9101204872131348, "learning_rate": 4.7350572276388295e-07, "loss": 0.4704, "mean_token_accuracy": 0.8482900857925415, "num_tokens": 42632379.0, "step": 1118 }, { "epoch": 0.14234830174278082, "ewc_loss": 0.011320228688418865, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 3.50772897945717e-05, "grad_norm": 2.88387131690979, "learning_rate": 4.7392963119966085e-07, "loss": 0.4968, "mean_token_accuracy": 0.8395556807518005, "num_tokens": 42670319.0, "step": 1119 }, { "epoch": 0.14247551202137132, "ewc_loss": 0.011319261975586414, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 3.506762004690245e-05, "grad_norm": 2.8167612552642822, "learning_rate": 4.7435353963543875e-07, "loss": 0.49, "mean_token_accuracy": 0.8459360003471375, "num_tokens": 42712680.0, "step": 1120 }, { "epoch": 0.14260272229996185, "ewc_loss": 0.01135564036667347, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.4821052395273e-05, "grad_norm": 2.8791046142578125, "learning_rate": 4.747774480712166e-07, "loss": 0.5066, "mean_token_accuracy": 0.8411386609077454, "num_tokens": 42748241.0, "step": 1121 }, { "epoch": 0.14272993257855235, "ewc_loss": 0.011341591365635395, "ewc_loss_diag": 7.808208465576172e-06, "ewc_loss_parallel": 3.529091554810293e-05, "grad_norm": 2.9288675785064697, "learning_rate": 4.7520135650699444e-07, "loss": 0.5262, "mean_token_accuracy": 0.832404613494873, "num_tokens": 42783208.0, "step": 1122 }, { "epoch": 0.14285714285714285, "ewc_loss": 0.011420507915318012, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.5469725844450295e-05, "grad_norm": 2.928662061691284, "learning_rate": 4.7562526494277234e-07, "loss": 0.5018, "mean_token_accuracy": 0.8407959938049316, "num_tokens": 42818570.0, "step": 1123 }, { "epoch": 0.14298435313573338, "ewc_loss": 0.011412528343498707, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.5389930417295545e-05, "grad_norm": 2.9292500019073486, "learning_rate": 4.7604917337855024e-07, "loss": 0.4408, "mean_token_accuracy": 0.8556286096572876, "num_tokens": 42852891.0, "step": 1124 }, { "epoch": 0.14311156341432388, "ewc_loss": 0.011420903727412224, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.5473680327413604e-05, "grad_norm": 2.8938794136047363, "learning_rate": 4.764730818143281e-07, "loss": 0.4901, "mean_token_accuracy": 0.8356485366821289, "num_tokens": 42887359.0, "step": 1125 }, { "epoch": 0.14323877369291438, "ewc_loss": 0.011409001424908638, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.53546638507396e-05, "grad_norm": 2.8681507110595703, "learning_rate": 4.768969902501059e-07, "loss": 0.5419, "mean_token_accuracy": 0.8269252777099609, "num_tokens": 42929308.0, "step": 1126 }, { "epoch": 0.1433659839715049, "ewc_loss": 0.011409549973905087, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.5360146284801885e-05, "grad_norm": 2.8003482818603516, "learning_rate": 4.773208986858838e-07, "loss": 0.4926, "mean_token_accuracy": 0.8395062685012817, "num_tokens": 42969373.0, "step": 1127 }, { "epoch": 0.1434931942500954, "ewc_loss": 0.011404739692807198, "ewc_loss_diag": 7.867813110351562e-06, "ewc_loss_parallel": 3.5312048566993326e-05, "grad_norm": 2.8939530849456787, "learning_rate": 4.777448071216617e-07, "loss": 0.4986, "mean_token_accuracy": 0.841374397277832, "num_tokens": 43010133.0, "step": 1128 }, { "epoch": 0.1436204045286859, "ewc_loss": 0.011500973254442215, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.566403029253706e-05, "grad_norm": 2.884601593017578, "learning_rate": 4.781687155574396e-07, "loss": 0.5213, "mean_token_accuracy": 0.8326027393341064, "num_tokens": 43053743.0, "step": 1129 }, { "epoch": 0.14374761480727644, "ewc_loss": 0.011484215967357159, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.5496454074746e-05, "grad_norm": 2.8193697929382324, "learning_rate": 4.785926239932175e-07, "loss": 0.4632, "mean_token_accuracy": 0.8506547212600708, "num_tokens": 43094125.0, "step": 1130 }, { "epoch": 0.14387482508586694, "ewc_loss": 0.011460071429610252, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.525501597323455e-05, "grad_norm": 2.885388135910034, "learning_rate": 4.790165324289953e-07, "loss": 0.4899, "mean_token_accuracy": 0.8418468236923218, "num_tokens": 43132399.0, "step": 1131 }, { "epoch": 0.14400203536445744, "ewc_loss": 0.011569596827030182, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.573991489247419e-05, "grad_norm": 2.9361820220947266, "learning_rate": 4.794404408647732e-07, "loss": 0.5034, "mean_token_accuracy": 0.841450572013855, "num_tokens": 43169036.0, "step": 1132 }, { "epoch": 0.14412924564304797, "ewc_loss": 0.011514132842421532, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.5795623261947185e-05, "grad_norm": 2.825291395187378, "learning_rate": 4.798643493005511e-07, "loss": 0.4362, "mean_token_accuracy": 0.8568373918533325, "num_tokens": 43209397.0, "step": 1133 }, { "epoch": 0.14425645592163847, "ewc_loss": 0.011472265236079693, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.537695010891184e-05, "grad_norm": 2.9169535636901855, "learning_rate": 4.80288257736329e-07, "loss": 0.5322, "mean_token_accuracy": 0.8324548602104187, "num_tokens": 43246788.0, "step": 1134 }, { "epoch": 0.14438366620022897, "ewc_loss": 0.011527673341333866, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.5931032471125945e-05, "grad_norm": 2.867488145828247, "learning_rate": 4.807121661721068e-07, "loss": 0.4454, "mean_token_accuracy": 0.8564313650131226, "num_tokens": 43282774.0, "step": 1135 }, { "epoch": 0.1445108764788195, "ewc_loss": 0.011492772027850151, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.558201206033118e-05, "grad_norm": 2.8478593826293945, "learning_rate": 4.811360746078847e-07, "loss": 0.4997, "mean_token_accuracy": 0.843768298625946, "num_tokens": 43322303.0, "step": 1136 }, { "epoch": 0.14463808675741, "ewc_loss": 0.011496776714920998, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.5622058931039646e-05, "grad_norm": 3.005808115005493, "learning_rate": 4.815599830436625e-07, "loss": 0.4945, "mean_token_accuracy": 0.8410736322402954, "num_tokens": 43354611.0, "step": 1137 }, { "epoch": 0.1447652970360005, "ewc_loss": 0.011556494049727917, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.621923679020256e-05, "grad_norm": 2.9021453857421875, "learning_rate": 4.819838914794405e-07, "loss": 0.4768, "mean_token_accuracy": 0.847811222076416, "num_tokens": 43390640.0, "step": 1138 }, { "epoch": 0.14489250731459102, "ewc_loss": 0.011562110856175423, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.566505620256066e-05, "grad_norm": 2.9110708236694336, "learning_rate": 4.824077999152183e-07, "loss": 0.4957, "mean_token_accuracy": 0.8369860649108887, "num_tokens": 43424513.0, "step": 1139 }, { "epoch": 0.14501971759318152, "ewc_loss": 0.01151574868708849, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.5811783163808286e-05, "grad_norm": 2.898736000061035, "learning_rate": 4.828317083509962e-07, "loss": 0.5017, "mean_token_accuracy": 0.8385515213012695, "num_tokens": 43461699.0, "step": 1140 }, { "epoch": 0.14514692787177205, "ewc_loss": 0.011525869369506836, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.591299173422158e-05, "grad_norm": 2.944830894470215, "learning_rate": 4.83255616786774e-07, "loss": 0.4748, "mean_token_accuracy": 0.851862907409668, "num_tokens": 43494681.0, "step": 1141 }, { "epoch": 0.14527413815036255, "ewc_loss": 0.011538924649357796, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.6043540603714064e-05, "grad_norm": 2.860487699508667, "learning_rate": 4.83679525222552e-07, "loss": 0.4529, "mean_token_accuracy": 0.8542910814285278, "num_tokens": 43531318.0, "step": 1142 }, { "epoch": 0.14540134842895305, "ewc_loss": 0.011574076488614082, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.578471296350472e-05, "grad_norm": 2.8396708965301514, "learning_rate": 4.841034336583298e-07, "loss": 0.4697, "mean_token_accuracy": 0.8539688587188721, "num_tokens": 43569991.0, "step": 1143 }, { "epoch": 0.14552855870754358, "ewc_loss": 0.011590642854571342, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.595037560444325e-05, "grad_norm": 2.8685243129730225, "learning_rate": 4.845273420941076e-07, "loss": 0.457, "mean_token_accuracy": 0.8552738428115845, "num_tokens": 43609978.0, "step": 1144 }, { "epoch": 0.14565576898613408, "ewc_loss": 0.011605357751250267, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.609752093325369e-05, "grad_norm": 3.0152199268341064, "learning_rate": 4.849512505298855e-07, "loss": 0.5038, "mean_token_accuracy": 0.8329567909240723, "num_tokens": 43640419.0, "step": 1145 }, { "epoch": 0.14578297926472458, "ewc_loss": 0.011665343306958675, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.669737998279743e-05, "grad_norm": 2.8413453102111816, "learning_rate": 4.853751589656634e-07, "loss": 0.4674, "mean_token_accuracy": 0.8497492074966431, "num_tokens": 43682408.0, "step": 1146 }, { "epoch": 0.1459101895433151, "ewc_loss": 0.011513743549585342, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.5791734262602404e-05, "grad_norm": 2.815626382827759, "learning_rate": 4.857990674014413e-07, "loss": 0.5022, "mean_token_accuracy": 0.8389869332313538, "num_tokens": 43726158.0, "step": 1147 }, { "epoch": 0.1460373998219056, "ewc_loss": 0.011602171696722507, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.606565951486118e-05, "grad_norm": 2.8948657512664795, "learning_rate": 4.862229758372191e-07, "loss": 0.5324, "mean_token_accuracy": 0.8278415203094482, "num_tokens": 43766466.0, "step": 1148 }, { "epoch": 0.1461646101004961, "ewc_loss": 0.011642326600849628, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.646720870165154e-05, "grad_norm": 2.8629088401794434, "learning_rate": 4.86646884272997e-07, "loss": 0.5209, "mean_token_accuracy": 0.8310036659240723, "num_tokens": 43807255.0, "step": 1149 }, { "epoch": 0.14629182037908664, "ewc_loss": 0.011554069817066193, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.6194996937410906e-05, "grad_norm": 2.884942054748535, "learning_rate": 4.870707927087749e-07, "loss": 0.4998, "mean_token_accuracy": 0.8401156663894653, "num_tokens": 43845419.0, "step": 1150 }, { "epoch": 0.14641903065767714, "ewc_loss": 0.011627880856394768, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.632275547715835e-05, "grad_norm": 2.8895254135131836, "learning_rate": 4.874947011445528e-07, "loss": 0.5068, "mean_token_accuracy": 0.8393471837043762, "num_tokens": 43882942.0, "step": 1151 }, { "epoch": 0.14654624093626764, "ewc_loss": 0.011580423451960087, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.645853212219663e-05, "grad_norm": 2.972470760345459, "learning_rate": 4.879186095803306e-07, "loss": 0.4304, "mean_token_accuracy": 0.8612890243530273, "num_tokens": 43923109.0, "step": 1152 }, { "epoch": 0.14667345121485817, "ewc_loss": 0.011602068319916725, "ewc_loss_diag": 7.927417755126953e-06, "ewc_loss_parallel": 3.6674980947282165e-05, "grad_norm": 2.9296631813049316, "learning_rate": 4.883425180161085e-07, "loss": 0.4662, "mean_token_accuracy": 0.8490620255470276, "num_tokens": 43959752.0, "step": 1153 }, { "epoch": 0.14680066149344867, "ewc_loss": 0.011638712137937546, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.643106538220309e-05, "grad_norm": 2.911426067352295, "learning_rate": 4.887664264518864e-07, "loss": 0.5122, "mean_token_accuracy": 0.8397830724716187, "num_tokens": 43996247.0, "step": 1154 }, { "epoch": 0.14692787177203917, "ewc_loss": 0.011637171730399132, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.641566581791267e-05, "grad_norm": 2.894191026687622, "learning_rate": 4.891903348876643e-07, "loss": 0.4975, "mean_token_accuracy": 0.8436908721923828, "num_tokens": 44035751.0, "step": 1155 }, { "epoch": 0.1470550820506297, "ewc_loss": 0.01163378544151783, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.638180351117626e-05, "grad_norm": 2.8790509700775146, "learning_rate": 4.896142433234421e-07, "loss": 0.5303, "mean_token_accuracy": 0.8367999196052551, "num_tokens": 44073225.0, "step": 1156 }, { "epoch": 0.1471822923292202, "ewc_loss": 0.011630833148956299, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.6352274037199095e-05, "grad_norm": 2.9278786182403564, "learning_rate": 4.9003815175922e-07, "loss": 0.5335, "mean_token_accuracy": 0.8361527919769287, "num_tokens": 44111761.0, "step": 1157 }, { "epoch": 0.1473095026078107, "ewc_loss": 0.01166286040097475, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.667254713946022e-05, "grad_norm": 2.8448500633239746, "learning_rate": 4.904620601949979e-07, "loss": 0.4847, "mean_token_accuracy": 0.8445316553115845, "num_tokens": 44151311.0, "step": 1158 }, { "epoch": 0.14743671288640123, "ewc_loss": 0.011700625531375408, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 3.6439851101022214e-05, "grad_norm": 3.015840768814087, "learning_rate": 4.908859686307758e-07, "loss": 0.5275, "mean_token_accuracy": 0.8342266082763672, "num_tokens": 44184623.0, "step": 1159 }, { "epoch": 0.14756392316499173, "ewc_loss": 0.011705376207828522, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.709770680870861e-05, "grad_norm": 2.885892629623413, "learning_rate": 4.913098770665536e-07, "loss": 0.5155, "mean_token_accuracy": 0.8386892676353455, "num_tokens": 44221753.0, "step": 1160 }, { "epoch": 0.14769113344358223, "ewc_loss": 0.011642754077911377, "ewc_loss_diag": 7.987022399902344e-06, "ewc_loss_parallel": 3.6471483326749876e-05, "grad_norm": 2.8628032207489014, "learning_rate": 4.917337855023314e-07, "loss": 0.4683, "mean_token_accuracy": 0.8463025689125061, "num_tokens": 44262542.0, "step": 1161 }, { "epoch": 0.14781834372217276, "ewc_loss": 0.011723829433321953, "ewc_loss_diag": 8.046627044677734e-06, "ewc_loss_parallel": 3.667188866529614e-05, "grad_norm": 2.9218950271606445, "learning_rate": 4.921576939381094e-07, "loss": 0.4908, "mean_token_accuracy": 0.8473693132400513, "num_tokens": 44300294.0, "step": 1162 }, { "epoch": 0.14794555400076326, "ewc_loss": 0.011815162375569344, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 3.697486317832954e-05, "grad_norm": 2.9331884384155273, "learning_rate": 4.925816023738872e-07, "loss": 0.5353, "mean_token_accuracy": 0.8289737701416016, "num_tokens": 44338476.0, "step": 1163 }, { "epoch": 0.14807276427935376, "ewc_loss": 0.011815118603408337, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 3.697442662087269e-05, "grad_norm": 3.0418519973754883, "learning_rate": 4.930055108096651e-07, "loss": 0.4356, "mean_token_accuracy": 0.8603752851486206, "num_tokens": 44367437.0, "step": 1164 }, { "epoch": 0.1481999745579443, "ewc_loss": 0.01185283251106739, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 3.7351568607846275e-05, "grad_norm": 3.0126376152038574, "learning_rate": 4.934294192454429e-07, "loss": 0.5493, "mean_token_accuracy": 0.8253695964813232, "num_tokens": 44402636.0, "step": 1165 }, { "epoch": 0.1483271848365348, "ewc_loss": 0.011833157390356064, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 3.715481943800114e-05, "grad_norm": 2.871579885482788, "learning_rate": 4.938533276812209e-07, "loss": 0.4919, "mean_token_accuracy": 0.8441922664642334, "num_tokens": 44438187.0, "step": 1166 }, { "epoch": 0.14845439511512531, "ewc_loss": 0.011788278818130493, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 3.67060347343795e-05, "grad_norm": 2.8357973098754883, "learning_rate": 4.942772361169987e-07, "loss": 0.414, "mean_token_accuracy": 0.8620600700378418, "num_tokens": 44478345.0, "step": 1167 }, { "epoch": 0.14858160539371582, "ewc_loss": 0.011815755628049374, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 3.6980800359742716e-05, "grad_norm": 2.826711416244507, "learning_rate": 4.947011445527766e-07, "loss": 0.4918, "mean_token_accuracy": 0.8433681726455688, "num_tokens": 44520272.0, "step": 1168 }, { "epoch": 0.14870881567230632, "ewc_loss": 0.011822424829006195, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 3.7047488149255514e-05, "grad_norm": 2.8463375568389893, "learning_rate": 4.951250529885544e-07, "loss": 0.4464, "mean_token_accuracy": 0.8588558435440063, "num_tokens": 44561058.0, "step": 1169 }, { "epoch": 0.14883602595089684, "ewc_loss": 0.011897058226168156, "ewc_loss_diag": 8.165836334228516e-06, "ewc_loss_parallel": 3.71834757970646e-05, "grad_norm": 3.0157957077026367, "learning_rate": 4.955489614243324e-07, "loss": 0.578, "mean_token_accuracy": 0.8145158290863037, "num_tokens": 44599371.0, "step": 1170 }, { "epoch": 0.14896323622948734, "ewc_loss": 0.011896872892975807, "ewc_loss_diag": 8.106231689453125e-06, "ewc_loss_parallel": 3.7791975046275184e-05, "grad_norm": 2.8426735401153564, "learning_rate": 4.959728698601102e-07, "loss": 0.4891, "mean_token_accuracy": 0.8356612324714661, "num_tokens": 44640499.0, "step": 1171 }, { "epoch": 0.14909044650807785, "ewc_loss": 0.01194041594862938, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 3.70066991308704e-05, "grad_norm": 2.8928415775299072, "learning_rate": 4.963967782958881e-07, "loss": 0.4777, "mean_token_accuracy": 0.8475472331047058, "num_tokens": 44683164.0, "step": 1172 }, { "epoch": 0.14921765678666837, "ewc_loss": 0.011923842132091522, "ewc_loss_diag": 8.165836334228516e-06, "ewc_loss_parallel": 3.745131107280031e-05, "grad_norm": 2.912870407104492, "learning_rate": 4.968206867316659e-07, "loss": 0.4857, "mean_token_accuracy": 0.8436541557312012, "num_tokens": 44721938.0, "step": 1173 }, { "epoch": 0.14934486706525887, "ewc_loss": 0.011933283880352974, "ewc_loss_diag": 8.165836334228516e-06, "ewc_loss_parallel": 3.754572389880195e-05, "grad_norm": 3.0387396812438965, "learning_rate": 4.972445951674439e-07, "loss": 0.4599, "mean_token_accuracy": 0.8503091335296631, "num_tokens": 44753506.0, "step": 1174 }, { "epoch": 0.14947207734384937, "ewc_loss": 0.011970069259405136, "ewc_loss_diag": 8.165836334228516e-06, "ewc_loss_parallel": 3.791358903981745e-05, "grad_norm": 2.964245080947876, "learning_rate": 4.976685036032216e-07, "loss": 0.5496, "mean_token_accuracy": 0.8247790336608887, "num_tokens": 44792170.0, "step": 1175 }, { "epoch": 0.1495992876224399, "ewc_loss": 0.011923321522772312, "ewc_loss_diag": 8.165836334228516e-06, "ewc_loss_parallel": 3.744610512512736e-05, "grad_norm": 2.86509108543396, "learning_rate": 4.980924120389996e-07, "loss": 0.4282, "mean_token_accuracy": 0.8631750345230103, "num_tokens": 44830733.0, "step": 1176 }, { "epoch": 0.1497264979010304, "ewc_loss": 0.011907991021871567, "ewc_loss_diag": 8.165836334228516e-06, "ewc_loss_parallel": 3.729280433617532e-05, "grad_norm": 2.870882749557495, "learning_rate": 4.985163204747774e-07, "loss": 0.4405, "mean_token_accuracy": 0.8583760261535645, "num_tokens": 44869743.0, "step": 1177 }, { "epoch": 0.1498537081796209, "ewc_loss": 0.011918846517801285, "ewc_loss_diag": 8.165836334228516e-06, "ewc_loss_parallel": 3.740135798580013e-05, "grad_norm": 2.9173190593719482, "learning_rate": 4.989402289105554e-07, "loss": 0.4872, "mean_token_accuracy": 0.8494901657104492, "num_tokens": 44913358.0, "step": 1178 }, { "epoch": 0.14998091845821143, "ewc_loss": 0.011954622343182564, "ewc_loss_diag": 8.165836334228516e-06, "ewc_loss_parallel": 3.7759113183710724e-05, "grad_norm": 3.0070080757141113, "learning_rate": 4.993641373463331e-07, "loss": 0.5465, "mean_token_accuracy": 0.8254654407501221, "num_tokens": 44949963.0, "step": 1179 }, { "epoch": 0.15010812873680193, "ewc_loss": 0.011967910453677177, "ewc_loss_diag": 8.165836334228516e-06, "ewc_loss_parallel": 3.789199035963975e-05, "grad_norm": 2.8970212936401367, "learning_rate": 4.997880457821111e-07, "loss": 0.5006, "mean_token_accuracy": 0.8389737606048584, "num_tokens": 44988569.0, "step": 1180 }, { "epoch": 0.15023533901539243, "ewc_loss": 0.0119258351624012, "ewc_loss_diag": 8.165836334228516e-06, "ewc_loss_parallel": 3.747123992070556e-05, "grad_norm": 2.8337748050689697, "learning_rate": 5.002119542178889e-07, "loss": 0.4783, "mean_token_accuracy": 0.8453949689865112, "num_tokens": 45032634.0, "step": 1181 }, { "epoch": 0.15036254929398296, "ewc_loss": 0.011993227526545525, "ewc_loss_diag": 8.225440979003906e-06, "ewc_loss_parallel": 3.753481723833829e-05, "grad_norm": 2.8988900184631348, "learning_rate": 5.006358626536667e-07, "loss": 0.4481, "mean_token_accuracy": 0.8569359183311462, "num_tokens": 45069685.0, "step": 1182 }, { "epoch": 0.15048975957257346, "ewc_loss": 0.01214645430445671, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.784637738135643e-05, "grad_norm": 2.889845132827759, "learning_rate": 5.010597710894446e-07, "loss": 0.4702, "mean_token_accuracy": 0.8499032258987427, "num_tokens": 45107481.0, "step": 1183 }, { "epoch": 0.15061696985116396, "ewc_loss": 0.012137742713093758, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.775926597882062e-05, "grad_norm": 2.966109037399292, "learning_rate": 5.014836795252225e-07, "loss": 0.4554, "mean_token_accuracy": 0.8530993461608887, "num_tokens": 45145747.0, "step": 1184 }, { "epoch": 0.1507441801297545, "ewc_loss": 0.012108049355447292, "ewc_loss_diag": 8.285045623779297e-06, "ewc_loss_parallel": 3.807268149103038e-05, "grad_norm": 2.917987823486328, "learning_rate": 5.019075879610004e-07, "loss": 0.4618, "mean_token_accuracy": 0.8548257350921631, "num_tokens": 45183140.0, "step": 1185 }, { "epoch": 0.150871390408345, "ewc_loss": 0.012144094333052635, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.782277781283483e-05, "grad_norm": 2.8877971172332764, "learning_rate": 5.023314963967783e-07, "loss": 0.4964, "mean_token_accuracy": 0.8405834436416626, "num_tokens": 45224911.0, "step": 1186 }, { "epoch": 0.1509986006869355, "ewc_loss": 0.012138505466282368, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.776688754442148e-05, "grad_norm": 3.109628200531006, "learning_rate": 5.027554048325562e-07, "loss": 0.549, "mean_token_accuracy": 0.8258082270622253, "num_tokens": 45257388.0, "step": 1187 }, { "epoch": 0.15112581096552602, "ewc_loss": 0.01222174521535635, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.859928619931452e-05, "grad_norm": 2.9159018993377686, "learning_rate": 5.03179313268334e-07, "loss": 0.5357, "mean_token_accuracy": 0.8304479718208313, "num_tokens": 45297876.0, "step": 1188 }, { "epoch": 0.15125302124411652, "ewc_loss": 0.012117864564061165, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.756048681680113e-05, "grad_norm": 2.997225046157837, "learning_rate": 5.036032217041119e-07, "loss": 0.5112, "mean_token_accuracy": 0.8342686891555786, "num_tokens": 45334040.0, "step": 1189 }, { "epoch": 0.15138023152270705, "ewc_loss": 0.012186814099550247, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.82499820261728e-05, "grad_norm": 2.8497533798217773, "learning_rate": 5.040271301398897e-07, "loss": 0.4943, "mean_token_accuracy": 0.838952898979187, "num_tokens": 45375985.0, "step": 1190 }, { "epoch": 0.15150744180129755, "ewc_loss": 0.012117425911128521, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.7556092138402164e-05, "grad_norm": 2.8342084884643555, "learning_rate": 5.044510385756676e-07, "loss": 0.4626, "mean_token_accuracy": 0.8487703800201416, "num_tokens": 45419951.0, "step": 1191 }, { "epoch": 0.15163465207988805, "ewc_loss": 0.012155027128756046, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.793210635194555e-05, "grad_norm": 2.8980448246002197, "learning_rate": 5.048749470114455e-07, "loss": 0.4754, "mean_token_accuracy": 0.8479640483856201, "num_tokens": 45456988.0, "step": 1192 }, { "epoch": 0.15176186235847858, "ewc_loss": 0.012192275375127792, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.830458445008844e-05, "grad_norm": 3.1009042263031006, "learning_rate": 5.052988554472234e-07, "loss": 0.4945, "mean_token_accuracy": 0.8402149081230164, "num_tokens": 45488226.0, "step": 1193 }, { "epoch": 0.15188907263706908, "ewc_loss": 0.012248765677213669, "ewc_loss_diag": 8.344650268554688e-06, "ewc_loss_parallel": 3.886949707521126e-05, "grad_norm": 2.920064687728882, "learning_rate": 5.057227638830013e-07, "loss": 0.4943, "mean_token_accuracy": 0.8428130149841309, "num_tokens": 45524661.0, "step": 1194 }, { "epoch": 0.15201628291565958, "ewc_loss": 0.012219545431435108, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.796693999902345e-05, "grad_norm": 3.002769708633423, "learning_rate": 5.061466723187792e-07, "loss": 0.4965, "mean_token_accuracy": 0.84242844581604, "num_tokens": 45563491.0, "step": 1195 }, { "epoch": 0.1521434931942501, "ewc_loss": 0.012281141243875027, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.858289710478857e-05, "grad_norm": 3.0223958492279053, "learning_rate": 5.065705807545569e-07, "loss": 0.4653, "mean_token_accuracy": 0.8514423966407776, "num_tokens": 45596771.0, "step": 1196 }, { "epoch": 0.1522707034728406, "ewc_loss": 0.01228727214038372, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.8644200685666874e-05, "grad_norm": 2.870753526687622, "learning_rate": 5.069944891903349e-07, "loss": 0.481, "mean_token_accuracy": 0.8449563980102539, "num_tokens": 45640049.0, "step": 1197 }, { "epoch": 0.1523979137514311, "ewc_loss": 0.012226168066263199, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.803316212724894e-05, "grad_norm": 2.990305185317993, "learning_rate": 5.074183976261127e-07, "loss": 0.4842, "mean_token_accuracy": 0.8421757817268372, "num_tokens": 45677375.0, "step": 1198 }, { "epoch": 0.15252512403002164, "ewc_loss": 0.0123065747320652, "ewc_loss_diag": 8.404254913330078e-06, "ewc_loss_parallel": 3.883723184117116e-05, "grad_norm": 2.9340479373931885, "learning_rate": 5.078423060618906e-07, "loss": 0.4887, "mean_token_accuracy": 0.8453441858291626, "num_tokens": 45719253.0, "step": 1199 }, { "epoch": 0.15265233430861214, "ewc_loss": 0.012332327663898468, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.8484409742522985e-05, "grad_norm": 2.959064245223999, "learning_rate": 5.082662144976685e-07, "loss": 0.4839, "mean_token_accuracy": 0.8415230512619019, "num_tokens": 45756065.0, "step": 1200 }, { "epoch": 0.15277954458720264, "ewc_loss": 0.012348536401987076, "ewc_loss_diag": 8.463859558105469e-06, "ewc_loss_parallel": 3.8646499888272956e-05, "grad_norm": 2.943532705307007, "learning_rate": 5.086901229334464e-07, "loss": 0.4749, "mean_token_accuracy": 0.8498684167861938, "num_tokens": 45791707.0, "step": 1201 }, { "epoch": 0.15290675486579317, "ewc_loss": 0.012474051676690578, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.868094790959731e-05, "grad_norm": 2.9508798122406006, "learning_rate": 5.091140313692243e-07, "loss": 0.488, "mean_token_accuracy": 0.8424115180969238, "num_tokens": 45828464.0, "step": 1202 }, { "epoch": 0.15303396514438367, "ewc_loss": 0.012478849850594997, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.872892921208404e-05, "grad_norm": 2.8801586627960205, "learning_rate": 5.095379398050022e-07, "loss": 0.4705, "mean_token_accuracy": 0.8473765254020691, "num_tokens": 45867046.0, "step": 1203 }, { "epoch": 0.15316117542297417, "ewc_loss": 0.012462075799703598, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.856119292322546e-05, "grad_norm": 3.0511929988861084, "learning_rate": 5.099618482407799e-07, "loss": 0.5, "mean_token_accuracy": 0.8389015197753906, "num_tokens": 45900530.0, "step": 1204 }, { "epoch": 0.1532883857015647, "ewc_loss": 0.01254795677959919, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.9419996028300375e-05, "grad_norm": 2.9799129962921143, "learning_rate": 5.103857566765578e-07, "loss": 0.4766, "mean_token_accuracy": 0.847338080406189, "num_tokens": 45940002.0, "step": 1205 }, { "epoch": 0.1534155959801552, "ewc_loss": 0.01248701848089695, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.881060911226086e-05, "grad_norm": 2.9535210132598877, "learning_rate": 5.108096651123357e-07, "loss": 0.4414, "mean_token_accuracy": 0.857350766658783, "num_tokens": 45975299.0, "step": 1206 }, { "epoch": 0.1535428062587457, "ewc_loss": 0.012497950345277786, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.891993765137158e-05, "grad_norm": 3.083630323410034, "learning_rate": 5.112335735481135e-07, "loss": 0.4807, "mean_token_accuracy": 0.844880998134613, "num_tokens": 46012693.0, "step": 1207 }, { "epoch": 0.15367001653733622, "ewc_loss": 0.01260324940085411, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.936257417080924e-05, "grad_norm": 2.899996757507324, "learning_rate": 5.116574819838915e-07, "loss": 0.389, "mean_token_accuracy": 0.8752850294113159, "num_tokens": 46051558.0, "step": 1208 }, { "epoch": 0.15379722681592672, "ewc_loss": 0.012517586350440979, "ewc_loss_diag": 8.64267349243164e-06, "ewc_loss_parallel": 3.8505939301103354e-05, "grad_norm": 2.98411226272583, "learning_rate": 5.120813904196693e-07, "loss": 0.4706, "mean_token_accuracy": 0.850469708442688, "num_tokens": 46087173.0, "step": 1209 }, { "epoch": 0.15392443709451722, "ewc_loss": 0.012519304640591145, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.9133476093411446e-05, "grad_norm": 3.0664615631103516, "learning_rate": 5.125052988554473e-07, "loss": 0.5265, "mean_token_accuracy": 0.8343775272369385, "num_tokens": 46120482.0, "step": 1210 }, { "epoch": 0.15405164737310775, "ewc_loss": 0.012537769973278046, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.931812534574419e-05, "grad_norm": 2.9666271209716797, "learning_rate": 5.12929207291225e-07, "loss": 0.4715, "mean_token_accuracy": 0.8478935956954956, "num_tokens": 46154478.0, "step": 1211 }, { "epoch": 0.15417885765169825, "ewc_loss": 0.012487782165408134, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.881825614371337e-05, "grad_norm": 2.889509439468384, "learning_rate": 5.133531157270029e-07, "loss": 0.4498, "mean_token_accuracy": 0.855392575263977, "num_tokens": 46195012.0, "step": 1212 }, { "epoch": 0.15430606793028875, "ewc_loss": 0.012490865774452686, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.884908801410347e-05, "grad_norm": 2.9248874187469482, "learning_rate": 5.137770241627808e-07, "loss": 0.4159, "mean_token_accuracy": 0.8655392527580261, "num_tokens": 46230641.0, "step": 1213 }, { "epoch": 0.15443327820887928, "ewc_loss": 0.012528044171631336, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.922087125829421e-05, "grad_norm": 3.0205893516540527, "learning_rate": 5.142009325985587e-07, "loss": 0.4548, "mean_token_accuracy": 0.8556411266326904, "num_tokens": 46268993.0, "step": 1214 }, { "epoch": 0.15456048848746978, "ewc_loss": 0.012558162212371826, "ewc_loss_diag": 8.58306884765625e-06, "ewc_loss_parallel": 3.952204860979691e-05, "grad_norm": 2.896160125732422, "learning_rate": 5.146248410343365e-07, "loss": 0.4865, "mean_token_accuracy": 0.8443031311035156, "num_tokens": 46309310.0, "step": 1215 }, { "epoch": 0.1546876987660603, "ewc_loss": 0.012616815976798534, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.88878834201023e-05, "grad_norm": 2.8633522987365723, "learning_rate": 5.150487494701145e-07, "loss": 0.4296, "mean_token_accuracy": 0.8608608245849609, "num_tokens": 46352415.0, "step": 1216 }, { "epoch": 0.1548149090446508, "ewc_loss": 0.012644903734326363, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.916876084986143e-05, "grad_norm": 2.99458384513855, "learning_rate": 5.154726579058923e-07, "loss": 0.4843, "mean_token_accuracy": 0.8442821502685547, "num_tokens": 46387838.0, "step": 1217 }, { "epoch": 0.1549421193232413, "ewc_loss": 0.012685266323387623, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.9572390960529447e-05, "grad_norm": 2.8538661003112793, "learning_rate": 5.158965663416703e-07, "loss": 0.48, "mean_token_accuracy": 0.845665454864502, "num_tokens": 46432669.0, "step": 1218 }, { "epoch": 0.15506932960183184, "ewc_loss": 0.012622401118278503, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.894373367074877e-05, "grad_norm": 2.9057860374450684, "learning_rate": 5.16320474777448e-07, "loss": 0.4369, "mean_token_accuracy": 0.858460009098053, "num_tokens": 46474383.0, "step": 1219 }, { "epoch": 0.15519653988042234, "ewc_loss": 0.012669049203395844, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.9410224417224526e-05, "grad_norm": 2.9438226222991943, "learning_rate": 5.167443832132259e-07, "loss": 0.4848, "mean_token_accuracy": 0.8436335325241089, "num_tokens": 46514263.0, "step": 1220 }, { "epoch": 0.15532375015901284, "ewc_loss": 0.01273741852492094, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.948355879401788e-05, "grad_norm": 2.939941167831421, "learning_rate": 5.171682916490038e-07, "loss": 0.4544, "mean_token_accuracy": 0.8527791500091553, "num_tokens": 46549835.0, "step": 1221 }, { "epoch": 0.15545096043760337, "ewc_loss": 0.012730980291962624, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.9419181121047586e-05, "grad_norm": 2.99960994720459, "learning_rate": 5.175922000847816e-07, "loss": 0.4406, "mean_token_accuracy": 0.8565793037414551, "num_tokens": 46584811.0, "step": 1222 }, { "epoch": 0.15557817071619387, "ewc_loss": 0.0127058494836092, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.977822416345589e-05, "grad_norm": 2.964160680770874, "learning_rate": 5.180161085205595e-07, "loss": 0.4629, "mean_token_accuracy": 0.8512322902679443, "num_tokens": 46621441.0, "step": 1223 }, { "epoch": 0.15570538099478437, "ewc_loss": 0.012685220688581467, "ewc_loss_diag": 8.702278137207031e-06, "ewc_loss_parallel": 3.957193985115737e-05, "grad_norm": 3.086146593093872, "learning_rate": 5.184400169563374e-07, "loss": 0.5146, "mean_token_accuracy": 0.8361589908599854, "num_tokens": 46661572.0, "step": 1224 }, { "epoch": 0.1558325912733749, "ewc_loss": 0.012793758884072304, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 4.0046958019956946e-05, "grad_norm": 2.9712908267974854, "learning_rate": 5.188639253921153e-07, "loss": 0.4899, "mean_token_accuracy": 0.8427004218101501, "num_tokens": 46702157.0, "step": 1225 }, { "epoch": 0.1559598015519654, "ewc_loss": 0.012716861441731453, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.927799116354436e-05, "grad_norm": 2.9621379375457764, "learning_rate": 5.192878338278932e-07, "loss": 0.4645, "mean_token_accuracy": 0.8501552939414978, "num_tokens": 46739119.0, "step": 1226 }, { "epoch": 0.1560870118305559, "ewc_loss": 0.01274160947650671, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.952546830987558e-05, "grad_norm": 2.9274539947509766, "learning_rate": 5.19711742263671e-07, "loss": 0.5245, "mean_token_accuracy": 0.8330484628677368, "num_tokens": 46783990.0, "step": 1227 }, { "epoch": 0.15621422210914643, "ewc_loss": 0.012734582647681236, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.945520438719541e-05, "grad_norm": 3.0217292308807373, "learning_rate": 5.201356506994488e-07, "loss": 0.4643, "mean_token_accuracy": 0.8454363346099854, "num_tokens": 46816131.0, "step": 1228 }, { "epoch": 0.15634143238773693, "ewc_loss": 0.012770436704158783, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.981373811257072e-05, "grad_norm": 2.963489055633545, "learning_rate": 5.205595591352268e-07, "loss": 0.545, "mean_token_accuracy": 0.8265055418014526, "num_tokens": 46856453.0, "step": 1229 }, { "epoch": 0.15646864266632743, "ewc_loss": 0.012739242054522038, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.950179598177783e-05, "grad_norm": 2.925983428955078, "learning_rate": 5.209834675710046e-07, "loss": 0.4847, "mean_token_accuracy": 0.8447384834289551, "num_tokens": 46897961.0, "step": 1230 }, { "epoch": 0.15659585294491796, "ewc_loss": 0.01274910569190979, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.960043613915332e-05, "grad_norm": 3.0659914016723633, "learning_rate": 5.214073760067825e-07, "loss": 0.558, "mean_token_accuracy": 0.8244004845619202, "num_tokens": 46931961.0, "step": 1231 }, { "epoch": 0.15672306322350846, "ewc_loss": 0.01281543355435133, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 4.026370879728347e-05, "grad_norm": 3.0074782371520996, "learning_rate": 5.218312844425604e-07, "loss": 0.4796, "mean_token_accuracy": 0.8486224412918091, "num_tokens": 46967822.0, "step": 1232 }, { "epoch": 0.15685027350209896, "ewc_loss": 0.012761453166604042, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.97239055018872e-05, "grad_norm": 2.9682226181030273, "learning_rate": 5.222551928783383e-07, "loss": 0.4618, "mean_token_accuracy": 0.8508744239807129, "num_tokens": 47004511.0, "step": 1233 }, { "epoch": 0.1569774837806895, "ewc_loss": 0.012777745723724365, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.988683602074161e-05, "grad_norm": 2.9738755226135254, "learning_rate": 5.226791013141161e-07, "loss": 0.5147, "mean_token_accuracy": 0.8379749059677124, "num_tokens": 47047244.0, "step": 1234 }, { "epoch": 0.15710469405928, "ewc_loss": 0.012799149379134178, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 4.010086922789924e-05, "grad_norm": 2.976189613342285, "learning_rate": 5.23103009749894e-07, "loss": 0.4707, "mean_token_accuracy": 0.8500673770904541, "num_tokens": 47088206.0, "step": 1235 }, { "epoch": 0.1572319043378705, "ewc_loss": 0.012791186571121216, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 4.002123750979081e-05, "grad_norm": 2.912465810775757, "learning_rate": 5.235269181856718e-07, "loss": 0.4794, "mean_token_accuracy": 0.8457324504852295, "num_tokens": 47132733.0, "step": 1236 }, { "epoch": 0.15735911461646102, "ewc_loss": 0.012774387374520302, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 3.985324292443693e-05, "grad_norm": 3.0556159019470215, "learning_rate": 5.239508266214498e-07, "loss": 0.5368, "mean_token_accuracy": 0.82724928855896, "num_tokens": 47166840.0, "step": 1237 }, { "epoch": 0.15748632489505152, "ewc_loss": 0.012841334566473961, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 4.0522714698454365e-05, "grad_norm": 3.0186767578125, "learning_rate": 5.243747350572276e-07, "loss": 0.4963, "mean_token_accuracy": 0.8427873849868774, "num_tokens": 47203444.0, "step": 1238 }, { "epoch": 0.15761353517364202, "ewc_loss": 0.012809542939066887, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 4.0204809920396656e-05, "grad_norm": 2.9863171577453613, "learning_rate": 5.247986434930056e-07, "loss": 0.4912, "mean_token_accuracy": 0.8461672067642212, "num_tokens": 47240380.0, "step": 1239 }, { "epoch": 0.15774074545223254, "ewc_loss": 0.01281256228685379, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 4.0234994230559096e-05, "grad_norm": 3.044206142425537, "learning_rate": 5.252225519287834e-07, "loss": 0.5497, "mean_token_accuracy": 0.825594425201416, "num_tokens": 47277886.0, "step": 1240 }, { "epoch": 0.15786795573082305, "ewc_loss": 0.01284603402018547, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 4.056971374666318e-05, "grad_norm": 2.987450361251831, "learning_rate": 5.256464603645613e-07, "loss": 0.4794, "mean_token_accuracy": 0.8502248525619507, "num_tokens": 47315261.0, "step": 1241 }, { "epoch": 0.15799516600941357, "ewc_loss": 0.012820182368159294, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 4.0311198972631246e-05, "grad_norm": 2.8991451263427734, "learning_rate": 5.260703688003391e-07, "loss": 0.4821, "mean_token_accuracy": 0.8464100360870361, "num_tokens": 47357489.0, "step": 1242 }, { "epoch": 0.15812237628800407, "ewc_loss": 0.012802006676793098, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 4.0129445551428944e-05, "grad_norm": 3.064558506011963, "learning_rate": 5.26494277236117e-07, "loss": 0.4944, "mean_token_accuracy": 0.8426397442817688, "num_tokens": 47389851.0, "step": 1243 }, { "epoch": 0.15824958656659457, "ewc_loss": 0.01289116870611906, "ewc_loss_diag": 8.761882781982422e-06, "ewc_loss_parallel": 4.1021063225343823e-05, "grad_norm": 3.030320405960083, "learning_rate": 5.269181856718948e-07, "loss": 0.5227, "mean_token_accuracy": 0.8332007527351379, "num_tokens": 47427491.0, "step": 1244 }, { "epoch": 0.1583767968451851, "ewc_loss": 0.012905016541481018, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 4.0549195546191186e-05, "grad_norm": 2.96085786819458, "learning_rate": 5.273420941076727e-07, "loss": 0.4639, "mean_token_accuracy": 0.8523563742637634, "num_tokens": 47462790.0, "step": 1245 }, { "epoch": 0.1585040071237756, "ewc_loss": 0.012894841842353344, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 4.044744127895683e-05, "grad_norm": 2.9260897636413574, "learning_rate": 5.277660025434506e-07, "loss": 0.4279, "mean_token_accuracy": 0.8613256812095642, "num_tokens": 47502240.0, "step": 1246 }, { "epoch": 0.1586312174023661, "ewc_loss": 0.012907445430755615, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 4.057347177877091e-05, "grad_norm": 2.9231560230255127, "learning_rate": 5.281899109792285e-07, "loss": 0.4515, "mean_token_accuracy": 0.8544793725013733, "num_tokens": 47542493.0, "step": 1247 }, { "epoch": 0.15875842768095663, "ewc_loss": 0.01291313674300909, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 4.0630391595186666e-05, "grad_norm": 2.974860668182373, "learning_rate": 5.286138194150064e-07, "loss": 0.4874, "mean_token_accuracy": 0.8456102609634399, "num_tokens": 47583378.0, "step": 1248 }, { "epoch": 0.15888563795954713, "ewc_loss": 0.012931598350405693, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 4.081500446773134e-05, "grad_norm": 2.9787986278533936, "learning_rate": 5.290377278507841e-07, "loss": 0.5132, "mean_token_accuracy": 0.841702938079834, "num_tokens": 47622671.0, "step": 1249 }, { "epoch": 0.15901284823813763, "ewc_loss": 0.01294185034930706, "ewc_loss_diag": 8.821487426757812e-06, "ewc_loss_parallel": 4.09175299864728e-05, "grad_norm": 2.9751791954040527, "learning_rate": 5.294616362865621e-07, "loss": 0.4709, "mean_token_accuracy": 0.8527491092681885, "num_tokens": 47663774.0, "step": 1250 }, { "epoch": 0.15914005851672816, "ewc_loss": 0.012996263802051544, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.0851315134204924e-05, "grad_norm": 3.001206398010254, "learning_rate": 5.298855447223399e-07, "loss": 0.542, "mean_token_accuracy": 0.8361663222312927, "num_tokens": 47706143.0, "step": 1251 }, { "epoch": 0.15926726879531866, "ewc_loss": 0.013005397282540798, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.094264659215696e-05, "grad_norm": 3.0295042991638184, "learning_rate": 5.303094531581178e-07, "loss": 0.5028, "mean_token_accuracy": 0.8399055600166321, "num_tokens": 47741197.0, "step": 1252 }, { "epoch": 0.15939447907390916, "ewc_loss": 0.01302121952176094, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.110086592845619e-05, "grad_norm": 2.9870800971984863, "learning_rate": 5.307333615938957e-07, "loss": 0.4265, "mean_token_accuracy": 0.8621058464050293, "num_tokens": 47779306.0, "step": 1253 }, { "epoch": 0.1595216893524997, "ewc_loss": 0.012999819591641426, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.088687273906544e-05, "grad_norm": 3.1192760467529297, "learning_rate": 5.311572700296736e-07, "loss": 0.5121, "mean_token_accuracy": 0.8361448049545288, "num_tokens": 47811056.0, "step": 1254 }, { "epoch": 0.1596488996310902, "ewc_loss": 0.013043882325291634, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.132749745622277e-05, "grad_norm": 3.0155739784240723, "learning_rate": 5.315811784654515e-07, "loss": 0.4205, "mean_token_accuracy": 0.8606824278831482, "num_tokens": 47847075.0, "step": 1255 }, { "epoch": 0.1597761099096807, "ewc_loss": 0.013002844527363777, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.09171188948676e-05, "grad_norm": 2.9166269302368164, "learning_rate": 5.320050869012294e-07, "loss": 0.4786, "mean_token_accuracy": 0.8480048179626465, "num_tokens": 47892639.0, "step": 1256 }, { "epoch": 0.15990332018827122, "ewc_loss": 0.012977669015526772, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.066536348545924e-05, "grad_norm": 2.977538824081421, "learning_rate": 5.324289953370071e-07, "loss": 0.5344, "mean_token_accuracy": 0.8324449062347412, "num_tokens": 47933522.0, "step": 1257 }, { "epoch": 0.16003053046686172, "ewc_loss": 0.013039674609899521, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.128542059333995e-05, "grad_norm": 2.944342851638794, "learning_rate": 5.328529037727851e-07, "loss": 0.4718, "mean_token_accuracy": 0.8473492860794067, "num_tokens": 47977307.0, "step": 1258 }, { "epoch": 0.16015774074545222, "ewc_loss": 0.013012760318815708, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.101627564523369e-05, "grad_norm": 3.0849497318267822, "learning_rate": 5.332768122085629e-07, "loss": 0.4684, "mean_token_accuracy": 0.848436713218689, "num_tokens": 48015129.0, "step": 1259 }, { "epoch": 0.16028495102404275, "ewc_loss": 0.013068560510873795, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.1574283386580646e-05, "grad_norm": 3.035503387451172, "learning_rate": 5.337007206443408e-07, "loss": 0.503, "mean_token_accuracy": 0.8371827006340027, "num_tokens": 48051904.0, "step": 1260 }, { "epoch": 0.16041216130263325, "ewc_loss": 0.013032985851168633, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.121852543903515e-05, "grad_norm": 2.962890386581421, "learning_rate": 5.341246290801187e-07, "loss": 0.4644, "mean_token_accuracy": 0.8545049428939819, "num_tokens": 48094001.0, "step": 1261 }, { "epoch": 0.16053937158122375, "ewc_loss": 0.013003617525100708, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.0924853237811476e-05, "grad_norm": 2.9783947467803955, "learning_rate": 5.345485375158966e-07, "loss": 0.4572, "mean_token_accuracy": 0.8503983020782471, "num_tokens": 48134124.0, "step": 1262 }, { "epoch": 0.16066658185981428, "ewc_loss": 0.01303580030798912, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.124666884308681e-05, "grad_norm": 3.0101757049560547, "learning_rate": 5.349724459516745e-07, "loss": 0.4474, "mean_token_accuracy": 0.8546482920646667, "num_tokens": 48170730.0, "step": 1263 }, { "epoch": 0.16079379213840478, "ewc_loss": 0.013041060417890549, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.129927037865855e-05, "grad_norm": 3.0471620559692383, "learning_rate": 5.353963543874522e-07, "loss": 0.5185, "mean_token_accuracy": 0.8363527059555054, "num_tokens": 48212640.0, "step": 1264 }, { "epoch": 0.1609210024169953, "ewc_loss": 0.013054899871349335, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.143767000641674e-05, "grad_norm": 3.114452362060547, "learning_rate": 5.358202628232301e-07, "loss": 0.4947, "mean_token_accuracy": 0.8435890078544617, "num_tokens": 48243136.0, "step": 1265 }, { "epoch": 0.1610482126955858, "ewc_loss": 0.013076895847916603, "ewc_loss_diag": 8.881092071533203e-06, "ewc_loss_parallel": 4.165762584307231e-05, "grad_norm": 3.0296525955200195, "learning_rate": 5.36244171259008e-07, "loss": 0.4761, "mean_token_accuracy": 0.8472458124160767, "num_tokens": 48282905.0, "step": 1266 }, { "epoch": 0.1611754229741763, "ewc_loss": 0.013034909963607788, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 4.123777762288228e-05, "grad_norm": 2.9939045906066895, "learning_rate": 5.366680796947859e-07, "loss": 0.5142, "mean_token_accuracy": 0.8381247520446777, "num_tokens": 48323325.0, "step": 1267 }, { "epoch": 0.16130263325276684, "ewc_loss": 0.013036516495049, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 4.125383566133678e-05, "grad_norm": 3.077680826187134, "learning_rate": 5.370919881305637e-07, "loss": 0.4911, "mean_token_accuracy": 0.8467527627944946, "num_tokens": 48358400.0, "step": 1268 }, { "epoch": 0.16142984353135734, "ewc_loss": 0.013079335913062096, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 4.1682036680867895e-05, "grad_norm": 3.033524513244629, "learning_rate": 5.375158965663417e-07, "loss": 0.4931, "mean_token_accuracy": 0.8447933197021484, "num_tokens": 48393018.0, "step": 1269 }, { "epoch": 0.16155705380994784, "ewc_loss": 0.01304644625633955, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 4.135313429287635e-05, "grad_norm": 2.979856491088867, "learning_rate": 5.379398050021195e-07, "loss": 0.4804, "mean_token_accuracy": 0.8467646837234497, "num_tokens": 48432952.0, "step": 1270 }, { "epoch": 0.16168426408853837, "ewc_loss": 0.013038665056228638, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 4.1275317926192656e-05, "grad_norm": 3.0617194175720215, "learning_rate": 5.383637134378975e-07, "loss": 0.4679, "mean_token_accuracy": 0.8477785587310791, "num_tokens": 48466913.0, "step": 1271 }, { "epoch": 0.16181147436712887, "ewc_loss": 0.013080074451863766, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 4.1689418139867485e-05, "grad_norm": 2.970336437225342, "learning_rate": 5.387876218736752e-07, "loss": 0.5484, "mean_token_accuracy": 0.822148323059082, "num_tokens": 48512658.0, "step": 1272 }, { "epoch": 0.16193868464571937, "ewc_loss": 0.013036818243563175, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 4.1256855183746666e-05, "grad_norm": 2.9300899505615234, "learning_rate": 5.392115303094531e-07, "loss": 0.4453, "mean_token_accuracy": 0.855555534362793, "num_tokens": 48552283.0, "step": 1273 }, { "epoch": 0.1620658949243099, "ewc_loss": 0.013053730130195618, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 4.1425966628594324e-05, "grad_norm": 2.9796106815338135, "learning_rate": 5.39635438745231e-07, "loss": 0.4246, "mean_token_accuracy": 0.862852931022644, "num_tokens": 48590504.0, "step": 1274 }, { "epoch": 0.1621931052029004, "ewc_loss": 0.01308770477771759, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 4.1765721107367426e-05, "grad_norm": 3.0341720581054688, "learning_rate": 5.400593471810089e-07, "loss": 0.5134, "mean_token_accuracy": 0.8360246419906616, "num_tokens": 48629993.0, "step": 1275 }, { "epoch": 0.1623203154814909, "ewc_loss": 0.01309474278241396, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 4.1836097807390615e-05, "grad_norm": 3.128056287765503, "learning_rate": 5.404832556167867e-07, "loss": 0.5155, "mean_token_accuracy": 0.8366034030914307, "num_tokens": 48665074.0, "step": 1276 }, { "epoch": 0.16244752576008142, "ewc_loss": 0.01319393515586853, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 4.221767812850885e-05, "grad_norm": 2.9710769653320312, "learning_rate": 5.409071640525647e-07, "loss": 0.4749, "mean_token_accuracy": 0.8506035208702087, "num_tokens": 48705524.0, "step": 1277 }, { "epoch": 0.16257473603867192, "ewc_loss": 0.01305619440972805, "ewc_loss_diag": 8.940696716308594e-06, "ewc_loss_parallel": 4.145061393501237e-05, "grad_norm": 3.1036579608917236, "learning_rate": 5.413310724883425e-07, "loss": 0.5225, "mean_token_accuracy": 0.8271393179893494, "num_tokens": 48744030.0, "step": 1278 }, { "epoch": 0.16270194631726242, "ewc_loss": 0.013202699832618237, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 4.230531703797169e-05, "grad_norm": 3.02415132522583, "learning_rate": 5.417549809241205e-07, "loss": 0.5109, "mean_token_accuracy": 0.8402621746063232, "num_tokens": 48788401.0, "step": 1279 }, { "epoch": 0.16282915659585295, "ewc_loss": 0.013142172247171402, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 4.1700044675963e-05, "grad_norm": 3.086982250213623, "learning_rate": 5.421788893598982e-07, "loss": 0.5162, "mean_token_accuracy": 0.8359829783439636, "num_tokens": 48823046.0, "step": 1280 }, { "epoch": 0.16295636687444345, "ewc_loss": 0.013185933232307434, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 4.2137653508689255e-05, "grad_norm": 2.9778406620025635, "learning_rate": 5.42602797795676e-07, "loss": 0.5225, "mean_token_accuracy": 0.8325246572494507, "num_tokens": 48868293.0, "step": 1281 }, { "epoch": 0.16308357715303395, "ewc_loss": 0.013203844428062439, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 4.170641113887541e-05, "grad_norm": 3.02690052986145, "learning_rate": 5.43026706231454e-07, "loss": 0.5498, "mean_token_accuracy": 0.8243749737739563, "num_tokens": 48908775.0, "step": 1282 }, { "epoch": 0.16321078743162448, "ewc_loss": 0.013179742731153965, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 4.207574966130778e-05, "grad_norm": 2.9883978366851807, "learning_rate": 5.434506146672319e-07, "loss": 0.5277, "mean_token_accuracy": 0.8318419456481934, "num_tokens": 48957640.0, "step": 1283 }, { "epoch": 0.16333799771021498, "ewc_loss": 0.013173459097743034, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 4.201290721539408e-05, "grad_norm": 3.1093924045562744, "learning_rate": 5.438745231030097e-07, "loss": 0.5192, "mean_token_accuracy": 0.8319119215011597, "num_tokens": 48993601.0, "step": 1284 }, { "epoch": 0.16346520798880548, "ewc_loss": 0.013220055028796196, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 4.247887045494281e-05, "grad_norm": 3.0559980869293213, "learning_rate": 5.442984315387876e-07, "loss": 0.4245, "mean_token_accuracy": 0.8611514568328857, "num_tokens": 49028411.0, "step": 1285 }, { "epoch": 0.163592418267396, "ewc_loss": 0.01324681006371975, "ewc_loss_diag": 9.059906005859375e-06, "ewc_loss_parallel": 4.2136063711950555e-05, "grad_norm": 2.9891302585601807, "learning_rate": 5.447223399745655e-07, "loss": 0.4885, "mean_token_accuracy": 0.8478500843048096, "num_tokens": 49067449.0, "step": 1286 }, { "epoch": 0.1637196285459865, "ewc_loss": 0.01316978968679905, "ewc_loss_diag": 9.000301361083984e-06, "ewc_loss_parallel": 4.197621819912456e-05, "grad_norm": 3.0477938652038574, "learning_rate": 5.451462484103433e-07, "loss": 0.4642, "mean_token_accuracy": 0.8519319891929626, "num_tokens": 49102650.0, "step": 1287 }, { "epoch": 0.163846838824577, "ewc_loss": 0.01333664357662201, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.2424046114319935e-05, "grad_norm": 3.0724902153015137, "learning_rate": 5.455701568461212e-07, "loss": 0.4868, "mean_token_accuracy": 0.8435699343681335, "num_tokens": 49137110.0, "step": 1288 }, { "epoch": 0.16397404910316754, "ewc_loss": 0.013332303613424301, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.238064866513014e-05, "grad_norm": 3.0907325744628906, "learning_rate": 5.45994065281899e-07, "loss": 0.5588, "mean_token_accuracy": 0.8197150230407715, "num_tokens": 49177765.0, "step": 1289 }, { "epoch": 0.16410125938175804, "ewc_loss": 0.013337668031454086, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.2434290662640706e-05, "grad_norm": 3.1406171321868896, "learning_rate": 5.46417973717677e-07, "loss": 0.4994, "mean_token_accuracy": 0.8386354446411133, "num_tokens": 49207860.0, "step": 1290 }, { "epoch": 0.16422846966034857, "ewc_loss": 0.01337392721325159, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.2796891648322344e-05, "grad_norm": 3.1320269107818604, "learning_rate": 5.468418821534548e-07, "loss": 0.4335, "mean_token_accuracy": 0.8600894212722778, "num_tokens": 49239994.0, "step": 1291 }, { "epoch": 0.16435567993893907, "ewc_loss": 0.01335771381855011, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.2634761484805495e-05, "grad_norm": 3.047442674636841, "learning_rate": 5.472657905892327e-07, "loss": 0.4476, "mean_token_accuracy": 0.8572111129760742, "num_tokens": 49273464.0, "step": 1292 }, { "epoch": 0.16448289021752957, "ewc_loss": 0.013334870338439941, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.2406321881571785e-05, "grad_norm": 3.050081729888916, "learning_rate": 5.476896990250106e-07, "loss": 0.5606, "mean_token_accuracy": 0.8223982453346252, "num_tokens": 49313144.0, "step": 1293 }, { "epoch": 0.1646101004961201, "ewc_loss": 0.013362055644392967, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.267816984793171e-05, "grad_norm": 2.993900775909424, "learning_rate": 5.481136074607885e-07, "loss": 0.4471, "mean_token_accuracy": 0.852250874042511, "num_tokens": 49351392.0, "step": 1294 }, { "epoch": 0.1647373107747106, "ewc_loss": 0.01333528570830822, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.2410472815390676e-05, "grad_norm": 2.968613862991333, "learning_rate": 5.485375158965663e-07, "loss": 0.451, "mean_token_accuracy": 0.855237603187561, "num_tokens": 49396726.0, "step": 1295 }, { "epoch": 0.1648645210533011, "ewc_loss": 0.013350866734981537, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.256628380971961e-05, "grad_norm": 3.021263599395752, "learning_rate": 5.489614243323442e-07, "loss": 0.4846, "mean_token_accuracy": 0.8435609340667725, "num_tokens": 49435856.0, "step": 1296 }, { "epoch": 0.16499173133189163, "ewc_loss": 0.013375725597143173, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.2814866901608184e-05, "grad_norm": 2.9774580001831055, "learning_rate": 5.49385332768122e-07, "loss": 0.4254, "mean_token_accuracy": 0.8598874807357788, "num_tokens": 49473695.0, "step": 1297 }, { "epoch": 0.16511894161048213, "ewc_loss": 0.013366248458623886, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.2720108467619866e-05, "grad_norm": 3.0986123085021973, "learning_rate": 5.498092412039e-07, "loss": 0.5497, "mean_token_accuracy": 0.8303771018981934, "num_tokens": 49509165.0, "step": 1298 }, { "epoch": 0.16524615188907263, "ewc_loss": 0.013429984450340271, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.3357467802707106e-05, "grad_norm": 2.918311834335327, "learning_rate": 5.502331496396778e-07, "loss": 0.454, "mean_token_accuracy": 0.8523435592651367, "num_tokens": 49553790.0, "step": 1299 }, { "epoch": 0.16537336216766316, "ewc_loss": 0.013345465064048767, "ewc_loss_diag": 9.119510650634766e-06, "ewc_loss_parallel": 4.251226710039191e-05, "grad_norm": 3.011812210083008, "learning_rate": 5.506570580754557e-07, "loss": 0.4783, "mean_token_accuracy": 0.8458161354064941, "num_tokens": 49593124.0, "step": 1300 }, { "epoch": 0.16550057244625366, "ewc_loss": 0.013491351157426834, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 4.336077472544275e-05, "grad_norm": 3.1558213233947754, "learning_rate": 5.510809665112336e-07, "loss": 0.4649, "mean_token_accuracy": 0.8529711961746216, "num_tokens": 49632016.0, "step": 1301 }, { "epoch": 0.16562778272484416, "ewc_loss": 0.013519925996661186, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 4.364652340882458e-05, "grad_norm": 3.0085792541503906, "learning_rate": 5.515048749470113e-07, "loss": 0.4813, "mean_token_accuracy": 0.8478173613548279, "num_tokens": 49670689.0, "step": 1302 }, { "epoch": 0.1657549930034347, "ewc_loss": 0.013439931906759739, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 4.284658643882722e-05, "grad_norm": 3.0705041885375977, "learning_rate": 5.519287833827893e-07, "loss": 0.467, "mean_token_accuracy": 0.8486214876174927, "num_tokens": 49709230.0, "step": 1303 }, { "epoch": 0.1658822032820252, "ewc_loss": 0.013516019098460674, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 4.360745879239403e-05, "grad_norm": 3.2471673488616943, "learning_rate": 5.523526918185671e-07, "loss": 0.5215, "mean_token_accuracy": 0.8327168226242065, "num_tokens": 49741648.0, "step": 1304 }, { "epoch": 0.1660094135606157, "ewc_loss": 0.013567689806222916, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 4.412416456034407e-05, "grad_norm": 3.114534616470337, "learning_rate": 5.52776600254345e-07, "loss": 0.4416, "mean_token_accuracy": 0.8540048599243164, "num_tokens": 49773906.0, "step": 1305 }, { "epoch": 0.16613662383920622, "ewc_loss": 0.01347365416586399, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 4.318381252232939e-05, "grad_norm": 3.0227162837982178, "learning_rate": 5.532005086901229e-07, "loss": 0.5014, "mean_token_accuracy": 0.8444375991821289, "num_tokens": 49812645.0, "step": 1306 }, { "epoch": 0.16626383411779672, "ewc_loss": 0.013474607840180397, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 4.319334038882516e-05, "grad_norm": 2.992058038711548, "learning_rate": 5.536244171259008e-07, "loss": 0.4695, "mean_token_accuracy": 0.8505523204803467, "num_tokens": 49854500.0, "step": 1307 }, { "epoch": 0.16639104439638722, "ewc_loss": 0.013477101922035217, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 4.321827873354778e-05, "grad_norm": 3.2349061965942383, "learning_rate": 5.540483255616786e-07, "loss": 0.5627, "mean_token_accuracy": 0.8232945203781128, "num_tokens": 49889408.0, "step": 1308 }, { "epoch": 0.16651825467497774, "ewc_loss": 0.013580736704170704, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 4.42546333943028e-05, "grad_norm": 3.0687756538391113, "learning_rate": 5.544722339974566e-07, "loss": 0.485, "mean_token_accuracy": 0.845169186592102, "num_tokens": 49923581.0, "step": 1309 }, { "epoch": 0.16664546495356825, "ewc_loss": 0.013457784429192543, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 4.3025116610806435e-05, "grad_norm": 3.0284814834594727, "learning_rate": 5.548961424332343e-07, "loss": 0.4729, "mean_token_accuracy": 0.8459089994430542, "num_tokens": 49959433.0, "step": 1310 }, { "epoch": 0.16677267523215875, "ewc_loss": 0.013477703556418419, "ewc_loss_diag": 9.179115295410156e-06, "ewc_loss_parallel": 4.322430322645232e-05, "grad_norm": 3.024261713027954, "learning_rate": 5.553200508690123e-07, "loss": 0.4416, "mean_token_accuracy": 0.8595960140228271, "num_tokens": 50000307.0, "step": 1311 }, { "epoch": 0.16689988551074927, "ewc_loss": 0.013552283868193626, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.335974881541915e-05, "grad_norm": 3.1110570430755615, "learning_rate": 5.557439593047901e-07, "loss": 0.535, "mean_token_accuracy": 0.8282309770584106, "num_tokens": 50035587.0, "step": 1312 }, { "epoch": 0.16702709578933977, "ewc_loss": 0.013584399595856667, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.36809059465304e-05, "grad_norm": 2.985778570175171, "learning_rate": 5.56167867740568e-07, "loss": 0.4741, "mean_token_accuracy": 0.848225474357605, "num_tokens": 50079425.0, "step": 1313 }, { "epoch": 0.16715430606793028, "ewc_loss": 0.013530043885111809, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.3137359170941636e-05, "grad_norm": 3.1161653995513916, "learning_rate": 5.565917761763459e-07, "loss": 0.4582, "mean_token_accuracy": 0.8537590503692627, "num_tokens": 50110674.0, "step": 1314 }, { "epoch": 0.1672815163465208, "ewc_loss": 0.013607965782284737, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.391656693769619e-05, "grad_norm": 3.0585546493530273, "learning_rate": 5.570156846121238e-07, "loss": 0.4953, "mean_token_accuracy": 0.8411489725112915, "num_tokens": 50149033.0, "step": 1315 }, { "epoch": 0.1674087266251113, "ewc_loss": 0.013555847108364105, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.339539009379223e-05, "grad_norm": 3.088193655014038, "learning_rate": 5.574395930479016e-07, "loss": 0.5162, "mean_token_accuracy": 0.8366344571113586, "num_tokens": 50185728.0, "step": 1316 }, { "epoch": 0.16753593690370183, "ewc_loss": 0.013590831309556961, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.374522541183978e-05, "grad_norm": 3.102184295654297, "learning_rate": 5.578635014836796e-07, "loss": 0.4669, "mean_token_accuracy": 0.8513688445091248, "num_tokens": 50219671.0, "step": 1317 }, { "epoch": 0.16766314718229233, "ewc_loss": 0.01358555257320404, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.369244197732769e-05, "grad_norm": 3.148709774017334, "learning_rate": 5.582874099194573e-07, "loss": 0.5234, "mean_token_accuracy": 0.8364210724830627, "num_tokens": 50251989.0, "step": 1318 }, { "epoch": 0.16779035746088283, "ewc_loss": 0.01360793225467205, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.391624315758236e-05, "grad_norm": 3.156383514404297, "learning_rate": 5.587113183552353e-07, "loss": 0.5102, "mean_token_accuracy": 0.8372688293457031, "num_tokens": 50291079.0, "step": 1319 }, { "epoch": 0.16791756773947336, "ewc_loss": 0.013601759448647499, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.3854503019247204e-05, "grad_norm": 3.011908769607544, "learning_rate": 5.591352267910131e-07, "loss": 0.4675, "mean_token_accuracy": 0.8499583601951599, "num_tokens": 50332262.0, "step": 1320 }, { "epoch": 0.16804477801806386, "ewc_loss": 0.013555574230849743, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.3392657971708104e-05, "grad_norm": 3.079664945602417, "learning_rate": 5.59559135226791e-07, "loss": 0.4972, "mean_token_accuracy": 0.8433351516723633, "num_tokens": 50371278.0, "step": 1321 }, { "epoch": 0.16817198829665436, "ewc_loss": 0.013611432164907455, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.395123687572777e-05, "grad_norm": 3.117295503616333, "learning_rate": 5.599830436625689e-07, "loss": 0.4735, "mean_token_accuracy": 0.8497387766838074, "num_tokens": 50409463.0, "step": 1322 }, { "epoch": 0.1682991985752449, "ewc_loss": 0.013611964881420135, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.395656651468016e-05, "grad_norm": 3.0594217777252197, "learning_rate": 5.604069520983468e-07, "loss": 0.4685, "mean_token_accuracy": 0.852246105670929, "num_tokens": 50446745.0, "step": 1323 }, { "epoch": 0.1684264088538354, "ewc_loss": 0.013591762632131577, "ewc_loss_diag": 9.238719940185547e-06, "ewc_loss_parallel": 4.3754542275564745e-05, "grad_norm": 3.1499521732330322, "learning_rate": 5.608308605341246e-07, "loss": 0.4603, "mean_token_accuracy": 0.8511911630630493, "num_tokens": 50479725.0, "step": 1324 }, { "epoch": 0.1685536191324259, "ewc_loss": 0.013701548799872398, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 4.424204962560907e-05, "grad_norm": 3.1214423179626465, "learning_rate": 5.612547689699024e-07, "loss": 0.5222, "mean_token_accuracy": 0.8336318731307983, "num_tokens": 50514734.0, "step": 1325 }, { "epoch": 0.16868082941101642, "ewc_loss": 0.013676759786903858, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 4.399416138767265e-05, "grad_norm": 3.0457446575164795, "learning_rate": 5.616786774056803e-07, "loss": 0.5007, "mean_token_accuracy": 0.8429727554321289, "num_tokens": 50555165.0, "step": 1326 }, { "epoch": 0.16880803968960692, "ewc_loss": 0.013655735179781914, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 4.3783911678474396e-05, "grad_norm": 3.0539703369140625, "learning_rate": 5.621025858414582e-07, "loss": 0.5026, "mean_token_accuracy": 0.8393332362174988, "num_tokens": 50598251.0, "step": 1327 }, { "epoch": 0.16893524996819742, "ewc_loss": 0.013671573251485825, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 4.394229472381994e-05, "grad_norm": 3.0151267051696777, "learning_rate": 5.625264942772361e-07, "loss": 0.4942, "mean_token_accuracy": 0.8444646596908569, "num_tokens": 50638423.0, "step": 1328 }, { "epoch": 0.16906246024678795, "ewc_loss": 0.013666544109582901, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 4.389200694276951e-05, "grad_norm": 3.112917900085449, "learning_rate": 5.629504027130139e-07, "loss": 0.5508, "mean_token_accuracy": 0.8339102268218994, "num_tokens": 50678226.0, "step": 1329 }, { "epoch": 0.16918967052537845, "ewc_loss": 0.01378017757087946, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.441798591869883e-05, "grad_norm": 3.2031257152557373, "learning_rate": 5.633743111487919e-07, "loss": 0.5515, "mean_token_accuracy": 0.8233925104141235, "num_tokens": 50717542.0, "step": 1330 }, { "epoch": 0.16931688080396895, "ewc_loss": 0.01378781907260418, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.44944053015206e-05, "grad_norm": 3.124722719192505, "learning_rate": 5.637982195845697e-07, "loss": 0.4331, "mean_token_accuracy": 0.8602700233459473, "num_tokens": 50752914.0, "step": 1331 }, { "epoch": 0.16944409108255948, "ewc_loss": 0.0137337576597929, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.3953787098871544e-05, "grad_norm": 3.0461597442626953, "learning_rate": 5.642221280203476e-07, "loss": 0.4329, "mean_token_accuracy": 0.8621066212654114, "num_tokens": 50789276.0, "step": 1332 }, { "epoch": 0.16957130136114998, "ewc_loss": 0.013734593987464905, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.3962154450127855e-05, "grad_norm": 3.3147776126861572, "learning_rate": 5.646460364561254e-07, "loss": 0.4521, "mean_token_accuracy": 0.853771448135376, "num_tokens": 50820759.0, "step": 1333 }, { "epoch": 0.16969851163974048, "ewc_loss": 0.0138616394251585, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.523260940914042e-05, "grad_norm": 3.092957019805908, "learning_rate": 5.650699448919033e-07, "loss": 0.5425, "mean_token_accuracy": 0.8285765647888184, "num_tokens": 50863241.0, "step": 1334 }, { "epoch": 0.169825721918331, "ewc_loss": 0.01365387998521328, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 4.376536890049465e-05, "grad_norm": 3.1240761280059814, "learning_rate": 5.654938533276812e-07, "loss": 0.4814, "mean_token_accuracy": 0.8447613716125488, "num_tokens": 50904296.0, "step": 1335 }, { "epoch": 0.1699529321969215, "ewc_loss": 0.013716038316488266, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 4.438695032149553e-05, "grad_norm": 3.0991709232330322, "learning_rate": 5.659177617634591e-07, "loss": 0.4701, "mean_token_accuracy": 0.8508046269416809, "num_tokens": 50942180.0, "step": 1336 }, { "epoch": 0.170080142475512, "ewc_loss": 0.013691849075257778, "ewc_loss_diag": 9.298324584960938e-06, "ewc_loss_parallel": 4.414505383465439e-05, "grad_norm": 3.0497090816497803, "learning_rate": 5.663416701992369e-07, "loss": 0.4935, "mean_token_accuracy": 0.8430282473564148, "num_tokens": 50980287.0, "step": 1337 }, { "epoch": 0.17020735275410254, "ewc_loss": 0.013748283497989178, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.409904795465991e-05, "grad_norm": 3.1992220878601074, "learning_rate": 5.667655786350149e-07, "loss": 0.5551, "mean_token_accuracy": 0.817638635635376, "num_tokens": 51014218.0, "step": 1338 }, { "epoch": 0.17033456303269304, "ewc_loss": 0.013830015435814857, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.491637082537636e-05, "grad_norm": 3.0244593620300293, "learning_rate": 5.671894870707927e-07, "loss": 0.4747, "mean_token_accuracy": 0.852066159248352, "num_tokens": 51055915.0, "step": 1339 }, { "epoch": 0.17046177331128357, "ewc_loss": 0.013733090832829475, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.394711504573934e-05, "grad_norm": 3.0589053630828857, "learning_rate": 5.676133955065705e-07, "loss": 0.4194, "mean_token_accuracy": 0.8634133338928223, "num_tokens": 51090684.0, "step": 1340 }, { "epoch": 0.17058898358987407, "ewc_loss": 0.013803735375404358, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.465356687433086e-05, "grad_norm": 3.0776216983795166, "learning_rate": 5.680373039423484e-07, "loss": 0.4632, "mean_token_accuracy": 0.85100257396698, "num_tokens": 51130156.0, "step": 1341 }, { "epoch": 0.17071619386846457, "ewc_loss": 0.013801923021674156, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.463543518795632e-05, "grad_norm": 3.147803544998169, "learning_rate": 5.684612123781263e-07, "loss": 0.5044, "mean_token_accuracy": 0.837712287902832, "num_tokens": 51171282.0, "step": 1342 }, { "epoch": 0.1708434041470551, "ewc_loss": 0.013830441981554031, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.492063453653827e-05, "grad_norm": 3.1293041706085205, "learning_rate": 5.688851208139042e-07, "loss": 0.5194, "mean_token_accuracy": 0.8323107361793518, "num_tokens": 51210616.0, "step": 1343 }, { "epoch": 0.1709706144256456, "ewc_loss": 0.013805379159748554, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.4670003262581304e-05, "grad_norm": 3.076084613800049, "learning_rate": 5.69309029249682e-07, "loss": 0.5058, "mean_token_accuracy": 0.8434228897094727, "num_tokens": 51253357.0, "step": 1344 }, { "epoch": 0.1710978247042361, "ewc_loss": 0.013808520510792732, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.4701409933622926e-05, "grad_norm": 3.133978843688965, "learning_rate": 5.697329376854599e-07, "loss": 0.4875, "mean_token_accuracy": 0.8437376022338867, "num_tokens": 51292671.0, "step": 1345 }, { "epoch": 0.17122503498282662, "ewc_loss": 0.01382378488779068, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.4854059524368495e-05, "grad_norm": 3.1073973178863525, "learning_rate": 5.701568461212378e-07, "loss": 0.427, "mean_token_accuracy": 0.8636288046836853, "num_tokens": 51329145.0, "step": 1346 }, { "epoch": 0.17135224526141712, "ewc_loss": 0.013804513961076736, "ewc_loss_diag": 9.357929229736328e-06, "ewc_loss_parallel": 4.466134851099923e-05, "grad_norm": 3.0268521308898926, "learning_rate": 5.705807545570157e-07, "loss": 0.4366, "mean_token_accuracy": 0.859087347984314, "num_tokens": 51371265.0, "step": 1347 }, { "epoch": 0.17147945554000762, "ewc_loss": 0.013845648616552353, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 4.446234743227251e-05, "grad_norm": 3.1538782119750977, "learning_rate": 5.710046629927934e-07, "loss": 0.4722, "mean_token_accuracy": 0.8476147651672363, "num_tokens": 51410262.0, "step": 1348 }, { "epoch": 0.17160666581859815, "ewc_loss": 0.013912467285990715, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 4.513052772381343e-05, "grad_norm": 3.1463303565979004, "learning_rate": 5.714285714285714e-07, "loss": 0.4441, "mean_token_accuracy": 0.8575320243835449, "num_tokens": 51447796.0, "step": 1349 }, { "epoch": 0.17173387609718865, "ewc_loss": 0.013886133208870888, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 4.486718535190448e-05, "grad_norm": 3.2417550086975098, "learning_rate": 5.718524798643492e-07, "loss": 0.508, "mean_token_accuracy": 0.8388968110084534, "num_tokens": 51482677.0, "step": 1350 }, { "epoch": 0.17186108637577915, "ewc_loss": 0.013930198736488819, "ewc_loss_diag": 9.417533874511719e-06, "ewc_loss_parallel": 4.530784644884989e-05, "grad_norm": 3.0724916458129883, "learning_rate": 5.722763883001272e-07, "loss": 0.509, "mean_token_accuracy": 0.8388403654098511, "num_tokens": 51524111.0, "step": 1351 }, { "epoch": 0.17198829665436968, "ewc_loss": 0.013910487294197083, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 4.4500378862721846e-05, "grad_norm": 3.0939714908599854, "learning_rate": 5.72700296735905e-07, "loss": 0.4808, "mean_token_accuracy": 0.8445786237716675, "num_tokens": 51561008.0, "step": 1352 }, { "epoch": 0.17211550693296018, "ewc_loss": 0.013964422047138214, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 4.503972377278842e-05, "grad_norm": 3.078089714050293, "learning_rate": 5.731242051716829e-07, "loss": 0.5359, "mean_token_accuracy": 0.826881468296051, "num_tokens": 51603091.0, "step": 1353 }, { "epoch": 0.17224271721155068, "ewc_loss": 0.013961338438093662, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 4.500889190239832e-05, "grad_norm": 3.191117763519287, "learning_rate": 5.735481136074608e-07, "loss": 0.4506, "mean_token_accuracy": 0.8529043197631836, "num_tokens": 51637789.0, "step": 1354 }, { "epoch": 0.1723699274901412, "ewc_loss": 0.01400625891983509, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 4.545810224954039e-05, "grad_norm": 3.1454572677612305, "learning_rate": 5.739720220432386e-07, "loss": 0.4557, "mean_token_accuracy": 0.8507793545722961, "num_tokens": 51674207.0, "step": 1355 }, { "epoch": 0.1724971377687317, "ewc_loss": 0.013980887830257416, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 4.520438960753381e-05, "grad_norm": 3.074605941772461, "learning_rate": 5.743959304790164e-07, "loss": 0.5498, "mean_token_accuracy": 0.8280142545700073, "num_tokens": 51715109.0, "step": 1356 }, { "epoch": 0.1726243480473222, "ewc_loss": 0.013974078930914402, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 4.513629755820148e-05, "grad_norm": 3.1047651767730713, "learning_rate": 5.748198389147944e-07, "loss": 0.4979, "mean_token_accuracy": 0.8435941338539124, "num_tokens": 51753373.0, "step": 1357 }, { "epoch": 0.17275155832591274, "ewc_loss": 0.013999775052070618, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 4.539325163932517e-05, "grad_norm": 3.2410993576049805, "learning_rate": 5.752437473505722e-07, "loss": 0.5339, "mean_token_accuracy": 0.8336519002914429, "num_tokens": 51795100.0, "step": 1358 }, { "epoch": 0.17287876860450324, "ewc_loss": 0.014042602851986885, "ewc_loss_diag": 9.47713851928711e-06, "ewc_loss_parallel": 4.582153633236885e-05, "grad_norm": 3.0851755142211914, "learning_rate": 5.756676557863502e-07, "loss": 0.5087, "mean_token_accuracy": 0.8373219966888428, "num_tokens": 51831840.0, "step": 1359 }, { "epoch": 0.17300597888309374, "ewc_loss": 0.014019040390849113, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 4.497556437854655e-05, "grad_norm": 3.034085988998413, "learning_rate": 5.76091564222128e-07, "loss": 0.4611, "mean_token_accuracy": 0.853427529335022, "num_tokens": 51870227.0, "step": 1360 }, { "epoch": 0.17313318916168427, "ewc_loss": 0.01403621956706047, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 4.514735337579623e-05, "grad_norm": 3.1363234519958496, "learning_rate": 5.765154726579059e-07, "loss": 0.4861, "mean_token_accuracy": 0.8456910848617554, "num_tokens": 51908251.0, "step": 1361 }, { "epoch": 0.17326039944027477, "ewc_loss": 0.014095750637352467, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 4.5742661313852295e-05, "grad_norm": 3.1101760864257812, "learning_rate": 5.769393810936838e-07, "loss": 0.4769, "mean_token_accuracy": 0.8475602269172668, "num_tokens": 51944531.0, "step": 1362 }, { "epoch": 0.17338760971886527, "ewc_loss": 0.014065194875001907, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 4.543710747384466e-05, "grad_norm": 3.0835916996002197, "learning_rate": 5.773632895294616e-07, "loss": 0.5283, "mean_token_accuracy": 0.8306573629379272, "num_tokens": 51985742.0, "step": 1363 }, { "epoch": 0.1735148199974558, "ewc_loss": 0.0140897361561656, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 4.5682518248213455e-05, "grad_norm": 3.1166305541992188, "learning_rate": 5.777871979652394e-07, "loss": 0.4674, "mean_token_accuracy": 0.8498907089233398, "num_tokens": 52022036.0, "step": 1364 }, { "epoch": 0.1736420302760463, "ewc_loss": 0.014101581647992134, "ewc_loss_diag": 9.5367431640625e-06, "ewc_loss_parallel": 4.580097447615117e-05, "grad_norm": 3.0374279022216797, "learning_rate": 5.782111064010173e-07, "loss": 0.4915, "mean_token_accuracy": 0.842197060585022, "num_tokens": 52066077.0, "step": 1365 }, { "epoch": 0.17376924055463683, "ewc_loss": 0.01419639028608799, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 4.552835525828414e-05, "grad_norm": 3.032606363296509, "learning_rate": 5.786350148367952e-07, "loss": 0.4825, "mean_token_accuracy": 0.8470042943954468, "num_tokens": 52111280.0, "step": 1366 }, { "epoch": 0.17389645083322733, "ewc_loss": 0.014224089682102203, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 4.580535460263491e-05, "grad_norm": 3.0870447158813477, "learning_rate": 5.790589232725731e-07, "loss": 0.4811, "mean_token_accuracy": 0.8449622988700867, "num_tokens": 52153148.0, "step": 1367 }, { "epoch": 0.17402366111181783, "ewc_loss": 0.014244595542550087, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 4.6010405640117824e-05, "grad_norm": 3.2060766220092773, "learning_rate": 5.79482831708351e-07, "loss": 0.466, "mean_token_accuracy": 0.8503128886222839, "num_tokens": 52186722.0, "step": 1368 }, { "epoch": 0.17415087139040836, "ewc_loss": 0.014280389994382858, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 4.636834637494758e-05, "grad_norm": 3.1934428215026855, "learning_rate": 5.799067401441288e-07, "loss": 0.4744, "mean_token_accuracy": 0.8462467193603516, "num_tokens": 52220837.0, "step": 1369 }, { "epoch": 0.17427808166899886, "ewc_loss": 0.014250030741095543, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 4.6064757043495774e-05, "grad_norm": 3.0946455001831055, "learning_rate": 5.803306485799068e-07, "loss": 0.4601, "mean_token_accuracy": 0.8540126085281372, "num_tokens": 52257049.0, "step": 1370 }, { "epoch": 0.17440529194758936, "ewc_loss": 0.01422375813126564, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 4.580203312798403e-05, "grad_norm": 3.0660483837127686, "learning_rate": 5.807545570156845e-07, "loss": 0.4384, "mean_token_accuracy": 0.8579158186912537, "num_tokens": 52297632.0, "step": 1371 }, { "epoch": 0.1745325022261799, "ewc_loss": 0.014234323985874653, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 4.59076909464784e-05, "grad_norm": 3.2554783821105957, "learning_rate": 5.811784654514624e-07, "loss": 0.5338, "mean_token_accuracy": 0.829695463180542, "num_tokens": 52331008.0, "step": 1372 }, { "epoch": 0.1746597125047704, "ewc_loss": 0.014324285089969635, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 4.680730853579007e-05, "grad_norm": 3.021707057952881, "learning_rate": 5.816023738872403e-07, "loss": 0.4658, "mean_token_accuracy": 0.851733922958374, "num_tokens": 52369420.0, "step": 1373 }, { "epoch": 0.1747869227833609, "ewc_loss": 0.014196719974279404, "ewc_loss_diag": 9.655952453613281e-06, "ewc_loss_parallel": 4.553165126708336e-05, "grad_norm": 3.08786678314209, "learning_rate": 5.820262823230182e-07, "loss": 0.4964, "mean_token_accuracy": 0.8410581946372986, "num_tokens": 52411759.0, "step": 1374 }, { "epoch": 0.17491413306195142, "ewc_loss": 0.014392675831913948, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 4.627051021088846e-05, "grad_norm": 3.1924479007720947, "learning_rate": 5.824501907587961e-07, "loss": 0.4579, "mean_token_accuracy": 0.8528839945793152, "num_tokens": 52446459.0, "step": 1375 }, { "epoch": 0.17504134334054192, "ewc_loss": 0.01441718079149723, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 4.6515557187376544e-05, "grad_norm": 3.11403751373291, "learning_rate": 5.82874099194574e-07, "loss": 0.541, "mean_token_accuracy": 0.8274252414703369, "num_tokens": 52484615.0, "step": 1376 }, { "epoch": 0.17516855361913242, "ewc_loss": 0.014363452792167664, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 4.59782786492724e-05, "grad_norm": 3.0568771362304688, "learning_rate": 5.832980076303518e-07, "loss": 0.551, "mean_token_accuracy": 0.8242433071136475, "num_tokens": 52525195.0, "step": 1377 }, { "epoch": 0.17529576389772294, "ewc_loss": 0.014383431524038315, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 4.6178061893442646e-05, "grad_norm": 3.1711955070495605, "learning_rate": 5.837219160661297e-07, "loss": 0.4375, "mean_token_accuracy": 0.8580244183540344, "num_tokens": 52558783.0, "step": 1378 }, { "epoch": 0.17542297417631345, "ewc_loss": 0.014436950907111168, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 4.671325586969033e-05, "grad_norm": 3.0763392448425293, "learning_rate": 5.841458245019075e-07, "loss": 0.4319, "mean_token_accuracy": 0.8588082790374756, "num_tokens": 52599176.0, "step": 1379 }, { "epoch": 0.17555018445490395, "ewc_loss": 0.01436635758727789, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 4.600732427206822e-05, "grad_norm": 3.219377279281616, "learning_rate": 5.845697329376855e-07, "loss": 0.4882, "mean_token_accuracy": 0.8470268845558167, "num_tokens": 52630865.0, "step": 1380 }, { "epoch": 0.17567739473349447, "ewc_loss": 0.014457008801400661, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 4.691383946919814e-05, "grad_norm": 3.1597113609313965, "learning_rate": 5.849936413734633e-07, "loss": 0.54, "mean_token_accuracy": 0.8274660706520081, "num_tokens": 52671702.0, "step": 1381 }, { "epoch": 0.17580460501208497, "ewc_loss": 0.014464473351836205, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 4.637812889995985e-05, "grad_norm": 3.0499706268310547, "learning_rate": 5.854175498092412e-07, "loss": 0.4449, "mean_token_accuracy": 0.8579639196395874, "num_tokens": 52712288.0, "step": 1382 }, { "epoch": 0.17593181529067548, "ewc_loss": 0.014447731897234917, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 4.6210712753236294e-05, "grad_norm": 3.0486834049224854, "learning_rate": 5.858414582450191e-07, "loss": 0.4869, "mean_token_accuracy": 0.8455727696418762, "num_tokens": 52756569.0, "step": 1383 }, { "epoch": 0.176059025569266, "ewc_loss": 0.014471771195530891, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 4.645111403078772e-05, "grad_norm": 3.121288776397705, "learning_rate": 5.86265366680797e-07, "loss": 0.5098, "mean_token_accuracy": 0.8386332392692566, "num_tokens": 52796195.0, "step": 1384 }, { "epoch": 0.1761862358478565, "ewc_loss": 0.014446891844272614, "ewc_loss_diag": 9.775161743164062e-06, "ewc_loss_parallel": 4.681267455453053e-05, "grad_norm": 3.0869972705841064, "learning_rate": 5.866892751165748e-07, "loss": 0.4799, "mean_token_accuracy": 0.8478403687477112, "num_tokens": 52839687.0, "step": 1385 }, { "epoch": 0.176313446126447, "ewc_loss": 0.014485538937151432, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 4.658878606278449e-05, "grad_norm": 3.198190927505493, "learning_rate": 5.871131835523526e-07, "loss": 0.5348, "mean_token_accuracy": 0.829480767250061, "num_tokens": 52875660.0, "step": 1386 }, { "epoch": 0.17644065640503753, "ewc_loss": 0.014533395878970623, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 4.70673585368786e-05, "grad_norm": 3.06697416305542, "learning_rate": 5.875370919881305e-07, "loss": 0.4903, "mean_token_accuracy": 0.8425183892250061, "num_tokens": 52921220.0, "step": 1387 }, { "epoch": 0.17656786668362803, "ewc_loss": 0.014472956769168377, "ewc_loss_diag": 9.834766387939453e-06, "ewc_loss_parallel": 4.6462966565741226e-05, "grad_norm": 3.1808791160583496, "learning_rate": 5.879610004239084e-07, "loss": 0.5439, "mean_token_accuracy": 0.8312957286834717, "num_tokens": 52960042.0, "step": 1388 }, { "epoch": 0.17669507696221853, "ewc_loss": 0.014612666331231594, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.7249708586605266e-05, "grad_norm": 3.2003164291381836, "learning_rate": 5.883849088596863e-07, "loss": 0.5286, "mean_token_accuracy": 0.8343392014503479, "num_tokens": 53002817.0, "step": 1389 }, { "epoch": 0.17682228724080906, "ewc_loss": 0.014597365632653236, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.70967024739366e-05, "grad_norm": 3.170058012008667, "learning_rate": 5.888088172954641e-07, "loss": 0.5012, "mean_token_accuracy": 0.8419496417045593, "num_tokens": 53037751.0, "step": 1390 }, { "epoch": 0.17694949751939956, "ewc_loss": 0.014577565714716911, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.689869820140302e-05, "grad_norm": 3.118593215942383, "learning_rate": 5.892327257312421e-07, "loss": 0.4174, "mean_token_accuracy": 0.8662349581718445, "num_tokens": 53074286.0, "step": 1391 }, { "epoch": 0.1770767077979901, "ewc_loss": 0.014562144875526428, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.674449883168563e-05, "grad_norm": 3.105808973312378, "learning_rate": 5.896566341670199e-07, "loss": 0.4375, "mean_token_accuracy": 0.8552370071411133, "num_tokens": 53113525.0, "step": 1392 }, { "epoch": 0.1772039180765806, "ewc_loss": 0.014578244648873806, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.690549394581467e-05, "grad_norm": 3.142727851867676, "learning_rate": 5.900805426027977e-07, "loss": 0.4813, "mean_token_accuracy": 0.8458881378173828, "num_tokens": 53150137.0, "step": 1393 }, { "epoch": 0.1773311283551711, "ewc_loss": 0.01458598393946886, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.698288466897793e-05, "grad_norm": 3.0678367614746094, "learning_rate": 5.905044510385756e-07, "loss": 0.4351, "mean_token_accuracy": 0.8594837188720703, "num_tokens": 53189519.0, "step": 1394 }, { "epoch": 0.17745833863376162, "ewc_loss": 0.014563299715518951, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.675604941439815e-05, "grad_norm": 3.1385838985443115, "learning_rate": 5.909283594743535e-07, "loss": 0.5431, "mean_token_accuracy": 0.8278576731681824, "num_tokens": 53227452.0, "step": 1395 }, { "epoch": 0.17758554891235212, "ewc_loss": 0.014611436985433102, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.72374158562161e-05, "grad_norm": 3.130694627761841, "learning_rate": 5.913522679101314e-07, "loss": 0.4752, "mean_token_accuracy": 0.8446342349052429, "num_tokens": 53265875.0, "step": 1396 }, { "epoch": 0.17771275919094262, "ewc_loss": 0.014590305276215076, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.7026100219227374e-05, "grad_norm": 3.1047043800354004, "learning_rate": 5.917761763459093e-07, "loss": 0.5349, "mean_token_accuracy": 0.8297120332717896, "num_tokens": 53305477.0, "step": 1397 }, { "epoch": 0.17783996946953315, "ewc_loss": 0.01459791511297226, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.7102203097892925e-05, "grad_norm": 3.1193299293518066, "learning_rate": 5.922000847816871e-07, "loss": 0.456, "mean_token_accuracy": 0.8546854853630066, "num_tokens": 53342800.0, "step": 1398 }, { "epoch": 0.17796717974812365, "ewc_loss": 0.014628730714321136, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.741034717881121e-05, "grad_norm": 3.1153833866119385, "learning_rate": 5.926239932174651e-07, "loss": 0.5208, "mean_token_accuracy": 0.8355042934417725, "num_tokens": 53387476.0, "step": 1399 }, { "epoch": 0.17809439002671415, "ewc_loss": 0.014606677927076817, "ewc_loss_diag": 9.894371032714844e-06, "ewc_loss_parallel": 4.718982381746173e-05, "grad_norm": 3.1023471355438232, "learning_rate": 5.930479016532429e-07, "loss": 0.4773, "mean_token_accuracy": 0.8473643660545349, "num_tokens": 53424628.0, "step": 1400 }, { "epoch": 0.17822160030530468, "ewc_loss": 0.01474219560623169, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 4.732429806608707e-05, "grad_norm": 3.1522905826568604, "learning_rate": 5.934718100890207e-07, "loss": 0.4587, "mean_token_accuracy": 0.8500097990036011, "num_tokens": 53464459.0, "step": 1401 }, { "epoch": 0.17834881058389518, "ewc_loss": 0.014761943370103836, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 4.752177119371481e-05, "grad_norm": 3.1921701431274414, "learning_rate": 5.938957185247986e-07, "loss": 0.5076, "mean_token_accuracy": 0.8354805111885071, "num_tokens": 53501721.0, "step": 1402 }, { "epoch": 0.17847602086248568, "ewc_loss": 0.014784843660891056, "ewc_loss_diag": 1.0013580322265625e-05, "ewc_loss_parallel": 4.7750781959621236e-05, "grad_norm": 3.220431327819824, "learning_rate": 5.943196269605765e-07, "loss": 0.4688, "mean_token_accuracy": 0.8456137180328369, "num_tokens": 53536607.0, "step": 1403 }, { "epoch": 0.1786032311410762, "ewc_loss": 0.014840610325336456, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 4.7698096750536934e-05, "grad_norm": 3.210587501525879, "learning_rate": 5.947435353963544e-07, "loss": 0.5185, "mean_token_accuracy": 0.8385435342788696, "num_tokens": 53569809.0, "step": 1404 }, { "epoch": 0.1787304414196667, "ewc_loss": 0.014840472489595413, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 4.769671068061143e-05, "grad_norm": 3.179185152053833, "learning_rate": 5.951674438321323e-07, "loss": 0.4956, "mean_token_accuracy": 0.8429194688796997, "num_tokens": 53604423.0, "step": 1405 }, { "epoch": 0.1788576516982572, "ewc_loss": 0.014838270843029022, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 4.7674704546807334e-05, "grad_norm": 3.182246446609497, "learning_rate": 5.955913522679101e-07, "loss": 0.4919, "mean_token_accuracy": 0.8395544290542603, "num_tokens": 53642145.0, "step": 1406 }, { "epoch": 0.17898486197684774, "ewc_loss": 0.014843471348285675, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 4.7726702177897096e-05, "grad_norm": 3.1275110244750977, "learning_rate": 5.96015260703688e-07, "loss": 0.5659, "mean_token_accuracy": 0.8190059065818787, "num_tokens": 53682998.0, "step": 1407 }, { "epoch": 0.17911207225543824, "ewc_loss": 0.014837592840194702, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 4.766792699228972e-05, "grad_norm": 3.082935094833374, "learning_rate": 5.964391691394659e-07, "loss": 0.4998, "mean_token_accuracy": 0.8440195322036743, "num_tokens": 53729764.0, "step": 1408 }, { "epoch": 0.17923928253402874, "ewc_loss": 0.014832396060228348, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 4.761595482705161e-05, "grad_norm": 3.200439453125, "learning_rate": 5.968630775752436e-07, "loss": 0.5315, "mean_token_accuracy": 0.8313717246055603, "num_tokens": 53764522.0, "step": 1409 }, { "epoch": 0.17936649281261927, "ewc_loss": 0.014897748827934265, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 4.826947406399995e-05, "grad_norm": 3.2574408054351807, "learning_rate": 5.972869860110216e-07, "loss": 0.5146, "mean_token_accuracy": 0.8327767252922058, "num_tokens": 53799791.0, "step": 1410 }, { "epoch": 0.17949370309120977, "ewc_loss": 0.014902271330356598, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 4.831470869248733e-05, "grad_norm": 3.1565449237823486, "learning_rate": 5.977108944467994e-07, "loss": 0.5528, "mean_token_accuracy": 0.8289387822151184, "num_tokens": 53840181.0, "step": 1411 }, { "epoch": 0.17962091336980027, "ewc_loss": 0.0148408692330122, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 4.770068699144758e-05, "grad_norm": 3.102607011795044, "learning_rate": 5.981348028825774e-07, "loss": 0.4603, "mean_token_accuracy": 0.8551275730133057, "num_tokens": 53882813.0, "step": 1412 }, { "epoch": 0.1797481236483908, "ewc_loss": 0.014857500791549683, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 4.7867004468571395e-05, "grad_norm": 3.1444244384765625, "learning_rate": 5.985587113183552e-07, "loss": 0.5076, "mean_token_accuracy": 0.8392007350921631, "num_tokens": 53922198.0, "step": 1413 }, { "epoch": 0.1798753339269813, "ewc_loss": 0.014889296144247055, "ewc_loss_diag": 1.0073184967041016e-05, "ewc_loss_parallel": 4.81849601783324e-05, "grad_norm": 3.185610771179199, "learning_rate": 5.989826197541331e-07, "loss": 0.5035, "mean_token_accuracy": 0.8402404189109802, "num_tokens": 53959390.0, "step": 1414 }, { "epoch": 0.18000254420557182, "ewc_loss": 0.015020929276943207, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 4.8280588089255616e-05, "grad_norm": 3.126701593399048, "learning_rate": 5.99406528189911e-07, "loss": 0.5297, "mean_token_accuracy": 0.8372478485107422, "num_tokens": 54005707.0, "step": 1415 }, { "epoch": 0.18012975448416232, "ewc_loss": 0.014992542564868927, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 4.799672024091706e-05, "grad_norm": 3.120049238204956, "learning_rate": 5.998304366256888e-07, "loss": 0.495, "mean_token_accuracy": 0.8415355086326599, "num_tokens": 54047808.0, "step": 1416 }, { "epoch": 0.18025696476275282, "ewc_loss": 0.015015475451946259, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 4.82260475109797e-05, "grad_norm": 3.227663516998291, "learning_rate": 6.002543450614666e-07, "loss": 0.4205, "mean_token_accuracy": 0.8674171566963196, "num_tokens": 54082929.0, "step": 1417 }, { "epoch": 0.18038417504134335, "ewc_loss": 0.015042319893836975, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 4.849448669119738e-05, "grad_norm": 3.151050090789795, "learning_rate": 6.006782534972446e-07, "loss": 0.4805, "mean_token_accuracy": 0.8468724489212036, "num_tokens": 54120199.0, "step": 1418 }, { "epoch": 0.18051138531993385, "ewc_loss": 0.014987069182097912, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 4.7941983211785555e-05, "grad_norm": 3.2324306964874268, "learning_rate": 6.011021619330224e-07, "loss": 0.5161, "mean_token_accuracy": 0.8353117108345032, "num_tokens": 54158629.0, "step": 1419 }, { "epoch": 0.18063859559852435, "ewc_loss": 0.015053736977279186, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 4.860866101807915e-05, "grad_norm": 3.23099422454834, "learning_rate": 6.015260703688004e-07, "loss": 0.4956, "mean_token_accuracy": 0.840208888053894, "num_tokens": 54194772.0, "step": 1420 }, { "epoch": 0.18076580587711488, "ewc_loss": 0.015032442286610603, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 4.839571556658484e-05, "grad_norm": 3.182635545730591, "learning_rate": 6.019499788045782e-07, "loss": 0.5037, "mean_token_accuracy": 0.8419830799102783, "num_tokens": 54229825.0, "step": 1421 }, { "epoch": 0.18089301615570538, "ewc_loss": 0.01501922681927681, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 4.8263551434502006e-05, "grad_norm": 3.2506937980651855, "learning_rate": 6.023738872403561e-07, "loss": 0.4594, "mean_token_accuracy": 0.8517481684684753, "num_tokens": 54260051.0, "step": 1422 }, { "epoch": 0.18102022643429588, "ewc_loss": 0.015064546838402748, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 4.871675992035307e-05, "grad_norm": 3.143101692199707, "learning_rate": 6.02797795676134e-07, "loss": 0.486, "mean_token_accuracy": 0.8487004041671753, "num_tokens": 54297865.0, "step": 1423 }, { "epoch": 0.1811474367128864, "ewc_loss": 0.015067532658576965, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.8136262194020674e-05, "grad_norm": 3.2203826904296875, "learning_rate": 6.032217041119118e-07, "loss": 0.5005, "mean_token_accuracy": 0.8357489705085754, "num_tokens": 54334618.0, "step": 1424 }, { "epoch": 0.1812746469914769, "ewc_loss": 0.015068979002535343, "ewc_loss_diag": 1.0192394256591797e-05, "ewc_loss_parallel": 4.876107777818106e-05, "grad_norm": 3.205204963684082, "learning_rate": 6.036456125476896e-07, "loss": 0.5369, "mean_token_accuracy": 0.8283512592315674, "num_tokens": 54369761.0, "step": 1425 }, { "epoch": 0.1814018572700674, "ewc_loss": 0.01511503104120493, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.861124762101099e-05, "grad_norm": 3.139702796936035, "learning_rate": 6.040695209834675e-07, "loss": 0.4372, "mean_token_accuracy": 0.8567275404930115, "num_tokens": 54411549.0, "step": 1426 }, { "epoch": 0.18152906754865794, "ewc_loss": 0.015085067600011826, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.8311612772522494e-05, "grad_norm": 3.1423442363739014, "learning_rate": 6.044934294192454e-07, "loss": 0.523, "mean_token_accuracy": 0.8351778984069824, "num_tokens": 54449137.0, "step": 1427 }, { "epoch": 0.18165627782724844, "ewc_loss": 0.015113761648535728, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.859855107497424e-05, "grad_norm": 3.0445592403411865, "learning_rate": 6.049173378550233e-07, "loss": 0.4775, "mean_token_accuracy": 0.8498303890228271, "num_tokens": 54494604.0, "step": 1428 }, { "epoch": 0.18178348810583894, "ewc_loss": 0.015077736228704453, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.823830022360198e-05, "grad_norm": 3.1152918338775635, "learning_rate": 6.053412462908012e-07, "loss": 0.4584, "mean_token_accuracy": 0.851571798324585, "num_tokens": 54533533.0, "step": 1429 }, { "epoch": 0.18191069838442947, "ewc_loss": 0.015139262191951275, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.885355883743614e-05, "grad_norm": 3.1275594234466553, "learning_rate": 6.05765154726579e-07, "loss": 0.5297, "mean_token_accuracy": 0.8331401944160461, "num_tokens": 54575211.0, "step": 1430 }, { "epoch": 0.18203790866301997, "ewc_loss": 0.015125769190490246, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.8718629841459915e-05, "grad_norm": 3.088968276977539, "learning_rate": 6.061890631623569e-07, "loss": 0.437, "mean_token_accuracy": 0.8590762615203857, "num_tokens": 54618308.0, "step": 1431 }, { "epoch": 0.18216511894161047, "ewc_loss": 0.015125466510653496, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.871559940511361e-05, "grad_norm": 3.1183087825775146, "learning_rate": 6.066129715981347e-07, "loss": 0.4752, "mean_token_accuracy": 0.8473637700080872, "num_tokens": 54660260.0, "step": 1432 }, { "epoch": 0.182292329220201, "ewc_loss": 0.01516365073621273, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.909743802272715e-05, "grad_norm": 3.228980541229248, "learning_rate": 6.070368800339126e-07, "loss": 0.444, "mean_token_accuracy": 0.8566396236419678, "num_tokens": 54692056.0, "step": 1433 }, { "epoch": 0.1824195394987915, "ewc_loss": 0.015176838263869286, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.922931839246303e-05, "grad_norm": 3.225269079208374, "learning_rate": 6.074607884696905e-07, "loss": 0.4675, "mean_token_accuracy": 0.8484792709350586, "num_tokens": 54729252.0, "step": 1434 }, { "epoch": 0.182546749777382, "ewc_loss": 0.015164962038397789, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.9110563850263134e-05, "grad_norm": 3.225558280944824, "learning_rate": 6.078846969054684e-07, "loss": 0.4892, "mean_token_accuracy": 0.8423469066619873, "num_tokens": 54763422.0, "step": 1435 }, { "epoch": 0.18267396005597253, "ewc_loss": 0.015175551176071167, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.9216450861422345e-05, "grad_norm": 3.1960830688476562, "learning_rate": 6.083086053412463e-07, "loss": 0.4674, "mean_token_accuracy": 0.8500785231590271, "num_tokens": 54803494.0, "step": 1436 }, { "epoch": 0.18280117033456303, "ewc_loss": 0.015151025727391243, "ewc_loss_diag": 1.0251998901367188e-05, "ewc_loss_parallel": 4.897118924418464e-05, "grad_norm": 3.2456486225128174, "learning_rate": 6.087325137770242e-07, "loss": 0.4547, "mean_token_accuracy": 0.8531519174575806, "num_tokens": 54839349.0, "step": 1437 }, { "epoch": 0.18292838061315353, "ewc_loss": 0.015247666276991367, "ewc_loss_diag": 1.0311603546142578e-05, "ewc_loss_parallel": 4.932724914397113e-05, "grad_norm": 3.1970982551574707, "learning_rate": 6.09156422212802e-07, "loss": 0.5278, "mean_token_accuracy": 0.8365322947502136, "num_tokens": 54881941.0, "step": 1438 }, { "epoch": 0.18305559089174406, "ewc_loss": 0.015272567048668861, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 4.896589962299913e-05, "grad_norm": 3.178708791732788, "learning_rate": 6.095803306485799e-07, "loss": 0.4795, "mean_token_accuracy": 0.8434275984764099, "num_tokens": 54922927.0, "step": 1439 }, { "epoch": 0.18318280117033456, "ewc_loss": 0.015284983441233635, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 4.909006747766398e-05, "grad_norm": 3.1632587909698486, "learning_rate": 6.100042390843577e-07, "loss": 0.5054, "mean_token_accuracy": 0.8383364677429199, "num_tokens": 54963885.0, "step": 1440 }, { "epoch": 0.1833100114489251, "ewc_loss": 0.015285560861229897, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 4.909584822598845e-05, "grad_norm": 3.1781699657440186, "learning_rate": 6.104281475201356e-07, "loss": 0.4575, "mean_token_accuracy": 0.8517544269561768, "num_tokens": 55001284.0, "step": 1441 }, { "epoch": 0.1834372217275156, "ewc_loss": 0.01528125535696745, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 4.905278910882771e-05, "grad_norm": 3.2193830013275146, "learning_rate": 6.108520559559135e-07, "loss": 0.5159, "mean_token_accuracy": 0.8362289667129517, "num_tokens": 55034813.0, "step": 1442 }, { "epoch": 0.1835644320061061, "ewc_loss": 0.01531701348721981, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 4.941036968375556e-05, "grad_norm": 3.16540789604187, "learning_rate": 6.112759643916914e-07, "loss": 0.4227, "mean_token_accuracy": 0.8627331256866455, "num_tokens": 55070620.0, "step": 1443 }, { "epoch": 0.18369164228469662, "ewc_loss": 0.015281743369996548, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 4.905766763840802e-05, "grad_norm": 3.262634038925171, "learning_rate": 6.116998728274693e-07, "loss": 0.5036, "mean_token_accuracy": 0.8439148664474487, "num_tokens": 55108394.0, "step": 1444 }, { "epoch": 0.18381885256328712, "ewc_loss": 0.015341956168413162, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 4.965979314874858e-05, "grad_norm": 3.2016258239746094, "learning_rate": 6.121237812632472e-07, "loss": 0.5067, "mean_token_accuracy": 0.837247371673584, "num_tokens": 55148170.0, "step": 1445 }, { "epoch": 0.18394606284187762, "ewc_loss": 0.015300437808036804, "ewc_loss_diag": 1.0371208190917969e-05, "ewc_loss_parallel": 4.9244608817389235e-05, "grad_norm": 3.1338095664978027, "learning_rate": 6.125476896990249e-07, "loss": 0.4504, "mean_token_accuracy": 0.8578964471817017, "num_tokens": 55188102.0, "step": 1446 }, { "epoch": 0.18407327312046814, "ewc_loss": 0.015402313321828842, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 4.904266097582877e-05, "grad_norm": 3.1998863220214844, "learning_rate": 6.129715981348028e-07, "loss": 0.488, "mean_token_accuracy": 0.8487124443054199, "num_tokens": 55225893.0, "step": 1447 }, { "epoch": 0.18420048339905865, "ewc_loss": 0.015704233199357986, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 4.96204593218863e-05, "grad_norm": 9.497587203979492, "learning_rate": 6.133955065705807e-07, "loss": 0.5087, "mean_token_accuracy": 0.8359642624855042, "num_tokens": 55265321.0, "step": 1448 }, { "epoch": 0.18432769367764915, "ewc_loss": 0.01750538870692253, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 7.00734235579148e-05, "grad_norm": 3.9635934829711914, "learning_rate": 6.138194150063585e-07, "loss": 0.4718, "mean_token_accuracy": 0.8502614498138428, "num_tokens": 55304179.0, "step": 1449 }, { "epoch": 0.18445490395623967, "ewc_loss": 0.015728335827589035, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 5.230287933954969e-05, "grad_norm": 2.9566023349761963, "learning_rate": 6.142433234421365e-07, "loss": 0.5043, "mean_token_accuracy": 0.8387251496315002, "num_tokens": 55343266.0, "step": 1450 }, { "epoch": 0.18458211423483017, "ewc_loss": 0.015442317351698875, "ewc_loss_diag": 1.049041748046875e-05, "ewc_loss_parallel": 4.944271131535061e-05, "grad_norm": 3.473499059677124, "learning_rate": 6.146672318779143e-07, "loss": 0.5054, "mean_token_accuracy": 0.8386491537094116, "num_tokens": 55381147.0, "step": 1451 }, { "epoch": 0.18470932451342068, "ewc_loss": 0.016129091382026672, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 5.570010398514569e-05, "grad_norm": 3.4139366149902344, "learning_rate": 6.150911403136923e-07, "loss": 0.5058, "mean_token_accuracy": 0.8411760330200195, "num_tokens": 55414855.0, "step": 1452 }, { "epoch": 0.1848365347920112, "ewc_loss": 0.0157115887850523, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 5.1525064918678254e-05, "grad_norm": 3.2762115001678467, "learning_rate": 6.155150487494701e-07, "loss": 0.4312, "mean_token_accuracy": 0.860878050327301, "num_tokens": 55449699.0, "step": 1453 }, { "epoch": 0.1849637450706017, "ewc_loss": 0.015639444813132286, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 5.0803631893359125e-05, "grad_norm": 3.244964361190796, "learning_rate": 6.159389571852479e-07, "loss": 0.4586, "mean_token_accuracy": 0.8540546894073486, "num_tokens": 55493202.0, "step": 1454 }, { "epoch": 0.1850909553491922, "ewc_loss": 0.015658117830753326, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 5.09903475176543e-05, "grad_norm": 3.2230076789855957, "learning_rate": 6.163628656210258e-07, "loss": 0.4785, "mean_token_accuracy": 0.8492871522903442, "num_tokens": 55539873.0, "step": 1455 }, { "epoch": 0.18521816562778273, "ewc_loss": 0.015611980110406876, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 5.052897904533893e-05, "grad_norm": 3.3364601135253906, "learning_rate": 6.167867740568037e-07, "loss": 0.5391, "mean_token_accuracy": 0.8305236101150513, "num_tokens": 55576338.0, "step": 1456 }, { "epoch": 0.18534537590637323, "ewc_loss": 0.015661034733057022, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 5.101953138364479e-05, "grad_norm": 3.2093920707702637, "learning_rate": 6.172106824925815e-07, "loss": 0.4609, "mean_token_accuracy": 0.8504973649978638, "num_tokens": 55618330.0, "step": 1457 }, { "epoch": 0.18547258618496373, "ewc_loss": 0.015540365129709244, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 4.9812832003226504e-05, "grad_norm": 3.259904146194458, "learning_rate": 6.176345909283595e-07, "loss": 0.4774, "mean_token_accuracy": 0.8460607528686523, "num_tokens": 55657394.0, "step": 1458 }, { "epoch": 0.18559979646355426, "ewc_loss": 0.01560165360569954, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 5.042571865487844e-05, "grad_norm": 3.2602193355560303, "learning_rate": 6.180584993641373e-07, "loss": 0.4622, "mean_token_accuracy": 0.8519688248634338, "num_tokens": 55695087.0, "step": 1459 }, { "epoch": 0.18572700674214476, "ewc_loss": 0.015579266473650932, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 5.0201848353026435e-05, "grad_norm": 3.2268426418304443, "learning_rate": 6.184824077999153e-07, "loss": 0.4468, "mean_token_accuracy": 0.8563410043716431, "num_tokens": 55730852.0, "step": 1460 }, { "epoch": 0.18585421702073526, "ewc_loss": 0.015549533069133759, "ewc_loss_diag": 1.055002212524414e-05, "ewc_loss_parallel": 4.990450543118641e-05, "grad_norm": 3.3021528720855713, "learning_rate": 6.189063162356931e-07, "loss": 0.5424, "mean_token_accuracy": 0.8296918869018555, "num_tokens": 55765678.0, "step": 1461 }, { "epoch": 0.1859814272993258, "ewc_loss": 0.015654105693101883, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 5.033988782088272e-05, "grad_norm": 3.234093427658081, "learning_rate": 6.193302246714709e-07, "loss": 0.4693, "mean_token_accuracy": 0.8527222275733948, "num_tokens": 55804409.0, "step": 1462 }, { "epoch": 0.1861086375779163, "ewc_loss": 0.015601380728185177, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 4.981263555237092e-05, "grad_norm": 3.23829984664917, "learning_rate": 6.197541331072488e-07, "loss": 0.4975, "mean_token_accuracy": 0.8406268358230591, "num_tokens": 55839520.0, "step": 1463 }, { "epoch": 0.1862358478565068, "ewc_loss": 0.015621366910636425, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 5.001249519409612e-05, "grad_norm": 3.204057455062866, "learning_rate": 6.201780415430267e-07, "loss": 0.4791, "mean_token_accuracy": 0.8470364809036255, "num_tokens": 55879089.0, "step": 1464 }, { "epoch": 0.18636305813509732, "ewc_loss": 0.015621710568666458, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 5.001592944609001e-05, "grad_norm": 3.260643243789673, "learning_rate": 6.206019499788045e-07, "loss": 0.5195, "mean_token_accuracy": 0.8347762227058411, "num_tokens": 55916655.0, "step": 1465 }, { "epoch": 0.18649026841368782, "ewc_loss": 0.015646420419216156, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 5.0263028242625296e-05, "grad_norm": 3.204005241394043, "learning_rate": 6.210258584145825e-07, "loss": 0.4572, "mean_token_accuracy": 0.851830244064331, "num_tokens": 55953828.0, "step": 1466 }, { "epoch": 0.18661747869227835, "ewc_loss": 0.015628201887011528, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 5.008084190194495e-05, "grad_norm": 3.1989121437072754, "learning_rate": 6.214497668503603e-07, "loss": 0.5051, "mean_token_accuracy": 0.8354994058609009, "num_tokens": 55992037.0, "step": 1467 }, { "epoch": 0.18674468897086885, "ewc_loss": 0.015651486814022064, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 5.0313701649429277e-05, "grad_norm": 3.3594319820404053, "learning_rate": 6.218736752861383e-07, "loss": 0.4661, "mean_token_accuracy": 0.8522023558616638, "num_tokens": 56020424.0, "step": 1468 }, { "epoch": 0.18687189924945935, "ewc_loss": 0.01571049727499485, "ewc_loss_diag": 1.0609626770019531e-05, "ewc_loss_parallel": 5.09038072777912e-05, "grad_norm": 3.2211992740631104, "learning_rate": 6.22297583721916e-07, "loss": 0.5, "mean_token_accuracy": 0.8447256088256836, "num_tokens": 56060839.0, "step": 1469 }, { "epoch": 0.18699910952804988, "ewc_loss": 0.01569541171193123, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 5.014260023017414e-05, "grad_norm": 3.248483657836914, "learning_rate": 6.227214921576938e-07, "loss": 0.5281, "mean_token_accuracy": 0.8362512588500977, "num_tokens": 56099095.0, "step": 1470 }, { "epoch": 0.18712631980664038, "ewc_loss": 0.015749743208289146, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 5.068590689916164e-05, "grad_norm": 3.209876775741577, "learning_rate": 6.231454005934718e-07, "loss": 0.4264, "mean_token_accuracy": 0.8638089299201965, "num_tokens": 56133805.0, "step": 1471 }, { "epoch": 0.18725353008523088, "ewc_loss": 0.01571851409971714, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 5.037361916038208e-05, "grad_norm": 3.2274956703186035, "learning_rate": 6.235693090292496e-07, "loss": 0.5446, "mean_token_accuracy": 0.8305264115333557, "num_tokens": 56173064.0, "step": 1472 }, { "epoch": 0.1873807403638214, "ewc_loss": 0.01575782708823681, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 5.0766742788255215e-05, "grad_norm": 3.147779703140259, "learning_rate": 6.239932174650275e-07, "loss": 0.4269, "mean_token_accuracy": 0.8649572134017944, "num_tokens": 56213285.0, "step": 1473 }, { "epoch": 0.1875079506424119, "ewc_loss": 0.015717150643467903, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 5.035998037783429e-05, "grad_norm": 3.322683334350586, "learning_rate": 6.244171259008054e-07, "loss": 0.5083, "mean_token_accuracy": 0.836800754070282, "num_tokens": 56248218.0, "step": 1474 }, { "epoch": 0.1876351609210024, "ewc_loss": 0.015820136293768883, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 5.1389841246418655e-05, "grad_norm": 3.1807796955108643, "learning_rate": 6.248410343365833e-07, "loss": 0.4523, "mean_token_accuracy": 0.8551948070526123, "num_tokens": 56286098.0, "step": 1475 }, { "epoch": 0.18776237119959294, "ewc_loss": 0.015708550810813904, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 5.0273974920855835e-05, "grad_norm": 3.1357784271240234, "learning_rate": 6.252649427723612e-07, "loss": 0.4608, "mean_token_accuracy": 0.8551732301712036, "num_tokens": 56329732.0, "step": 1476 }, { "epoch": 0.18788958147818344, "ewc_loss": 0.015746133401989937, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 5.0649807235458866e-05, "grad_norm": 3.218386173248291, "learning_rate": 6.25688851208139e-07, "loss": 0.465, "mean_token_accuracy": 0.8523213267326355, "num_tokens": 56367838.0, "step": 1477 }, { "epoch": 0.18801679175677394, "ewc_loss": 0.01579378731548786, "ewc_loss_diag": 1.0669231414794922e-05, "ewc_loss_parallel": 5.1126349717378616e-05, "grad_norm": 3.200115203857422, "learning_rate": 6.261127596439168e-07, "loss": 0.5194, "mean_token_accuracy": 0.8333711624145508, "num_tokens": 56413059.0, "step": 1478 }, { "epoch": 0.18814400203536447, "ewc_loss": 0.015818173065781593, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.075985245639458e-05, "grad_norm": 3.181007146835327, "learning_rate": 6.265366680796948e-07, "loss": 0.4217, "mean_token_accuracy": 0.8600741028785706, "num_tokens": 56453806.0, "step": 1479 }, { "epoch": 0.18827121231395497, "ewc_loss": 0.015829743817448616, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.087556201033294e-05, "grad_norm": 3.2201688289642334, "learning_rate": 6.269605765154726e-07, "loss": 0.4691, "mean_token_accuracy": 0.8487383127212524, "num_tokens": 56497448.0, "step": 1480 }, { "epoch": 0.18839842259254547, "ewc_loss": 0.015843220055103302, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.1010330935241655e-05, "grad_norm": 3.2451963424682617, "learning_rate": 6.273844849512505e-07, "loss": 0.472, "mean_token_accuracy": 0.8486592173576355, "num_tokens": 56537731.0, "step": 1481 }, { "epoch": 0.188525632871136, "ewc_loss": 0.0158555805683136, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.11339349031914e-05, "grad_norm": 3.1537065505981445, "learning_rate": 6.278083933870284e-07, "loss": 0.4109, "mean_token_accuracy": 0.8692259192466736, "num_tokens": 56580944.0, "step": 1482 }, { "epoch": 0.1886528431497265, "ewc_loss": 0.01580190658569336, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.0597183872014284e-05, "grad_norm": 3.296962261199951, "learning_rate": 6.282323018228063e-07, "loss": 0.4429, "mean_token_accuracy": 0.8599218726158142, "num_tokens": 56616306.0, "step": 1483 }, { "epoch": 0.188780053428317, "ewc_loss": 0.015894168987870216, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.151981531525962e-05, "grad_norm": 3.1798386573791504, "learning_rate": 6.286562102585841e-07, "loss": 0.5211, "mean_token_accuracy": 0.8378643989562988, "num_tokens": 56661874.0, "step": 1484 }, { "epoch": 0.18890726370690752, "ewc_loss": 0.01580536738038063, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.0631795602384955e-05, "grad_norm": 3.2254834175109863, "learning_rate": 6.29080118694362e-07, "loss": 0.418, "mean_token_accuracy": 0.8649775981903076, "num_tokens": 56696697.0, "step": 1485 }, { "epoch": 0.18903447398549802, "ewc_loss": 0.015866342931985855, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.124155359226279e-05, "grad_norm": 3.278306484222412, "learning_rate": 6.295040271301398e-07, "loss": 0.4172, "mean_token_accuracy": 0.8637280464172363, "num_tokens": 56732581.0, "step": 1486 }, { "epoch": 0.18916168426408853, "ewc_loss": 0.01588042452931404, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.1382372475927696e-05, "grad_norm": 3.2071430683135986, "learning_rate": 6.299279355659178e-07, "loss": 0.4663, "mean_token_accuracy": 0.8520413637161255, "num_tokens": 56777905.0, "step": 1487 }, { "epoch": 0.18928889454267905, "ewc_loss": 0.01582513190805912, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.082944335299544e-05, "grad_norm": 3.2965803146362305, "learning_rate": 6.303518440016956e-07, "loss": 0.4843, "mean_token_accuracy": 0.844805121421814, "num_tokens": 56812915.0, "step": 1488 }, { "epoch": 0.18941610482126955, "ewc_loss": 0.01588674634695053, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.1445582357700914e-05, "grad_norm": 3.3279383182525635, "learning_rate": 6.307757524374735e-07, "loss": 0.4796, "mean_token_accuracy": 0.8497340083122253, "num_tokens": 56846907.0, "step": 1489 }, { "epoch": 0.18954331509986008, "ewc_loss": 0.01586483046412468, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.12264268763829e-05, "grad_norm": 3.2404396533966064, "learning_rate": 6.311996608732514e-07, "loss": 0.5087, "mean_token_accuracy": 0.8431532979011536, "num_tokens": 56884614.0, "step": 1490 }, { "epoch": 0.18967052537845058, "ewc_loss": 0.015831973403692245, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.089787009637803e-05, "grad_norm": 3.3213868141174316, "learning_rate": 6.316235693090292e-07, "loss": 0.4994, "mean_token_accuracy": 0.838951826095581, "num_tokens": 56920508.0, "step": 1491 }, { "epoch": 0.18979773565704108, "ewc_loss": 0.015896163880825043, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.1539762353058904e-05, "grad_norm": 3.249767541885376, "learning_rate": 6.320474777448071e-07, "loss": 0.5036, "mean_token_accuracy": 0.8392438888549805, "num_tokens": 56957629.0, "step": 1492 }, { "epoch": 0.1899249459356316, "ewc_loss": 0.015848373994231224, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.1061866543022916e-05, "grad_norm": 3.222926616668701, "learning_rate": 6.324713861805849e-07, "loss": 0.4609, "mean_token_accuracy": 0.8499466180801392, "num_tokens": 56997265.0, "step": 1493 }, { "epoch": 0.1900521562142221, "ewc_loss": 0.015861429274082184, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.119242632645182e-05, "grad_norm": 3.3503434658050537, "learning_rate": 6.328952946163628e-07, "loss": 0.5615, "mean_token_accuracy": 0.8252897262573242, "num_tokens": 57031519.0, "step": 1494 }, { "epoch": 0.1901793664928126, "ewc_loss": 0.015927959233522415, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.185772170079872e-05, "grad_norm": 3.221388339996338, "learning_rate": 6.333192030521407e-07, "loss": 0.4344, "mean_token_accuracy": 0.856747031211853, "num_tokens": 57067264.0, "step": 1495 }, { "epoch": 0.19030657677140314, "ewc_loss": 0.01584707386791706, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.104886076878756e-05, "grad_norm": 3.2599239349365234, "learning_rate": 6.337431114879186e-07, "loss": 0.4876, "mean_token_accuracy": 0.8458238840103149, "num_tokens": 57104906.0, "step": 1496 }, { "epoch": 0.19043378704999364, "ewc_loss": 0.0158982016146183, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.156014594831504e-05, "grad_norm": 3.2163000106811523, "learning_rate": 6.341670199236965e-07, "loss": 0.456, "mean_token_accuracy": 0.8554204702377319, "num_tokens": 57142534.0, "step": 1497 }, { "epoch": 0.19056099732858414, "ewc_loss": 0.01589033007621765, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.148142372490838e-05, "grad_norm": 3.1741840839385986, "learning_rate": 6.345909283594744e-07, "loss": 0.4367, "mean_token_accuracy": 0.8596757650375366, "num_tokens": 57188735.0, "step": 1498 }, { "epoch": 0.19068820760717467, "ewc_loss": 0.015882844105362892, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.140657231095247e-05, "grad_norm": 3.2428431510925293, "learning_rate": 6.350148367952522e-07, "loss": 0.4883, "mean_token_accuracy": 0.8382049798965454, "num_tokens": 57224610.0, "step": 1499 }, { "epoch": 0.19081541788576517, "ewc_loss": 0.015940384939312935, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.1981969590997323e-05, "grad_norm": 3.306776523590088, "learning_rate": 6.354387452310301e-07, "loss": 0.4441, "mean_token_accuracy": 0.8542660474777222, "num_tokens": 57259553.0, "step": 1500 }, { "epoch": 0.19094262816435567, "ewc_loss": 0.015941184014081955, "ewc_loss_diag": 1.0728836059570312e-05, "ewc_loss_parallel": 5.1989962230436504e-05, "grad_norm": 3.315436363220215, "learning_rate": 6.358626536668079e-07, "loss": 0.4474, "mean_token_accuracy": 0.8572816252708435, "num_tokens": 57295819.0, "step": 1501 }, { "epoch": 0.1910698384429462, "ewc_loss": 0.015990272164344788, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.1870487368432805e-05, "grad_norm": 3.1370019912719727, "learning_rate": 6.362865621025858e-07, "loss": 0.4364, "mean_token_accuracy": 0.8604435920715332, "num_tokens": 57339110.0, "step": 1502 }, { "epoch": 0.1911970487215367, "ewc_loss": 0.015920260921120644, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.117038017488085e-05, "grad_norm": 3.263991355895996, "learning_rate": 6.367104705383637e-07, "loss": 0.4652, "mean_token_accuracy": 0.8509504199028015, "num_tokens": 57377311.0, "step": 1503 }, { "epoch": 0.1913242590001272, "ewc_loss": 0.01603296585381031, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.229743692325428e-05, "grad_norm": 3.233222007751465, "learning_rate": 6.371343789741416e-07, "loss": 0.4765, "mean_token_accuracy": 0.8432093858718872, "num_tokens": 57419281.0, "step": 1504 }, { "epoch": 0.19145146927871773, "ewc_loss": 0.01596902497112751, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.165801849216223e-05, "grad_norm": 3.2005083560943604, "learning_rate": 6.375582874099195e-07, "loss": 0.4654, "mean_token_accuracy": 0.8504343628883362, "num_tokens": 57458134.0, "step": 1505 }, { "epoch": 0.19157867955730823, "ewc_loss": 0.015988539904356003, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.185317058931105e-05, "grad_norm": 3.3607287406921387, "learning_rate": 6.379821958456974e-07, "loss": 0.5184, "mean_token_accuracy": 0.8346376419067383, "num_tokens": 57492931.0, "step": 1506 }, { "epoch": 0.19170588983589873, "ewc_loss": 0.016069402918219566, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.266180232865736e-05, "grad_norm": 3.258110523223877, "learning_rate": 6.384061042814751e-07, "loss": 0.4899, "mean_token_accuracy": 0.8440242409706116, "num_tokens": 57529581.0, "step": 1507 }, { "epoch": 0.19183310011448926, "ewc_loss": 0.015991603955626488, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.188381692278199e-05, "grad_norm": 3.2439846992492676, "learning_rate": 6.38830012717253e-07, "loss": 0.4792, "mean_token_accuracy": 0.8478578925132751, "num_tokens": 57570302.0, "step": 1508 }, { "epoch": 0.19196031039307976, "ewc_loss": 0.016021210700273514, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.217987927608192e-05, "grad_norm": 3.1728909015655518, "learning_rate": 6.392539211530309e-07, "loss": 0.4879, "mean_token_accuracy": 0.84295654296875, "num_tokens": 57615974.0, "step": 1509 }, { "epoch": 0.19208752067167026, "ewc_loss": 0.01599741354584694, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.194190816837363e-05, "grad_norm": 3.3277950286865234, "learning_rate": 6.396778295888087e-07, "loss": 0.4533, "mean_token_accuracy": 0.8564058542251587, "num_tokens": 57653737.0, "step": 1510 }, { "epoch": 0.1922147309502608, "ewc_loss": 0.01607983186841011, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.2766095905099064e-05, "grad_norm": 3.245049238204956, "learning_rate": 6.401017380245867e-07, "loss": 0.4879, "mean_token_accuracy": 0.8429065346717834, "num_tokens": 57693431.0, "step": 1511 }, { "epoch": 0.1923419412288513, "ewc_loss": 0.015997031703591347, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.193809192860499e-05, "grad_norm": 3.1753451824188232, "learning_rate": 6.405256464603645e-07, "loss": 0.4711, "mean_token_accuracy": 0.8475946187973022, "num_tokens": 57736413.0, "step": 1512 }, { "epoch": 0.1924691515074418, "ewc_loss": 0.016011705622076988, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.208483344176784e-05, "grad_norm": 3.2639172077178955, "learning_rate": 6.409495548961425e-07, "loss": 0.4558, "mean_token_accuracy": 0.8549899458885193, "num_tokens": 57773293.0, "step": 1513 }, { "epoch": 0.19259636178603232, "ewc_loss": 0.016058584675192833, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.2553616114892066e-05, "grad_norm": 3.301522970199585, "learning_rate": 6.413734633319203e-07, "loss": 0.4684, "mean_token_accuracy": 0.8506656289100647, "num_tokens": 57809294.0, "step": 1514 }, { "epoch": 0.19272357206462282, "ewc_loss": 0.016055172309279442, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.251949914963916e-05, "grad_norm": 3.271857738494873, "learning_rate": 6.417973717676981e-07, "loss": 0.4404, "mean_token_accuracy": 0.8560178279876709, "num_tokens": 57843610.0, "step": 1515 }, { "epoch": 0.19285078234321335, "ewc_loss": 0.01605253666639328, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.2493149269139394e-05, "grad_norm": 3.2010772228240967, "learning_rate": 6.42221280203476e-07, "loss": 0.5018, "mean_token_accuracy": 0.8371834754943848, "num_tokens": 57893098.0, "step": 1516 }, { "epoch": 0.19297799262180385, "ewc_loss": 0.016031460836529732, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.228238660492934e-05, "grad_norm": 3.2689051628112793, "learning_rate": 6.426451886392539e-07, "loss": 0.5271, "mean_token_accuracy": 0.8367878794670105, "num_tokens": 57933098.0, "step": 1517 }, { "epoch": 0.19310520290039435, "ewc_loss": 0.0160980261862278, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.294803122524172e-05, "grad_norm": 3.2743747234344482, "learning_rate": 6.430690970750317e-07, "loss": 0.4581, "mean_token_accuracy": 0.8528831005096436, "num_tokens": 57971502.0, "step": 1518 }, { "epoch": 0.19323241317898487, "ewc_loss": 0.016063401475548744, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.2601790230255574e-05, "grad_norm": 3.3996808528900146, "learning_rate": 6.434930055108097e-07, "loss": 0.5066, "mean_token_accuracy": 0.8397255539894104, "num_tokens": 58001557.0, "step": 1519 }, { "epoch": 0.19335962345757537, "ewc_loss": 0.0161390732973814, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.335850437404588e-05, "grad_norm": 3.308128595352173, "learning_rate": 6.439169139465875e-07, "loss": 0.5157, "mean_token_accuracy": 0.8347629308700562, "num_tokens": 58038058.0, "step": 1520 }, { "epoch": 0.19348683373616588, "ewc_loss": 0.016063623130321503, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.260400575934909e-05, "grad_norm": 3.25905442237854, "learning_rate": 6.443408223823655e-07, "loss": 0.4846, "mean_token_accuracy": 0.8437684178352356, "num_tokens": 58073138.0, "step": 1521 }, { "epoch": 0.1936140440147564, "ewc_loss": 0.016073813661932945, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.270591645967215e-05, "grad_norm": 3.2220494747161865, "learning_rate": 6.447647308181432e-07, "loss": 0.4766, "mean_token_accuracy": 0.8478261232376099, "num_tokens": 58111793.0, "step": 1522 }, { "epoch": 0.1937412542933469, "ewc_loss": 0.016143355518579483, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 5.279098331811838e-05, "grad_norm": 3.1982803344726562, "learning_rate": 6.451886392539211e-07, "loss": 0.4638, "mean_token_accuracy": 0.8503284454345703, "num_tokens": 58152911.0, "step": 1523 }, { "epoch": 0.1938684645719374, "ewc_loss": 0.016088319942355156, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.2850969950668514e-05, "grad_norm": 3.254347801208496, "learning_rate": 6.45612547689699e-07, "loss": 0.4814, "mean_token_accuracy": 0.8471565842628479, "num_tokens": 58192095.0, "step": 1524 }, { "epoch": 0.19399567485052793, "ewc_loss": 0.016134776175022125, "ewc_loss_diag": 1.0788440704345703e-05, "ewc_loss_parallel": 5.33155252924189e-05, "grad_norm": 3.2224645614624023, "learning_rate": 6.460364561254769e-07, "loss": 0.4369, "mean_token_accuracy": 0.8595802783966064, "num_tokens": 58232651.0, "step": 1525 }, { "epoch": 0.19412288512911843, "ewc_loss": 0.01616714708507061, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 5.302888894220814e-05, "grad_norm": 3.22493314743042, "learning_rate": 6.464603645612547e-07, "loss": 0.435, "mean_token_accuracy": 0.8600801825523376, "num_tokens": 58270802.0, "step": 1526 }, { "epoch": 0.19425009540770893, "ewc_loss": 0.016184542328119278, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 5.3202846174826846e-05, "grad_norm": 3.21515154838562, "learning_rate": 6.468842729970327e-07, "loss": 0.4675, "mean_token_accuracy": 0.8522900938987732, "num_tokens": 58313179.0, "step": 1527 }, { "epoch": 0.19437730568629946, "ewc_loss": 0.016194891184568405, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 5.330632484401576e-05, "grad_norm": 3.3131003379821777, "learning_rate": 6.473081814328105e-07, "loss": 0.5077, "mean_token_accuracy": 0.843975841999054, "num_tokens": 58354728.0, "step": 1528 }, { "epoch": 0.19450451596488996, "ewc_loss": 0.01622353494167328, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 5.359277201932855e-05, "grad_norm": 3.210453748703003, "learning_rate": 6.477320898685885e-07, "loss": 0.489, "mean_token_accuracy": 0.8459061980247498, "num_tokens": 58398649.0, "step": 1529 }, { "epoch": 0.19463172624348046, "ewc_loss": 0.016171108931303024, "ewc_loss_diag": 1.0848045349121094e-05, "ewc_loss_parallel": 5.30685210833326e-05, "grad_norm": 3.2923898696899414, "learning_rate": 6.481559983043662e-07, "loss": 0.5007, "mean_token_accuracy": 0.8393863439559937, "num_tokens": 58438261.0, "step": 1530 }, { "epoch": 0.194758936522071, "ewc_loss": 0.016300063580274582, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.374770626076497e-05, "grad_norm": 3.2586076259613037, "learning_rate": 6.48579906740144e-07, "loss": 0.4463, "mean_token_accuracy": 0.8564965128898621, "num_tokens": 58479141.0, "step": 1531 }, { "epoch": 0.1948861468006615, "ewc_loss": 0.016257813200354576, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.3325198678066954e-05, "grad_norm": 3.265181303024292, "learning_rate": 6.49003815175922e-07, "loss": 0.415, "mean_token_accuracy": 0.8672769069671631, "num_tokens": 58515700.0, "step": 1532 }, { "epoch": 0.195013357079252, "ewc_loss": 0.016275595873594284, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.350301944417879e-05, "grad_norm": 3.3330583572387695, "learning_rate": 6.494277236116998e-07, "loss": 0.4213, "mean_token_accuracy": 0.865153431892395, "num_tokens": 58549825.0, "step": 1533 }, { "epoch": 0.19514056735784252, "ewc_loss": 0.01630472019314766, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.3794283303432167e-05, "grad_norm": 3.263322114944458, "learning_rate": 6.498516320474777e-07, "loss": 0.5609, "mean_token_accuracy": 0.825884222984314, "num_tokens": 58596110.0, "step": 1534 }, { "epoch": 0.19526777763643302, "ewc_loss": 0.01626265048980713, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.337356560630724e-05, "grad_norm": 3.324204206466675, "learning_rate": 6.502755404832556e-07, "loss": 0.471, "mean_token_accuracy": 0.8478243350982666, "num_tokens": 58632410.0, "step": 1535 }, { "epoch": 0.19539498791502352, "ewc_loss": 0.016305115073919296, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.379822323448025e-05, "grad_norm": 3.311896324157715, "learning_rate": 6.506994489190335e-07, "loss": 0.5327, "mean_token_accuracy": 0.8356714844703674, "num_tokens": 58671339.0, "step": 1536 }, { "epoch": 0.19552219819361405, "ewc_loss": 0.016298536211252213, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.373243038775399e-05, "grad_norm": 3.234311819076538, "learning_rate": 6.511233573548114e-07, "loss": 0.4334, "mean_token_accuracy": 0.860236644744873, "num_tokens": 58711786.0, "step": 1537 }, { "epoch": 0.19564940847220455, "ewc_loss": 0.016269780695438385, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.3444884542841464e-05, "grad_norm": 3.3327717781066895, "learning_rate": 6.515472657905892e-07, "loss": 0.476, "mean_token_accuracy": 0.8473467230796814, "num_tokens": 58743600.0, "step": 1538 }, { "epoch": 0.19577661875079505, "ewc_loss": 0.01632516086101532, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.399868678068742e-05, "grad_norm": 3.3408610820770264, "learning_rate": 6.51971174226367e-07, "loss": 0.5313, "mean_token_accuracy": 0.8325526714324951, "num_tokens": 58780622.0, "step": 1539 }, { "epoch": 0.19590382902938558, "ewc_loss": 0.0163152813911438, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.389988291426562e-05, "grad_norm": 3.29524827003479, "learning_rate": 6.52395082662145e-07, "loss": 0.5239, "mean_token_accuracy": 0.8332604765892029, "num_tokens": 58819370.0, "step": 1540 }, { "epoch": 0.19603103930797608, "ewc_loss": 0.016298893839120865, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.373599924496375e-05, "grad_norm": 3.2644131183624268, "learning_rate": 6.528189910979228e-07, "loss": 0.4498, "mean_token_accuracy": 0.855932354927063, "num_tokens": 58856270.0, "step": 1541 }, { "epoch": 0.1961582495865666, "ewc_loss": 0.01631004549562931, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.384752148529515e-05, "grad_norm": 3.224135637283325, "learning_rate": 6.532428995337007e-07, "loss": 0.4673, "mean_token_accuracy": 0.849898099899292, "num_tokens": 58898702.0, "step": 1542 }, { "epoch": 0.1962854598651571, "ewc_loss": 0.01629485934972763, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.3695664973929524e-05, "grad_norm": 3.321270227432251, "learning_rate": 6.536668079694786e-07, "loss": 0.4886, "mean_token_accuracy": 0.8416755199432373, "num_tokens": 58934602.0, "step": 1543 }, { "epoch": 0.1964126701437476, "ewc_loss": 0.01636694371700287, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.4416508646681905e-05, "grad_norm": 3.241748809814453, "learning_rate": 6.540907164052565e-07, "loss": 0.4765, "mean_token_accuracy": 0.8469138741493225, "num_tokens": 58977766.0, "step": 1544 }, { "epoch": 0.19653988042233814, "ewc_loss": 0.016296174377202988, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.370880899135955e-05, "grad_norm": 3.2451324462890625, "learning_rate": 6.545146248410343e-07, "loss": 0.4843, "mean_token_accuracy": 0.8452200293540955, "num_tokens": 59019649.0, "step": 1545 }, { "epoch": 0.19666709070092864, "ewc_loss": 0.016339678317308426, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.414385668700561e-05, "grad_norm": 3.3213160037994385, "learning_rate": 6.549385332768122e-07, "loss": 0.4965, "mean_token_accuracy": 0.843623697757721, "num_tokens": 59060543.0, "step": 1546 }, { "epoch": 0.19679430097951914, "ewc_loss": 0.01637374609708786, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.4484520660480484e-05, "grad_norm": 3.3268184661865234, "learning_rate": 6.5536244171259e-07, "loss": 0.496, "mean_token_accuracy": 0.8411720991134644, "num_tokens": 59100049.0, "step": 1547 }, { "epoch": 0.19692151125810967, "ewc_loss": 0.01635655015707016, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.431256067822687e-05, "grad_norm": 3.282799005508423, "learning_rate": 6.55786350148368e-07, "loss": 0.5005, "mean_token_accuracy": 0.8402784466743469, "num_tokens": 59137370.0, "step": 1548 }, { "epoch": 0.19704872153670017, "ewc_loss": 0.016336914151906967, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.4116218962008134e-05, "grad_norm": 3.340714454650879, "learning_rate": 6.562102585841458e-07, "loss": 0.4551, "mean_token_accuracy": 0.8493001461029053, "num_tokens": 59176196.0, "step": 1549 }, { "epoch": 0.19717593181529067, "ewc_loss": 0.016383789479732513, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.4584954341407865e-05, "grad_norm": 3.239725351333618, "learning_rate": 6.566341670199236e-07, "loss": 0.4527, "mean_token_accuracy": 0.8552307486534119, "num_tokens": 59216215.0, "step": 1550 }, { "epoch": 0.1973031420938812, "ewc_loss": 0.016323113813996315, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.3978204960003495e-05, "grad_norm": 3.350782871246338, "learning_rate": 6.570580754557016e-07, "loss": 0.4683, "mean_token_accuracy": 0.8498402237892151, "num_tokens": 59254173.0, "step": 1551 }, { "epoch": 0.1974303523724717, "ewc_loss": 0.016400925815105438, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.4756328609073535e-05, "grad_norm": 3.3113062381744385, "learning_rate": 6.574819838914794e-07, "loss": 0.4416, "mean_token_accuracy": 0.8585230708122253, "num_tokens": 59288496.0, "step": 1552 }, { "epoch": 0.1975575626510622, "ewc_loss": 0.016352053731679916, "ewc_loss_diag": 1.0907649993896484e-05, "ewc_loss_parallel": 5.4267613450065255e-05, "grad_norm": 3.3056349754333496, "learning_rate": 6.579058923272573e-07, "loss": 0.4986, "mean_token_accuracy": 0.8403439521789551, "num_tokens": 59329043.0, "step": 1553 }, { "epoch": 0.19768477292965272, "ewc_loss": 0.016426092013716698, "ewc_loss_diag": 1.0967254638671875e-05, "ewc_loss_parallel": 5.4397645726567134e-05, "grad_norm": 3.3815360069274902, "learning_rate": 6.583298007630351e-07, "loss": 0.4894, "mean_token_accuracy": 0.8434748649597168, "num_tokens": 59360743.0, "step": 1554 }, { "epoch": 0.19781198320824323, "ewc_loss": 0.01651746593415737, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 5.470103133120574e-05, "grad_norm": 3.24855899810791, "learning_rate": 6.58753709198813e-07, "loss": 0.488, "mean_token_accuracy": 0.8440932035446167, "num_tokens": 59404167.0, "step": 1555 }, { "epoch": 0.19793919348683373, "ewc_loss": 0.016447363421320915, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 5.400000009103678e-05, "grad_norm": 3.260068893432617, "learning_rate": 6.591776176345909e-07, "loss": 0.4554, "mean_token_accuracy": 0.8507359027862549, "num_tokens": 59447770.0, "step": 1556 }, { "epoch": 0.19806640376542425, "ewc_loss": 0.016496924683451653, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 5.44956092198845e-05, "grad_norm": 3.2576656341552734, "learning_rate": 6.596015260703688e-07, "loss": 0.4639, "mean_token_accuracy": 0.8549419045448303, "num_tokens": 59494989.0, "step": 1557 }, { "epoch": 0.19819361404401475, "ewc_loss": 0.016483183950185776, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 5.4358217312255874e-05, "grad_norm": 3.302164316177368, "learning_rate": 6.600254345061466e-07, "loss": 0.4523, "mean_token_accuracy": 0.8560612201690674, "num_tokens": 59534926.0, "step": 1558 }, { "epoch": 0.19832082432260525, "ewc_loss": 0.016522768884897232, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 5.4754051234340295e-05, "grad_norm": 3.339702606201172, "learning_rate": 6.604493429419246e-07, "loss": 0.5066, "mean_token_accuracy": 0.8390991687774658, "num_tokens": 59577832.0, "step": 1559 }, { "epoch": 0.19844803460119578, "ewc_loss": 0.016510043293237686, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 5.462680201162584e-05, "grad_norm": 3.2668206691741943, "learning_rate": 6.608732513777023e-07, "loss": 0.5141, "mean_token_accuracy": 0.8445130586624146, "num_tokens": 59624995.0, "step": 1560 }, { "epoch": 0.19857524487978628, "ewc_loss": 0.016498973593115807, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 5.4516105592483655e-05, "grad_norm": 3.297600030899048, "learning_rate": 6.612971598134803e-07, "loss": 0.4675, "mean_token_accuracy": 0.8500792980194092, "num_tokens": 59663000.0, "step": 1561 }, { "epoch": 0.19870245515837678, "ewc_loss": 0.016531553119421005, "ewc_loss_diag": 1.1026859283447266e-05, "ewc_loss_parallel": 5.484189750859514e-05, "grad_norm": 3.494532585144043, "learning_rate": 6.617210682492581e-07, "loss": 0.5199, "mean_token_accuracy": 0.834929883480072, "num_tokens": 59700241.0, "step": 1562 }, { "epoch": 0.1988296654369673, "ewc_loss": 0.016664182767271996, "ewc_loss_diag": 1.1086463928222656e-05, "ewc_loss_parallel": 5.555784809985198e-05, "grad_norm": 3.297569751739502, "learning_rate": 6.62144976685036e-07, "loss": 0.4415, "mean_token_accuracy": 0.8597620129585266, "num_tokens": 59737244.0, "step": 1563 }, { "epoch": 0.1989568757155578, "ewc_loss": 0.016537662595510483, "ewc_loss_diag": 1.1086463928222656e-05, "ewc_loss_parallel": 5.4292642744258046e-05, "grad_norm": 3.311293363571167, "learning_rate": 6.625688851208139e-07, "loss": 0.4903, "mean_token_accuracy": 0.8447189927101135, "num_tokens": 59775538.0, "step": 1564 }, { "epoch": 0.19908408599414834, "ewc_loss": 0.01660945825278759, "ewc_loss_diag": 1.1086463928222656e-05, "ewc_loss_parallel": 5.501059422385879e-05, "grad_norm": 3.45749831199646, "learning_rate": 6.629927935565918e-07, "loss": 0.4855, "mean_token_accuracy": 0.84195876121521, "num_tokens": 59810373.0, "step": 1565 }, { "epoch": 0.19921129627273884, "ewc_loss": 0.016647836193442345, "ewc_loss_diag": 1.1086463928222656e-05, "ewc_loss_parallel": 5.539438279811293e-05, "grad_norm": 3.3449695110321045, "learning_rate": 6.634167019923696e-07, "loss": 0.5175, "mean_token_accuracy": 0.8398221731185913, "num_tokens": 59849992.0, "step": 1566 }, { "epoch": 0.19933850655132934, "ewc_loss": 0.016552552580833435, "ewc_loss_diag": 1.1086463928222656e-05, "ewc_loss_parallel": 5.444153430289589e-05, "grad_norm": 3.348024845123291, "learning_rate": 6.638406104281476e-07, "loss": 0.477, "mean_token_accuracy": 0.8471872806549072, "num_tokens": 59882552.0, "step": 1567 }, { "epoch": 0.19946571682991987, "ewc_loss": 0.01660347729921341, "ewc_loss_diag": 1.1086463928222656e-05, "ewc_loss_parallel": 5.495079676620662e-05, "grad_norm": 3.239936590194702, "learning_rate": 6.642645188639253e-07, "loss": 0.4524, "mean_token_accuracy": 0.8534708023071289, "num_tokens": 59925090.0, "step": 1568 }, { "epoch": 0.19959292710851037, "ewc_loss": 0.016558755189180374, "ewc_loss_diag": 1.1086463928222656e-05, "ewc_loss_parallel": 5.450357639347203e-05, "grad_norm": 3.33555269241333, "learning_rate": 6.646884272997032e-07, "loss": 0.5002, "mean_token_accuracy": 0.8411699533462524, "num_tokens": 59965406.0, "step": 1569 }, { "epoch": 0.19972013738710087, "ewc_loss": 0.016692498698830605, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.523064828594215e-05, "grad_norm": 3.340226650238037, "learning_rate": 6.651123357354811e-07, "loss": 0.4874, "mean_token_accuracy": 0.8441561460494995, "num_tokens": 60001770.0, "step": 1570 }, { "epoch": 0.1998473476656914, "ewc_loss": 0.016665494069457054, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.496060839504935e-05, "grad_norm": 3.3364076614379883, "learning_rate": 6.655362441712589e-07, "loss": 0.5043, "mean_token_accuracy": 0.8364301919937134, "num_tokens": 60036911.0, "step": 1571 }, { "epoch": 0.1999745579442819, "ewc_loss": 0.016684114933013916, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.5146807426353917e-05, "grad_norm": 3.2618660926818848, "learning_rate": 6.659601526070369e-07, "loss": 0.5318, "mean_token_accuracy": 0.8342456817626953, "num_tokens": 60080210.0, "step": 1572 }, { "epoch": 0.2001017682228724, "ewc_loss": 0.016661694273352623, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.492260606843047e-05, "grad_norm": 3.4433085918426514, "learning_rate": 6.663840610428147e-07, "loss": 0.5652, "mean_token_accuracy": 0.8261748552322388, "num_tokens": 60111533.0, "step": 1573 }, { "epoch": 0.20022897850146293, "ewc_loss": 0.016777850687503815, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.608417995972559e-05, "grad_norm": 3.281397581100464, "learning_rate": 6.668079694785926e-07, "loss": 0.4446, "mean_token_accuracy": 0.8576599955558777, "num_tokens": 60150733.0, "step": 1574 }, { "epoch": 0.20035618878005343, "ewc_loss": 0.01665782369673252, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.4883894335944206e-05, "grad_norm": 3.3276593685150146, "learning_rate": 6.672318779143704e-07, "loss": 0.4629, "mean_token_accuracy": 0.8530311584472656, "num_tokens": 60185034.0, "step": 1575 }, { "epoch": 0.20048339905864393, "ewc_loss": 0.01676277443766594, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.593341120402329e-05, "grad_norm": 3.425863027572632, "learning_rate": 6.676557863501483e-07, "loss": 0.4931, "mean_token_accuracy": 0.8439407348632812, "num_tokens": 60216865.0, "step": 1576 }, { "epoch": 0.20061060933723446, "ewc_loss": 0.01677093468606472, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.6015011068666354e-05, "grad_norm": 3.2479679584503174, "learning_rate": 6.680796947859262e-07, "loss": 0.4818, "mean_token_accuracy": 0.8457126617431641, "num_tokens": 60257685.0, "step": 1577 }, { "epoch": 0.20073781961582496, "ewc_loss": 0.016717001795768738, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.547568071051501e-05, "grad_norm": 3.353990316390991, "learning_rate": 6.685036032217041e-07, "loss": 0.4779, "mean_token_accuracy": 0.8481734395027161, "num_tokens": 60294258.0, "step": 1578 }, { "epoch": 0.20086502989441546, "ewc_loss": 0.016793381422758102, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.62394670851063e-05, "grad_norm": 3.312286376953125, "learning_rate": 6.689275116574819e-07, "loss": 0.4662, "mean_token_accuracy": 0.8537213206291199, "num_tokens": 60330319.0, "step": 1579 }, { "epoch": 0.200992240173006, "ewc_loss": 0.016742277890443802, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.572843292611651e-05, "grad_norm": 3.304659128189087, "learning_rate": 6.693514200932599e-07, "loss": 0.4842, "mean_token_accuracy": 0.8480331897735596, "num_tokens": 60370482.0, "step": 1580 }, { "epoch": 0.2011194504515965, "ewc_loss": 0.016770565882325172, "ewc_loss_diag": 1.1146068572998047e-05, "ewc_loss_parallel": 5.6011318520177156e-05, "grad_norm": 3.321401357650757, "learning_rate": 6.697753285290377e-07, "loss": 0.489, "mean_token_accuracy": 0.8434196710586548, "num_tokens": 60412436.0, "step": 1581 }, { "epoch": 0.201246660730187, "ewc_loss": 0.016855861991643906, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.62539353268221e-05, "grad_norm": 3.3642959594726562, "learning_rate": 6.701992369648156e-07, "loss": 0.4132, "mean_token_accuracy": 0.8679808974266052, "num_tokens": 60448982.0, "step": 1582 }, { "epoch": 0.20137387100877752, "ewc_loss": 0.016872115433216095, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.6416472943965346e-05, "grad_norm": 3.293914556503296, "learning_rate": 6.706231454005934e-07, "loss": 0.4307, "mean_token_accuracy": 0.8598664999008179, "num_tokens": 60489878.0, "step": 1583 }, { "epoch": 0.20150108128736802, "ewc_loss": 0.0168196689337492, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.589200009126216e-05, "grad_norm": 3.3575892448425293, "learning_rate": 6.710470538363713e-07, "loss": 0.466, "mean_token_accuracy": 0.853670597076416, "num_tokens": 60528979.0, "step": 1584 }, { "epoch": 0.20162829156595852, "ewc_loss": 0.016859572380781174, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.6291046348633245e-05, "grad_norm": 3.400421380996704, "learning_rate": 6.714709622721492e-07, "loss": 0.4951, "mean_token_accuracy": 0.8419046401977539, "num_tokens": 60561670.0, "step": 1585 }, { "epoch": 0.20175550184454905, "ewc_loss": 0.016851797699928284, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.621329546556808e-05, "grad_norm": 3.318415880203247, "learning_rate": 6.718948707079271e-07, "loss": 0.4683, "mean_token_accuracy": 0.8494094014167786, "num_tokens": 60602061.0, "step": 1586 }, { "epoch": 0.20188271212313955, "ewc_loss": 0.0168142206966877, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.583752499660477e-05, "grad_norm": 3.2947747707366943, "learning_rate": 6.723187791437049e-07, "loss": 0.4703, "mean_token_accuracy": 0.8499165773391724, "num_tokens": 60639805.0, "step": 1587 }, { "epoch": 0.20200992240173005, "ewc_loss": 0.016827404499053955, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.596936171059497e-05, "grad_norm": 3.264122247695923, "learning_rate": 6.727426875794829e-07, "loss": 0.4978, "mean_token_accuracy": 0.8425090312957764, "num_tokens": 60681816.0, "step": 1588 }, { "epoch": 0.20213713268032057, "ewc_loss": 0.016840383410453796, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.609915388049558e-05, "grad_norm": 3.3009848594665527, "learning_rate": 6.731665960152607e-07, "loss": 0.4823, "mean_token_accuracy": 0.8432817459106445, "num_tokens": 60726343.0, "step": 1589 }, { "epoch": 0.20226434295891108, "ewc_loss": 0.01685951091349125, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.6290420616278425e-05, "grad_norm": 3.388720989227295, "learning_rate": 6.735905044510385e-07, "loss": 0.527, "mean_token_accuracy": 0.8266445398330688, "num_tokens": 60764814.0, "step": 1590 }, { "epoch": 0.2023915532375016, "ewc_loss": 0.016874495893716812, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.644027623930015e-05, "grad_norm": 3.3781421184539795, "learning_rate": 6.740144128868164e-07, "loss": 0.4935, "mean_token_accuracy": 0.843862771987915, "num_tokens": 60802384.0, "step": 1591 }, { "epoch": 0.2025187635160921, "ewc_loss": 0.01686856709420681, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.638098082272336e-05, "grad_norm": 3.3301243782043457, "learning_rate": 6.744383213225942e-07, "loss": 0.5188, "mean_token_accuracy": 0.8348409533500671, "num_tokens": 60842085.0, "step": 1592 }, { "epoch": 0.2026459737946826, "ewc_loss": 0.01684766262769699, "ewc_loss_diag": 1.1205673217773438e-05, "ewc_loss_parallel": 5.617193164653145e-05, "grad_norm": 3.361114025115967, "learning_rate": 6.748622297583722e-07, "loss": 0.4561, "mean_token_accuracy": 0.8502489328384399, "num_tokens": 60874835.0, "step": 1593 }, { "epoch": 0.20277318407327313, "ewc_loss": 0.016923708841204643, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 5.632204556604847e-05, "grad_norm": 3.341362237930298, "learning_rate": 6.7528613819415e-07, "loss": 0.4807, "mean_token_accuracy": 0.844097375869751, "num_tokens": 60910682.0, "step": 1594 }, { "epoch": 0.20290039435186363, "ewc_loss": 0.016905883327126503, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 5.614379915641621e-05, "grad_norm": 3.2791495323181152, "learning_rate": 6.757100466299279e-07, "loss": 0.4783, "mean_token_accuracy": 0.8494973182678223, "num_tokens": 60951352.0, "step": 1595 }, { "epoch": 0.20302760463045413, "ewc_loss": 0.016900133341550827, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 5.608630453934893e-05, "grad_norm": 3.586134672164917, "learning_rate": 6.761339550657058e-07, "loss": 0.5046, "mean_token_accuracy": 0.8378455638885498, "num_tokens": 60979055.0, "step": 1596 }, { "epoch": 0.20315481490904466, "ewc_loss": 0.017064420506358147, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 5.772917211288586e-05, "grad_norm": 3.311018228530884, "learning_rate": 6.765578635014837e-07, "loss": 0.4855, "mean_token_accuracy": 0.8481179475784302, "num_tokens": 61017437.0, "step": 1597 }, { "epoch": 0.20328202518763516, "ewc_loss": 0.016866786405444145, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 5.575282557401806e-05, "grad_norm": 3.334719181060791, "learning_rate": 6.769817719372614e-07, "loss": 0.4605, "mean_token_accuracy": 0.8524068593978882, "num_tokens": 61052286.0, "step": 1598 }, { "epoch": 0.20340923546622566, "ewc_loss": 0.016968928277492523, "ewc_loss_diag": 1.1265277862548828e-05, "ewc_loss_parallel": 5.677423905581236e-05, "grad_norm": 3.410062074661255, "learning_rate": 6.774056803730394e-07, "loss": 0.5065, "mean_token_accuracy": 0.8384160995483398, "num_tokens": 61086110.0, "step": 1599 }, { "epoch": 0.2035364457448162, "ewc_loss": 0.017174962908029556, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 5.700354086002335e-05, "grad_norm": 3.4945719242095947, "learning_rate": 6.778295888088172e-07, "loss": 0.4776, "mean_token_accuracy": 0.8479707837104797, "num_tokens": 61124181.0, "step": 1600 }, { "epoch": 0.2036636560234067, "ewc_loss": 0.01706702634692192, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 5.714486542274244e-05, "grad_norm": 3.3514914512634277, "learning_rate": 6.782534972445952e-07, "loss": 0.4972, "mean_token_accuracy": 0.8397172689437866, "num_tokens": 61160881.0, "step": 1601 }, { "epoch": 0.2037908663019972, "ewc_loss": 0.016985392197966576, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 5.6328535720240325e-05, "grad_norm": 3.3421618938446045, "learning_rate": 6.78677405680373e-07, "loss": 0.4956, "mean_token_accuracy": 0.8443757891654968, "num_tokens": 61200755.0, "step": 1602 }, { "epoch": 0.20391807658058772, "ewc_loss": 0.017036311328411102, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 5.6837729061953723e-05, "grad_norm": 3.3260881900787354, "learning_rate": 6.791013141161509e-07, "loss": 0.4862, "mean_token_accuracy": 0.8418363928794861, "num_tokens": 61240045.0, "step": 1603 }, { "epoch": 0.20404528685917822, "ewc_loss": 0.017034584656357765, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 5.682045593857765e-05, "grad_norm": 3.3219375610351562, "learning_rate": 6.795252225519288e-07, "loss": 0.5115, "mean_token_accuracy": 0.8351919651031494, "num_tokens": 61282832.0, "step": 1604 }, { "epoch": 0.20417249713776872, "ewc_loss": 0.017039284110069275, "ewc_loss_diag": 1.1324882507324219e-05, "ewc_loss_parallel": 5.686745498678647e-05, "grad_norm": 3.3194472789764404, "learning_rate": 6.799491309877067e-07, "loss": 0.4855, "mean_token_accuracy": 0.8426579236984253, "num_tokens": 61319841.0, "step": 1605 }, { "epoch": 0.20429970741635925, "ewc_loss": 0.017101885750889778, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 5.6883109209593385e-05, "grad_norm": 3.4223880767822266, "learning_rate": 6.803730394234844e-07, "loss": 0.4533, "mean_token_accuracy": 0.8553994297981262, "num_tokens": 61352732.0, "step": 1606 }, { "epoch": 0.20442691769494975, "ewc_loss": 0.01714586466550827, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 5.732289719162509e-05, "grad_norm": 3.3920750617980957, "learning_rate": 6.807969478592624e-07, "loss": 0.4869, "mean_token_accuracy": 0.8466871976852417, "num_tokens": 61388324.0, "step": 1607 }, { "epoch": 0.20455412797354025, "ewc_loss": 0.01711391657590866, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 5.7003420806722715e-05, "grad_norm": 3.314131498336792, "learning_rate": 6.812208562950402e-07, "loss": 0.5111, "mean_token_accuracy": 0.8432415127754211, "num_tokens": 61431222.0, "step": 1608 }, { "epoch": 0.20468133825213078, "ewc_loss": 0.01716301031410694, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 5.688401506631635e-05, "grad_norm": 3.3082547187805176, "learning_rate": 6.816447647308182e-07, "loss": 0.5397, "mean_token_accuracy": 0.8290741443634033, "num_tokens": 61475617.0, "step": 1609 }, { "epoch": 0.20480854853072128, "ewc_loss": 0.0171902384608984, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 5.71562850382179e-05, "grad_norm": 3.3471972942352295, "learning_rate": 6.82068673166596e-07, "loss": 0.4684, "mean_token_accuracy": 0.8520709276199341, "num_tokens": 61511338.0, "step": 1610 }, { "epoch": 0.20493575880931178, "ewc_loss": 0.017137864604592323, "ewc_loss_diag": 1.138448715209961e-05, "ewc_loss_parallel": 5.724290167563595e-05, "grad_norm": 3.369541645050049, "learning_rate": 6.824925816023738e-07, "loss": 0.4893, "mean_token_accuracy": 0.8422166109085083, "num_tokens": 61549093.0, "step": 1611 }, { "epoch": 0.2050629690879023, "ewc_loss": 0.01720777153968811, "ewc_loss_diag": 1.1444091796875e-05, "ewc_loss_parallel": 5.73316247027833e-05, "grad_norm": 3.264030933380127, "learning_rate": 6.829164900381518e-07, "loss": 0.4585, "mean_token_accuracy": 0.8532184958457947, "num_tokens": 61593843.0, "step": 1612 }, { "epoch": 0.2051901793664928, "ewc_loss": 0.01717543415725231, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.700825204257853e-05, "grad_norm": 3.4072248935699463, "learning_rate": 6.833403984739295e-07, "loss": 0.4688, "mean_token_accuracy": 0.8483250737190247, "num_tokens": 61629362.0, "step": 1613 }, { "epoch": 0.2053173896450833, "ewc_loss": 0.017258619889616966, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.784010500065051e-05, "grad_norm": 3.3910505771636963, "learning_rate": 6.837643069097074e-07, "loss": 0.5498, "mean_token_accuracy": 0.8266969919204712, "num_tokens": 61667037.0, "step": 1614 }, { "epoch": 0.20544459992367384, "ewc_loss": 0.01721934974193573, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.7447403378318995e-05, "grad_norm": 3.37507963180542, "learning_rate": 6.841882153454853e-07, "loss": 0.4664, "mean_token_accuracy": 0.8482488393783569, "num_tokens": 61701976.0, "step": 1615 }, { "epoch": 0.20557181020226434, "ewc_loss": 0.017225190997123718, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.750581112806685e-05, "grad_norm": 3.3580291271209717, "learning_rate": 6.846121237812632e-07, "loss": 0.4344, "mean_token_accuracy": 0.8610897660255432, "num_tokens": 61739068.0, "step": 1616 }, { "epoch": 0.20569902048085487, "ewc_loss": 0.01721246726810932, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.737857645726763e-05, "grad_norm": 3.337697744369507, "learning_rate": 6.850360322170411e-07, "loss": 0.4541, "mean_token_accuracy": 0.8548614382743835, "num_tokens": 61781515.0, "step": 1617 }, { "epoch": 0.20582623075944537, "ewc_loss": 0.017223302274942398, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.7486919104121625e-05, "grad_norm": 3.4862911701202393, "learning_rate": 6.85459940652819e-07, "loss": 0.467, "mean_token_accuracy": 0.8481079936027527, "num_tokens": 61815201.0, "step": 1618 }, { "epoch": 0.20595344103803587, "ewc_loss": 0.017275922000408173, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.801312727271579e-05, "grad_norm": 3.3555381298065186, "learning_rate": 6.858838490885968e-07, "loss": 0.5226, "mean_token_accuracy": 0.8332006931304932, "num_tokens": 61856639.0, "step": 1619 }, { "epoch": 0.2060806513166264, "ewc_loss": 0.017197394743561745, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.7227858633268625e-05, "grad_norm": 3.4074606895446777, "learning_rate": 6.863077575243748e-07, "loss": 0.5336, "mean_token_accuracy": 0.8322091102600098, "num_tokens": 61894313.0, "step": 1620 }, { "epoch": 0.2062078615952169, "ewc_loss": 0.017275355756282806, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.8007470215670764e-05, "grad_norm": 3.395660638809204, "learning_rate": 6.867316659601525e-07, "loss": 0.4705, "mean_token_accuracy": 0.8475602865219116, "num_tokens": 61930748.0, "step": 1621 }, { "epoch": 0.2063350718738074, "ewc_loss": 0.017229313030838966, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.754703306593001e-05, "grad_norm": 3.3114278316497803, "learning_rate": 6.871555743959304e-07, "loss": 0.4859, "mean_token_accuracy": 0.846649169921875, "num_tokens": 61972002.0, "step": 1622 }, { "epoch": 0.20646228215239792, "ewc_loss": 0.01721995137631893, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.745341331930831e-05, "grad_norm": 3.4105353355407715, "learning_rate": 6.875794828317083e-07, "loss": 0.4614, "mean_token_accuracy": 0.854656457901001, "num_tokens": 62010141.0, "step": 1623 }, { "epoch": 0.20658949243098843, "ewc_loss": 0.017292506992816925, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.817898636450991e-05, "grad_norm": 3.3052327632904053, "learning_rate": 6.880033912674862e-07, "loss": 0.4498, "mean_token_accuracy": 0.8574909567832947, "num_tokens": 62055907.0, "step": 1624 }, { "epoch": 0.20671670270957893, "ewc_loss": 0.0172035563737154, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.7289464166387916e-05, "grad_norm": 3.3842508792877197, "learning_rate": 6.884272997032641e-07, "loss": 0.5554, "mean_token_accuracy": 0.8231366872787476, "num_tokens": 62095885.0, "step": 1625 }, { "epoch": 0.20684391298816945, "ewc_loss": 0.01729329116642475, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.8186822570860386e-05, "grad_norm": 3.42205810546875, "learning_rate": 6.88851208139042e-07, "loss": 0.4481, "mean_token_accuracy": 0.8592799305915833, "num_tokens": 62130497.0, "step": 1626 }, { "epoch": 0.20697112326675995, "ewc_loss": 0.017270533367991447, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.7959234254667535e-05, "grad_norm": 3.3658535480499268, "learning_rate": 6.892751165748198e-07, "loss": 0.5062, "mean_token_accuracy": 0.8376473784446716, "num_tokens": 62170296.0, "step": 1627 }, { "epoch": 0.20709833354535045, "ewc_loss": 0.017259903252124786, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.785292887594551e-05, "grad_norm": 3.3537042140960693, "learning_rate": 6.896990250105978e-07, "loss": 0.4471, "mean_token_accuracy": 0.8598430156707764, "num_tokens": 62210168.0, "step": 1628 }, { "epoch": 0.20722554382394098, "ewc_loss": 0.017271384596824646, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.7967754401033744e-05, "grad_norm": 3.450326681137085, "learning_rate": 6.901229334463755e-07, "loss": 0.4698, "mean_token_accuracy": 0.8484014272689819, "num_tokens": 62245251.0, "step": 1629 }, { "epoch": 0.20735275410253148, "ewc_loss": 0.017310071736574173, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.83546279813163e-05, "grad_norm": 3.351046323776245, "learning_rate": 6.905468418821534e-07, "loss": 0.4697, "mean_token_accuracy": 0.855157732963562, "num_tokens": 62286124.0, "step": 1630 }, { "epoch": 0.20747996438112198, "ewc_loss": 0.017238926142454147, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.764317756984383e-05, "grad_norm": 3.375277519226074, "learning_rate": 6.909707503179313e-07, "loss": 0.4733, "mean_token_accuracy": 0.8458796739578247, "num_tokens": 62325604.0, "step": 1631 }, { "epoch": 0.2076071746597125, "ewc_loss": 0.017290959134697914, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.8163495850749314e-05, "grad_norm": 3.412139654159546, "learning_rate": 6.913946587537091e-07, "loss": 0.4693, "mean_token_accuracy": 0.8502617478370667, "num_tokens": 62364467.0, "step": 1632 }, { "epoch": 0.207734384938303, "ewc_loss": 0.017262684181332588, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.788074486190453e-05, "grad_norm": 3.3737268447875977, "learning_rate": 6.918185671894871e-07, "loss": 0.5441, "mean_token_accuracy": 0.8273906707763672, "num_tokens": 62402075.0, "step": 1633 }, { "epoch": 0.2078615952168935, "ewc_loss": 0.017275551334023476, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.800942381029017e-05, "grad_norm": 3.4541873931884766, "learning_rate": 6.922424756252649e-07, "loss": 0.5119, "mean_token_accuracy": 0.8448077440261841, "num_tokens": 62437802.0, "step": 1634 }, { "epoch": 0.20798880549548404, "ewc_loss": 0.017306525260210037, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.831915404996835e-05, "grad_norm": 3.3879289627075195, "learning_rate": 6.926663840610428e-07, "loss": 0.43, "mean_token_accuracy": 0.8609639406204224, "num_tokens": 62476176.0, "step": 1635 }, { "epoch": 0.20811601577407454, "ewc_loss": 0.017260294407606125, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.785685061709955e-05, "grad_norm": 3.3917243480682373, "learning_rate": 6.930902924968206e-07, "loss": 0.4737, "mean_token_accuracy": 0.844900369644165, "num_tokens": 62514378.0, "step": 1636 }, { "epoch": 0.20824322605266504, "ewc_loss": 0.01728004962205887, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.805439286632463e-05, "grad_norm": 3.5198614597320557, "learning_rate": 6.935142009325985e-07, "loss": 0.4957, "mean_token_accuracy": 0.8414747714996338, "num_tokens": 62555302.0, "step": 1637 }, { "epoch": 0.20837043633125557, "ewc_loss": 0.017317822203040123, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.843212784384377e-05, "grad_norm": 3.3439395427703857, "learning_rate": 6.939381093683764e-07, "loss": 0.5136, "mean_token_accuracy": 0.8368918299674988, "num_tokens": 62594356.0, "step": 1638 }, { "epoch": 0.20849764660984607, "ewc_loss": 0.017210688441991806, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.736080129281618e-05, "grad_norm": 3.3316726684570312, "learning_rate": 6.943620178041543e-07, "loss": 0.4322, "mean_token_accuracy": 0.8627129197120667, "num_tokens": 62632903.0, "step": 1639 }, { "epoch": 0.2086248568884366, "ewc_loss": 0.017269492149353027, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.794882235932164e-05, "grad_norm": 3.440748453140259, "learning_rate": 6.947859262399321e-07, "loss": 0.482, "mean_token_accuracy": 0.8528883457183838, "num_tokens": 62668979.0, "step": 1640 }, { "epoch": 0.2087520671670271, "ewc_loss": 0.017315449193120003, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.840840458404273e-05, "grad_norm": 3.369922399520874, "learning_rate": 6.952098346757101e-07, "loss": 0.4687, "mean_token_accuracy": 0.8497354984283447, "num_tokens": 62705867.0, "step": 1641 }, { "epoch": 0.2088792774456176, "ewc_loss": 0.01725766621530056, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.783056985819712e-05, "grad_norm": 3.393584728240967, "learning_rate": 6.956337431114879e-07, "loss": 0.4632, "mean_token_accuracy": 0.8533064126968384, "num_tokens": 62744559.0, "step": 1642 }, { "epoch": 0.20900648772420813, "ewc_loss": 0.017309485003352165, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.834875628352165e-05, "grad_norm": 3.3257009983062744, "learning_rate": 6.960576515472658e-07, "loss": 0.4953, "mean_token_accuracy": 0.8430575728416443, "num_tokens": 62785221.0, "step": 1643 }, { "epoch": 0.20913369800279863, "ewc_loss": 0.01728834956884384, "ewc_loss_diag": 1.150369644165039e-05, "ewc_loss_parallel": 5.813740790472366e-05, "grad_norm": 3.456573486328125, "learning_rate": 6.964815599830436e-07, "loss": 0.4843, "mean_token_accuracy": 0.8456540703773499, "num_tokens": 62820095.0, "step": 1644 }, { "epoch": 0.20926090828138913, "ewc_loss": 0.017434002831578255, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.8983583585359156e-05, "grad_norm": 3.379509210586548, "learning_rate": 6.969054684188215e-07, "loss": 0.5343, "mean_token_accuracy": 0.826979398727417, "num_tokens": 62863414.0, "step": 1645 }, { "epoch": 0.20938811855997966, "ewc_loss": 0.017348380759358406, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.8127367083216086e-05, "grad_norm": 3.3181283473968506, "learning_rate": 6.973293768545994e-07, "loss": 0.4455, "mean_token_accuracy": 0.8573110103607178, "num_tokens": 62904647.0, "step": 1646 }, { "epoch": 0.20951532883857016, "ewc_loss": 0.017373625189065933, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.83798173465766e-05, "grad_norm": 3.34741473197937, "learning_rate": 6.977532852903773e-07, "loss": 0.4785, "mean_token_accuracy": 0.8464561700820923, "num_tokens": 62949476.0, "step": 1647 }, { "epoch": 0.20964253911716066, "ewc_loss": 0.01741344854235649, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.877804505871609e-05, "grad_norm": 3.461243152618408, "learning_rate": 6.981771937261551e-07, "loss": 0.5295, "mean_token_accuracy": 0.833121657371521, "num_tokens": 62984784.0, "step": 1648 }, { "epoch": 0.2097697493957512, "ewc_loss": 0.017457550391554832, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.921906267758459e-05, "grad_norm": 3.3514630794525146, "learning_rate": 6.986011021619331e-07, "loss": 0.4862, "mean_token_accuracy": 0.8455984592437744, "num_tokens": 63024395.0, "step": 1649 }, { "epoch": 0.2098969596743417, "ewc_loss": 0.017387162894010544, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.8515171986073256e-05, "grad_norm": 3.4557018280029297, "learning_rate": 6.990250105977109e-07, "loss": 0.5013, "mean_token_accuracy": 0.8428184986114502, "num_tokens": 63061886.0, "step": 1650 }, { "epoch": 0.2100241699529322, "ewc_loss": 0.017478549852967262, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.942904681432992e-05, "grad_norm": 3.3521196842193604, "learning_rate": 6.994489190334886e-07, "loss": 0.4689, "mean_token_accuracy": 0.8483321666717529, "num_tokens": 63104150.0, "step": 1651 }, { "epoch": 0.21015138023152272, "ewc_loss": 0.01741388440132141, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.878239244339056e-05, "grad_norm": 3.3511805534362793, "learning_rate": 6.998728274692666e-07, "loss": 0.4335, "mean_token_accuracy": 0.861686110496521, "num_tokens": 63144346.0, "step": 1652 }, { "epoch": 0.21027859051011322, "ewc_loss": 0.017444731667637825, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.90908712183591e-05, "grad_norm": 3.4328436851501465, "learning_rate": 7.002967359050444e-07, "loss": 0.4745, "mean_token_accuracy": 0.8544294834136963, "num_tokens": 63180042.0, "step": 1653 }, { "epoch": 0.21040580078870372, "ewc_loss": 0.017472129315137863, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.936485467827879e-05, "grad_norm": 3.377774238586426, "learning_rate": 7.007206443408224e-07, "loss": 0.4248, "mean_token_accuracy": 0.8623911142349243, "num_tokens": 63216570.0, "step": 1654 }, { "epoch": 0.21053301106729425, "ewc_loss": 0.017427340149879456, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.8916961279464886e-05, "grad_norm": 3.394418239593506, "learning_rate": 7.011445527766002e-07, "loss": 0.477, "mean_token_accuracy": 0.8481773138046265, "num_tokens": 63253457.0, "step": 1655 }, { "epoch": 0.21066022134588475, "ewc_loss": 0.01746213808655739, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.9264926676405594e-05, "grad_norm": 3.3667142391204834, "learning_rate": 7.015684612123781e-07, "loss": 0.4627, "mean_token_accuracy": 0.8506777286529541, "num_tokens": 63295430.0, "step": 1656 }, { "epoch": 0.21078743162447525, "ewc_loss": 0.017444293946027756, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.9086483815917745e-05, "grad_norm": 3.3837788105010986, "learning_rate": 7.01992369648156e-07, "loss": 0.4847, "mean_token_accuracy": 0.846990168094635, "num_tokens": 63340118.0, "step": 1657 }, { "epoch": 0.21091464190306577, "ewc_loss": 0.017463337630033493, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.927692836849019e-05, "grad_norm": 3.502802848815918, "learning_rate": 7.024162780839339e-07, "loss": 0.5156, "mean_token_accuracy": 0.8281053304672241, "num_tokens": 63372037.0, "step": 1658 }, { "epoch": 0.21104185218165628, "ewc_loss": 0.01752331852912903, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.987674740026705e-05, "grad_norm": 3.3589670658111572, "learning_rate": 7.028401865197116e-07, "loss": 0.5059, "mean_token_accuracy": 0.8382052779197693, "num_tokens": 63415508.0, "step": 1659 }, { "epoch": 0.21116906246024678, "ewc_loss": 0.017428109422326088, "ewc_loss_diag": 1.1563301086425781e-05, "ewc_loss_parallel": 5.892464832868427e-05, "grad_norm": 3.335284948348999, "learning_rate": 7.032640949554896e-07, "loss": 0.4166, "mean_token_accuracy": 0.8620965480804443, "num_tokens": 63459320.0, "step": 1660 }, { "epoch": 0.2112962727388373, "ewc_loss": 0.017535576596856117, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 5.938897447776981e-05, "grad_norm": 3.3924031257629395, "learning_rate": 7.036880033912674e-07, "loss": 0.5142, "mean_token_accuracy": 0.8385447263717651, "num_tokens": 63499624.0, "step": 1661 }, { "epoch": 0.2114234830174278, "ewc_loss": 0.017563533037900925, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 5.9668542235158384e-05, "grad_norm": 3.4119222164154053, "learning_rate": 7.041119118270454e-07, "loss": 0.4701, "mean_token_accuracy": 0.8494259119033813, "num_tokens": 63536507.0, "step": 1662 }, { "epoch": 0.2115506932960183, "ewc_loss": 0.017554596066474915, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 5.9579164371825755e-05, "grad_norm": 3.418330669403076, "learning_rate": 7.045358202628232e-07, "loss": 0.511, "mean_token_accuracy": 0.839970588684082, "num_tokens": 63576483.0, "step": 1663 }, { "epoch": 0.21167790357460883, "ewc_loss": 0.017549587413668633, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 5.952908395556733e-05, "grad_norm": 3.654592514038086, "learning_rate": 7.049597286986011e-07, "loss": 0.5017, "mean_token_accuracy": 0.8384617567062378, "num_tokens": 63615568.0, "step": 1664 }, { "epoch": 0.21180511385319933, "ewc_loss": 0.0176517516374588, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 6.055071935406886e-05, "grad_norm": 3.4088826179504395, "learning_rate": 7.05383637134379e-07, "loss": 0.4623, "mean_token_accuracy": 0.852012574672699, "num_tokens": 63650023.0, "step": 1665 }, { "epoch": 0.21193232413178986, "ewc_loss": 0.01747172325849533, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 5.875044735148549e-05, "grad_norm": 3.394913673400879, "learning_rate": 7.058075455701568e-07, "loss": 0.4927, "mean_token_accuracy": 0.8383175730705261, "num_tokens": 63690229.0, "step": 1666 }, { "epoch": 0.21205953441038036, "ewc_loss": 0.017626525834202766, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 5.9688103647204116e-05, "grad_norm": 3.5107765197753906, "learning_rate": 7.062314540059346e-07, "loss": 0.4381, "mean_token_accuracy": 0.8626430630683899, "num_tokens": 63728169.0, "step": 1667 }, { "epoch": 0.21218674468897086, "ewc_loss": 0.01758979633450508, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 5.993116064928472e-05, "grad_norm": 3.454261064529419, "learning_rate": 7.066553624417126e-07, "loss": 0.4612, "mean_token_accuracy": 0.8505733013153076, "num_tokens": 63762816.0, "step": 1668 }, { "epoch": 0.2123139549675614, "ewc_loss": 0.0175276268273592, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 5.930947372689843e-05, "grad_norm": 3.411224842071533, "learning_rate": 7.070792708774904e-07, "loss": 0.4836, "mean_token_accuracy": 0.8456080555915833, "num_tokens": 63804288.0, "step": 1669 }, { "epoch": 0.2124411652461519, "ewc_loss": 0.017619045451283455, "ewc_loss_diag": 1.1682510375976562e-05, "ewc_loss_parallel": 5.96133031649515e-05, "grad_norm": 3.3620779514312744, "learning_rate": 7.075031793132684e-07, "loss": 0.5055, "mean_token_accuracy": 0.8429046869277954, "num_tokens": 63851365.0, "step": 1670 }, { "epoch": 0.2125683755247424, "ewc_loss": 0.01766677387058735, "ewc_loss_diag": 1.1742115020751953e-05, "ewc_loss_parallel": 5.9480244090082124e-05, "grad_norm": 3.4448094367980957, "learning_rate": 7.079270877490462e-07, "loss": 0.4661, "mean_token_accuracy": 0.8523330092430115, "num_tokens": 63886187.0, "step": 1671 }, { "epoch": 0.21269558580333292, "ewc_loss": 0.017613325268030167, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 6.01664578425698e-05, "grad_norm": 3.4324560165405273, "learning_rate": 7.08350996184824e-07, "loss": 0.5391, "mean_token_accuracy": 0.8296084403991699, "num_tokens": 63927187.0, "step": 1672 }, { "epoch": 0.21282279608192342, "ewc_loss": 0.01758001185953617, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 5.98333244852256e-05, "grad_norm": 3.422736883163452, "learning_rate": 7.08774904620602e-07, "loss": 0.4602, "mean_token_accuracy": 0.8555417060852051, "num_tokens": 63964088.0, "step": 1673 }, { "epoch": 0.21295000636051392, "ewc_loss": 0.017707308754324913, "ewc_loss_diag": 1.1742115020751953e-05, "ewc_loss_parallel": 5.9885584050789475e-05, "grad_norm": 3.404849052429199, "learning_rate": 7.091988130563797e-07, "loss": 0.5038, "mean_token_accuracy": 0.8395373821258545, "num_tokens": 64004931.0, "step": 1674 }, { "epoch": 0.21307721663910445, "ewc_loss": 0.017583714798092842, "ewc_loss_diag": 1.1622905731201172e-05, "ewc_loss_parallel": 5.987034455756657e-05, "grad_norm": 3.39019775390625, "learning_rate": 7.096227214921576e-07, "loss": 0.4979, "mean_token_accuracy": 0.8405671715736389, "num_tokens": 64043966.0, "step": 1675 }, { "epoch": 0.21320442691769495, "ewc_loss": 0.017707891762256622, "ewc_loss_diag": 1.1742115020751953e-05, "ewc_loss_parallel": 5.989140845485963e-05, "grad_norm": 3.440567970275879, "learning_rate": 7.100466299279355e-07, "loss": 0.4815, "mean_token_accuracy": 0.8488194942474365, "num_tokens": 64078886.0, "step": 1676 }, { "epoch": 0.21333163719628545, "ewc_loss": 0.017753519117832184, "ewc_loss_diag": 1.1742115020751953e-05, "ewc_loss_parallel": 6.03476983087603e-05, "grad_norm": 3.450091600418091, "learning_rate": 7.104705383637134e-07, "loss": 0.5043, "mean_token_accuracy": 0.845421552658081, "num_tokens": 64117291.0, "step": 1677 }, { "epoch": 0.21345884747487598, "ewc_loss": 0.017745114862918854, "ewc_loss_diag": 1.1742115020751953e-05, "ewc_loss_parallel": 6.026365008438006e-05, "grad_norm": 3.3835794925689697, "learning_rate": 7.108944467994913e-07, "loss": 0.4631, "mean_token_accuracy": 0.8516296148300171, "num_tokens": 64156369.0, "step": 1678 }, { "epoch": 0.21358605775346648, "ewc_loss": 0.017717592418193817, "ewc_loss_diag": 1.1742115020751953e-05, "ewc_loss_parallel": 5.998842971166596e-05, "grad_norm": 3.472169876098633, "learning_rate": 7.113183552352692e-07, "loss": 0.5351, "mean_token_accuracy": 0.8266466856002808, "num_tokens": 64193646.0, "step": 1679 }, { "epoch": 0.21371326803205698, "ewc_loss": 0.017865262925624847, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 6.085478526074439e-05, "grad_norm": 3.4719398021698, "learning_rate": 7.11742263671047e-07, "loss": 0.4286, "mean_token_accuracy": 0.8612537384033203, "num_tokens": 64231933.0, "step": 1680 }, { "epoch": 0.2138404783106475, "ewc_loss": 0.017824359238147736, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 6.0445749113569036e-05, "grad_norm": 3.5212368965148926, "learning_rate": 7.12166172106825e-07, "loss": 0.5591, "mean_token_accuracy": 0.8213056325912476, "num_tokens": 64267418.0, "step": 1681 }, { "epoch": 0.213967688589238, "ewc_loss": 0.017854809761047363, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 6.075025521568023e-05, "grad_norm": 3.4911413192749023, "learning_rate": 7.125900805426027e-07, "loss": 0.4417, "mean_token_accuracy": 0.8580398559570312, "num_tokens": 64302440.0, "step": 1682 }, { "epoch": 0.2140948988678285, "ewc_loss": 0.017833083868026733, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 6.053298420738429e-05, "grad_norm": 3.4385907649993896, "learning_rate": 7.130139889783806e-07, "loss": 0.479, "mean_token_accuracy": 0.8478661179542542, "num_tokens": 64338240.0, "step": 1683 }, { "epoch": 0.21422210914641904, "ewc_loss": 0.01782418042421341, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 6.0443962865974754e-05, "grad_norm": 3.3950960636138916, "learning_rate": 7.134378974141585e-07, "loss": 0.4707, "mean_token_accuracy": 0.8484461903572083, "num_tokens": 64375734.0, "step": 1684 }, { "epoch": 0.21434931942500954, "ewc_loss": 0.017818467691540718, "ewc_loss_diag": 1.1801719665527344e-05, "ewc_loss_parallel": 6.038682840880938e-05, "grad_norm": 3.472771644592285, "learning_rate": 7.138618058499364e-07, "loss": 0.4655, "mean_token_accuracy": 0.851746678352356, "num_tokens": 64411401.0, "step": 1685 }, { "epoch": 0.21447652970360004, "ewc_loss": 0.017935235053300858, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 6.094414129620418e-05, "grad_norm": 3.407992124557495, "learning_rate": 7.142857142857143e-07, "loss": 0.44, "mean_token_accuracy": 0.8599479794502258, "num_tokens": 64454719.0, "step": 1686 }, { "epoch": 0.21460373998219057, "ewc_loss": 0.017882635816931725, "ewc_loss_diag": 1.1861324310302734e-05, "ewc_loss_parallel": 6.041815868229605e-05, "grad_norm": 3.407503128051758, "learning_rate": 7.147096227214922e-07, "loss": 0.4658, "mean_token_accuracy": 0.8532129526138306, "num_tokens": 64495736.0, "step": 1687 }, { "epoch": 0.21473095026078107, "ewc_loss": 0.017974046990275383, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 6.0721911722794175e-05, "grad_norm": 3.430692434310913, "learning_rate": 7.1513353115727e-07, "loss": 0.4453, "mean_token_accuracy": 0.856246292591095, "num_tokens": 64532843.0, "step": 1688 }, { "epoch": 0.21485816053937157, "ewc_loss": 0.018000589683651924, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 6.0987338656559587e-05, "grad_norm": 3.4695022106170654, "learning_rate": 7.155574395930479e-07, "loss": 0.4901, "mean_token_accuracy": 0.8397126197814941, "num_tokens": 64568950.0, "step": 1689 }, { "epoch": 0.2149853708179621, "ewc_loss": 0.017995722591876984, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 6.0938666138099506e-05, "grad_norm": 3.4229071140289307, "learning_rate": 7.159813480288257e-07, "loss": 0.4684, "mean_token_accuracy": 0.8468574285507202, "num_tokens": 64606878.0, "step": 1690 }, { "epoch": 0.2151125810965526, "ewc_loss": 0.01798381470143795, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 6.081958781578578e-05, "grad_norm": 3.4780049324035645, "learning_rate": 7.164052564646035e-07, "loss": 0.4373, "mean_token_accuracy": 0.8592840433120728, "num_tokens": 64643012.0, "step": 1691 }, { "epoch": 0.21523979137514312, "ewc_loss": 0.01802341639995575, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 6.121559999883175e-05, "grad_norm": 3.426372528076172, "learning_rate": 7.168291649003815e-07, "loss": 0.4583, "mean_token_accuracy": 0.8563405871391296, "num_tokens": 64684444.0, "step": 1692 }, { "epoch": 0.21536700165373363, "ewc_loss": 0.0179792158305645, "ewc_loss_diag": 1.1920928955078125e-05, "ewc_loss_parallel": 6.077359648770653e-05, "grad_norm": 3.453026294708252, "learning_rate": 7.172530733361593e-07, "loss": 0.5058, "mean_token_accuracy": 0.8407270908355713, "num_tokens": 64722068.0, "step": 1693 }, { "epoch": 0.21549421193232413, "ewc_loss": 0.018083205446600914, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.120315083535388e-05, "grad_norm": 3.436866521835327, "learning_rate": 7.176769817719373e-07, "loss": 0.4178, "mean_token_accuracy": 0.8718618750572205, "num_tokens": 64761829.0, "step": 1694 }, { "epoch": 0.21562142221091465, "ewc_loss": 0.018057797104120255, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.0949074395466596e-05, "grad_norm": 3.400282382965088, "learning_rate": 7.181008902077151e-07, "loss": 0.4662, "mean_token_accuracy": 0.8517060875892639, "num_tokens": 64799185.0, "step": 1695 }, { "epoch": 0.21574863248950515, "ewc_loss": 0.01806209236383438, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.0992006183369085e-05, "grad_norm": 3.462252378463745, "learning_rate": 7.18524798643493e-07, "loss": 0.4933, "mean_token_accuracy": 0.8444095253944397, "num_tokens": 64839306.0, "step": 1696 }, { "epoch": 0.21587584276809565, "ewc_loss": 0.018104828894138336, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.141939229564741e-05, "grad_norm": 3.4010016918182373, "learning_rate": 7.189487070792708e-07, "loss": 0.4702, "mean_token_accuracy": 0.8520348072052002, "num_tokens": 64876011.0, "step": 1697 }, { "epoch": 0.21600305304668618, "ewc_loss": 0.01805565133690834, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.0927595768589526e-05, "grad_norm": 3.399662494659424, "learning_rate": 7.193726155150487e-07, "loss": 0.4689, "mean_token_accuracy": 0.8533883094787598, "num_tokens": 64918510.0, "step": 1698 }, { "epoch": 0.21613026332527668, "ewc_loss": 0.01809833012521267, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.135439616627991e-05, "grad_norm": 3.45682692527771, "learning_rate": 7.197965239508265e-07, "loss": 0.475, "mean_token_accuracy": 0.8490633368492126, "num_tokens": 64957156.0, "step": 1699 }, { "epoch": 0.21625747360386718, "ewc_loss": 0.01810814067721367, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.145249062683433e-05, "grad_norm": 3.4000675678253174, "learning_rate": 7.202204323866045e-07, "loss": 0.4735, "mean_token_accuracy": 0.8504253029823303, "num_tokens": 64998761.0, "step": 1700 }, { "epoch": 0.2163846838824577, "ewc_loss": 0.018079517409205437, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.116626900620759e-05, "grad_norm": 3.4086201190948486, "learning_rate": 7.206443408223823e-07, "loss": 0.4764, "mean_token_accuracy": 0.8467267751693726, "num_tokens": 65038849.0, "step": 1701 }, { "epoch": 0.2165118941610482, "ewc_loss": 0.01811104454100132, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.148154352558777e-05, "grad_norm": 3.492218017578125, "learning_rate": 7.210682492581603e-07, "loss": 0.4555, "mean_token_accuracy": 0.8524337410926819, "num_tokens": 65071130.0, "step": 1702 }, { "epoch": 0.2166391044396387, "ewc_loss": 0.018129730597138405, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.166840466903523e-05, "grad_norm": 3.4014713764190674, "learning_rate": 7.214921576939381e-07, "loss": 0.4974, "mean_token_accuracy": 0.8385635614395142, "num_tokens": 65111619.0, "step": 1703 }, { "epoch": 0.21676631471822924, "ewc_loss": 0.018079528585076332, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.116638542152941e-05, "grad_norm": 3.4805996417999268, "learning_rate": 7.219160661297159e-07, "loss": 0.4446, "mean_token_accuracy": 0.8560782670974731, "num_tokens": 65147537.0, "step": 1704 }, { "epoch": 0.21689352499681974, "ewc_loss": 0.01816209964454174, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.19920901954174e-05, "grad_norm": 3.462402820587158, "learning_rate": 7.223399745654938e-07, "loss": 0.483, "mean_token_accuracy": 0.8466441035270691, "num_tokens": 65187254.0, "step": 1705 }, { "epoch": 0.21702073527541024, "ewc_loss": 0.0181061252951622, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.143235077615827e-05, "grad_norm": 3.4493162631988525, "learning_rate": 7.227638830012717e-07, "loss": 0.4941, "mean_token_accuracy": 0.8402984738349915, "num_tokens": 65226130.0, "step": 1706 }, { "epoch": 0.21714794555400077, "ewc_loss": 0.018122710287570953, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.159820622997358e-05, "grad_norm": 3.593045949935913, "learning_rate": 7.231877914370495e-07, "loss": 0.4563, "mean_token_accuracy": 0.8554513454437256, "num_tokens": 65261282.0, "step": 1707 }, { "epoch": 0.21727515583259127, "ewc_loss": 0.018201470375061035, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.238579953787848e-05, "grad_norm": 3.534346103668213, "learning_rate": 7.236116998728275e-07, "loss": 0.5084, "mean_token_accuracy": 0.8381023406982422, "num_tokens": 65294850.0, "step": 1708 }, { "epoch": 0.21740236611118177, "ewc_loss": 0.018114468082785606, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.151576963020489e-05, "grad_norm": 3.3560826778411865, "learning_rate": 7.240356083086053e-07, "loss": 0.4018, "mean_token_accuracy": 0.8703160285949707, "num_tokens": 65332323.0, "step": 1709 }, { "epoch": 0.2175295763897723, "ewc_loss": 0.01807936280965805, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.116471922723576e-05, "grad_norm": 3.4113643169403076, "learning_rate": 7.244595167443833e-07, "loss": 0.4941, "mean_token_accuracy": 0.8443742990493774, "num_tokens": 65376033.0, "step": 1710 }, { "epoch": 0.2176567866683628, "ewc_loss": 0.018164323642849922, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.201432552188635e-05, "grad_norm": 3.442185878753662, "learning_rate": 7.248834251801611e-07, "loss": 0.4607, "mean_token_accuracy": 0.8546307682991028, "num_tokens": 65417875.0, "step": 1711 }, { "epoch": 0.2177839969469533, "ewc_loss": 0.018140768632292747, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.177878094604239e-05, "grad_norm": 3.428403615951538, "learning_rate": 7.253073336159388e-07, "loss": 0.4377, "mean_token_accuracy": 0.8601815700531006, "num_tokens": 65456675.0, "step": 1712 }, { "epoch": 0.21791120722554383, "ewc_loss": 0.01814883202314377, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.185940583236516e-05, "grad_norm": 3.5428266525268555, "learning_rate": 7.257312420517168e-07, "loss": 0.4285, "mean_token_accuracy": 0.8622955083847046, "num_tokens": 65487699.0, "step": 1713 }, { "epoch": 0.21803841750413433, "ewc_loss": 0.018201196566224098, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.238305650185794e-05, "grad_norm": 3.4771981239318848, "learning_rate": 7.261551504874946e-07, "loss": 0.4565, "mean_token_accuracy": 0.8499435186386108, "num_tokens": 65526143.0, "step": 1714 }, { "epoch": 0.21816562778272486, "ewc_loss": 0.01813894882798195, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.17605765000917e-05, "grad_norm": 3.4286224842071533, "learning_rate": 7.265790589232725e-07, "loss": 0.4537, "mean_token_accuracy": 0.8568549752235413, "num_tokens": 65562969.0, "step": 1715 }, { "epoch": 0.21829283806131536, "ewc_loss": 0.018148273229599, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.185381789691746e-05, "grad_norm": 3.406545400619507, "learning_rate": 7.270029673590504e-07, "loss": 0.4867, "mean_token_accuracy": 0.8466165661811829, "num_tokens": 65605999.0, "step": 1716 }, { "epoch": 0.21842004833990586, "ewc_loss": 0.01817166805267334, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.208777631400153e-05, "grad_norm": 3.5395641326904297, "learning_rate": 7.274268757948283e-07, "loss": 0.5304, "mean_token_accuracy": 0.8328555822372437, "num_tokens": 65644877.0, "step": 1717 }, { "epoch": 0.2185472586184964, "ewc_loss": 0.01824716106057167, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.284270784817636e-05, "grad_norm": 3.503359794616699, "learning_rate": 7.278507842306062e-07, "loss": 0.4515, "mean_token_accuracy": 0.8556356430053711, "num_tokens": 65678823.0, "step": 1718 }, { "epoch": 0.2186744688970869, "ewc_loss": 0.01818356290459633, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.220671639312059e-05, "grad_norm": 3.494358777999878, "learning_rate": 7.282746926663841e-07, "loss": 0.5312, "mean_token_accuracy": 0.8325768709182739, "num_tokens": 65717185.0, "step": 1719 }, { "epoch": 0.2188016791756774, "ewc_loss": 0.018213877454400063, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.250986189115793e-05, "grad_norm": 3.511411428451538, "learning_rate": 7.286986011021618e-07, "loss": 0.4512, "mean_token_accuracy": 0.8559485077857971, "num_tokens": 65754422.0, "step": 1720 }, { "epoch": 0.21892888945426792, "ewc_loss": 0.01821364462375641, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.2507540860679e-05, "grad_norm": 3.4697518348693848, "learning_rate": 7.291225095379398e-07, "loss": 0.488, "mean_token_accuracy": 0.8475152850151062, "num_tokens": 65799390.0, "step": 1721 }, { "epoch": 0.21905609973285842, "ewc_loss": 0.018190929666161537, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.228039273992181e-05, "grad_norm": 3.4983162879943848, "learning_rate": 7.295464179737176e-07, "loss": 0.4781, "mean_token_accuracy": 0.8504646420478821, "num_tokens": 65837566.0, "step": 1722 }, { "epoch": 0.21918331001144892, "ewc_loss": 0.01821887120604515, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.25598113401793e-05, "grad_norm": 3.5089311599731445, "learning_rate": 7.299703264094955e-07, "loss": 0.4644, "mean_token_accuracy": 0.8513801097869873, "num_tokens": 65874537.0, "step": 1723 }, { "epoch": 0.21931052029003945, "ewc_loss": 0.018215762451291084, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.252871389733627e-05, "grad_norm": 3.477294683456421, "learning_rate": 7.303942348452734e-07, "loss": 0.4903, "mean_token_accuracy": 0.844087541103363, "num_tokens": 65915616.0, "step": 1724 }, { "epoch": 0.21943773056862995, "ewc_loss": 0.018200241029262543, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.23734958935529e-05, "grad_norm": 3.4467971324920654, "learning_rate": 7.308181432810513e-07, "loss": 0.4396, "mean_token_accuracy": 0.8577314019203186, "num_tokens": 65954263.0, "step": 1725 }, { "epoch": 0.21956494084722045, "ewc_loss": 0.01820705458521843, "ewc_loss_diag": 1.1980533599853516e-05, "ewc_loss_parallel": 6.244164978852496e-05, "grad_norm": 3.5013461112976074, "learning_rate": 7.312420517168292e-07, "loss": 0.4729, "mean_token_accuracy": 0.8480713367462158, "num_tokens": 65992880.0, "step": 1726 }, { "epoch": 0.21969215112581097, "ewc_loss": 0.018304064869880676, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 6.280139496084303e-05, "grad_norm": 3.4972264766693115, "learning_rate": 7.31665960152607e-07, "loss": 0.4915, "mean_token_accuracy": 0.8407917618751526, "num_tokens": 66033958.0, "step": 1727 }, { "epoch": 0.21981936140440148, "ewc_loss": 0.01827245019376278, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 6.248524732654914e-05, "grad_norm": 3.479776382446289, "learning_rate": 7.320898685883848e-07, "loss": 0.4625, "mean_token_accuracy": 0.8511186242103577, "num_tokens": 66072448.0, "step": 1728 }, { "epoch": 0.21994657168299198, "ewc_loss": 0.018296729773283005, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 6.272804603213444e-05, "grad_norm": 3.4981207847595215, "learning_rate": 7.325137770241628e-07, "loss": 0.5728, "mean_token_accuracy": 0.8213720321655273, "num_tokens": 66112648.0, "step": 1729 }, { "epoch": 0.2200737819615825, "ewc_loss": 0.018308483064174652, "ewc_loss_diag": 1.2040138244628906e-05, "ewc_loss_parallel": 6.284557457547635e-05, "grad_norm": 3.4538846015930176, "learning_rate": 7.329376854599406e-07, "loss": 0.4845, "mean_token_accuracy": 0.846889853477478, "num_tokens": 66154791.0, "step": 1730 }, { "epoch": 0.220200992240173, "ewc_loss": 0.018338629975914955, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 6.253669562283903e-05, "grad_norm": 3.5643486976623535, "learning_rate": 7.333615938957184e-07, "loss": 0.4705, "mean_token_accuracy": 0.8495616316795349, "num_tokens": 66192450.0, "step": 1731 }, { "epoch": 0.2203282025187635, "ewc_loss": 0.018420472741127014, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 6.335510988719761e-05, "grad_norm": 3.4874184131622314, "learning_rate": 7.337855023314964e-07, "loss": 0.4673, "mean_token_accuracy": 0.8498326539993286, "num_tokens": 66231173.0, "step": 1732 }, { "epoch": 0.22045541279735403, "ewc_loss": 0.018349727615714073, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 6.264766125241295e-05, "grad_norm": 3.515082597732544, "learning_rate": 7.342094107672742e-07, "loss": 0.5203, "mean_token_accuracy": 0.8340151309967041, "num_tokens": 66272956.0, "step": 1733 }, { "epoch": 0.22058262307594453, "ewc_loss": 0.01840265654027462, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 6.317695078905672e-05, "grad_norm": 3.522745132446289, "learning_rate": 7.346333192030522e-07, "loss": 0.4739, "mean_token_accuracy": 0.853472888469696, "num_tokens": 66309450.0, "step": 1734 }, { "epoch": 0.22070983335453503, "ewc_loss": 0.018403813242912292, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 6.318852683762088e-05, "grad_norm": 3.5202674865722656, "learning_rate": 7.350572276388299e-07, "loss": 0.5246, "mean_token_accuracy": 0.8321417570114136, "num_tokens": 66350947.0, "step": 1735 }, { "epoch": 0.22083704363312556, "ewc_loss": 0.018394729122519493, "ewc_loss_diag": 1.2099742889404297e-05, "ewc_loss_parallel": 6.309767923085019e-05, "grad_norm": 3.5205726623535156, "learning_rate": 7.354811360746078e-07, "loss": 0.4819, "mean_token_accuracy": 0.8446521759033203, "num_tokens": 66388364.0, "step": 1736 }, { "epoch": 0.22096425391171606, "ewc_loss": 0.018459554761648178, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 6.313557969406247e-05, "grad_norm": 3.513043165206909, "learning_rate": 7.359050445103857e-07, "loss": 0.4972, "mean_token_accuracy": 0.8403753042221069, "num_tokens": 66425029.0, "step": 1737 }, { "epoch": 0.22109146419030656, "ewc_loss": 0.018453609198331833, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 6.307612056843936e-05, "grad_norm": 3.6070363521575928, "learning_rate": 7.363289529461636e-07, "loss": 0.4729, "mean_token_accuracy": 0.8504537343978882, "num_tokens": 66463924.0, "step": 1738 }, { "epoch": 0.2212186744688971, "ewc_loss": 0.01850772462785244, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 6.36172917438671e-05, "grad_norm": 3.4947354793548584, "learning_rate": 7.367528613819415e-07, "loss": 0.428, "mean_token_accuracy": 0.861987829208374, "num_tokens": 66504824.0, "step": 1739 }, { "epoch": 0.2213458847474876, "ewc_loss": 0.018431538715958595, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 6.285542622208595e-05, "grad_norm": 3.4803357124328613, "learning_rate": 7.371767698177194e-07, "loss": 0.4468, "mean_token_accuracy": 0.8559721112251282, "num_tokens": 66539308.0, "step": 1740 }, { "epoch": 0.22147309502607812, "ewc_loss": 0.018464894965291023, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 6.318899249890819e-05, "grad_norm": 3.471480131149292, "learning_rate": 7.376006782534972e-07, "loss": 0.4421, "mean_token_accuracy": 0.8593106865882874, "num_tokens": 66572785.0, "step": 1741 }, { "epoch": 0.22160030530466862, "ewc_loss": 0.01848461478948593, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 6.338617822621018e-05, "grad_norm": 3.5250957012176514, "learning_rate": 7.380245866892751e-07, "loss": 0.4598, "mean_token_accuracy": 0.8529759049415588, "num_tokens": 66607890.0, "step": 1742 }, { "epoch": 0.22172751558325912, "ewc_loss": 0.018513286486268044, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 6.36729018879123e-05, "grad_norm": 3.483227491378784, "learning_rate": 7.384484951250529e-07, "loss": 0.462, "mean_token_accuracy": 0.8549298048019409, "num_tokens": 66650553.0, "step": 1743 }, { "epoch": 0.22185472586184965, "ewc_loss": 0.018478218466043472, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 6.332222983473912e-05, "grad_norm": 3.508624792098999, "learning_rate": 7.388724035608308e-07, "loss": 0.4581, "mean_token_accuracy": 0.8553168773651123, "num_tokens": 66685330.0, "step": 1744 }, { "epoch": 0.22198193614044015, "ewc_loss": 0.018529357388615608, "ewc_loss_diag": 1.2159347534179688e-05, "ewc_loss_parallel": 6.383361323969439e-05, "grad_norm": 3.582347869873047, "learning_rate": 7.392963119966087e-07, "loss": 0.4409, "mean_token_accuracy": 0.857839822769165, "num_tokens": 66716816.0, "step": 1745 }, { "epoch": 0.22210914641903065, "ewc_loss": 0.01859617978334427, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 6.389149348251522e-05, "grad_norm": 3.517943859100342, "learning_rate": 7.397202204323866e-07, "loss": 0.4847, "mean_token_accuracy": 0.8465242385864258, "num_tokens": 66754030.0, "step": 1746 }, { "epoch": 0.22223635669762118, "ewc_loss": 0.018699366599321365, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.37026532785967e-05, "grad_norm": 3.5931410789489746, "learning_rate": 7.401441288681645e-07, "loss": 0.4435, "mean_token_accuracy": 0.8587307929992676, "num_tokens": 66793650.0, "step": 1747 }, { "epoch": 0.22236356697621168, "ewc_loss": 0.01872824877500534, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.399147969204932e-05, "grad_norm": 3.444047689437866, "learning_rate": 7.405680373039424e-07, "loss": 0.4279, "mean_token_accuracy": 0.8663437366485596, "num_tokens": 66833709.0, "step": 1748 }, { "epoch": 0.22249077725480218, "ewc_loss": 0.018519440665841103, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 6.312409823294729e-05, "grad_norm": 3.653954267501831, "learning_rate": 7.409919457397202e-07, "loss": 0.4941, "mean_token_accuracy": 0.8412019610404968, "num_tokens": 66866456.0, "step": 1749 }, { "epoch": 0.2226179875333927, "ewc_loss": 0.018806342035531998, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.477241549873725e-05, "grad_norm": 3.5181636810302734, "learning_rate": 7.414158541754981e-07, "loss": 0.5219, "mean_token_accuracy": 0.8322920203208923, "num_tokens": 66905875.0, "step": 1750 }, { "epoch": 0.2227451978119832, "ewc_loss": 0.018553348258137703, "ewc_loss_diag": 1.2218952178955078e-05, "ewc_loss_parallel": 6.346316513372585e-05, "grad_norm": 3.7108852863311768, "learning_rate": 7.418397626112759e-07, "loss": 0.4759, "mean_token_accuracy": 0.8551338315010071, "num_tokens": 66936009.0, "step": 1751 }, { "epoch": 0.2228724080905737, "ewc_loss": 0.018830422312021255, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.501320604002103e-05, "grad_norm": 3.499772310256958, "learning_rate": 7.422636710470537e-07, "loss": 0.479, "mean_token_accuracy": 0.847780168056488, "num_tokens": 66978111.0, "step": 1752 }, { "epoch": 0.22299961836916424, "ewc_loss": 0.01867365837097168, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.344555731629953e-05, "grad_norm": 3.6554410457611084, "learning_rate": 7.426875794828317e-07, "loss": 0.4909, "mean_token_accuracy": 0.8467997908592224, "num_tokens": 67015938.0, "step": 1753 }, { "epoch": 0.22312682864775474, "ewc_loss": 0.018804974853992462, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.475872214650735e-05, "grad_norm": 3.529632091522217, "learning_rate": 7.431114879186095e-07, "loss": 0.5138, "mean_token_accuracy": 0.8383411169052124, "num_tokens": 67055456.0, "step": 1754 }, { "epoch": 0.22325403892634524, "ewc_loss": 0.018624141812324524, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 6.356074300128967e-05, "grad_norm": 3.6677069664001465, "learning_rate": 7.435353963543875e-07, "loss": 0.4718, "mean_token_accuracy": 0.8485612869262695, "num_tokens": 67096298.0, "step": 1755 }, { "epoch": 0.22338124920493577, "ewc_loss": 0.01879730075597763, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.468198262155056e-05, "grad_norm": 3.5325894355773926, "learning_rate": 7.439593047901653e-07, "loss": 0.429, "mean_token_accuracy": 0.8625152111053467, "num_tokens": 67135280.0, "step": 1756 }, { "epoch": 0.22350845948352627, "ewc_loss": 0.018602628260850906, "ewc_loss_diag": 1.2278556823730469e-05, "ewc_loss_parallel": 6.33456147625111e-05, "grad_norm": 3.59444522857666, "learning_rate": 7.443832132259431e-07, "loss": 0.4925, "mean_token_accuracy": 0.8415902853012085, "num_tokens": 67171936.0, "step": 1757 }, { "epoch": 0.22363566976211677, "ewc_loss": 0.018755901604890823, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.426800246117637e-05, "grad_norm": 3.5234756469726562, "learning_rate": 7.44807121661721e-07, "loss": 0.4787, "mean_token_accuracy": 0.846226692199707, "num_tokens": 67209854.0, "step": 1758 }, { "epoch": 0.2237628800407073, "ewc_loss": 0.0186944380402565, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.36533586657606e-05, "grad_norm": 3.5336086750030518, "learning_rate": 7.452310300974989e-07, "loss": 0.4353, "mean_token_accuracy": 0.857071042060852, "num_tokens": 67244369.0, "step": 1759 }, { "epoch": 0.2238900903192978, "ewc_loss": 0.01874799281358719, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.418891280191019e-05, "grad_norm": 3.534916877746582, "learning_rate": 7.456549385332767e-07, "loss": 0.5133, "mean_token_accuracy": 0.8385872840881348, "num_tokens": 67283994.0, "step": 1760 }, { "epoch": 0.2240173005978883, "ewc_loss": 0.018735354766249657, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.406252941815183e-05, "grad_norm": 3.420869827270508, "learning_rate": 7.460788469690547e-07, "loss": 0.3988, "mean_token_accuracy": 0.8687721490859985, "num_tokens": 67323230.0, "step": 1761 }, { "epoch": 0.22414451087647883, "ewc_loss": 0.01870657689869404, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.377475801855326e-05, "grad_norm": 3.558465003967285, "learning_rate": 7.465027554048325e-07, "loss": 0.5502, "mean_token_accuracy": 0.8278807401657104, "num_tokens": 67366198.0, "step": 1762 }, { "epoch": 0.22427172115506933, "ewc_loss": 0.018798746168613434, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.469643994932994e-05, "grad_norm": 3.500255823135376, "learning_rate": 7.469266638406105e-07, "loss": 0.535, "mean_token_accuracy": 0.83328777551651, "num_tokens": 67410685.0, "step": 1763 }, { "epoch": 0.22439893143365983, "ewc_loss": 0.01871884986758232, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.389747431967407e-05, "grad_norm": 3.4971673488616943, "learning_rate": 7.473505722763883e-07, "loss": 0.4576, "mean_token_accuracy": 0.8549631834030151, "num_tokens": 67445818.0, "step": 1764 }, { "epoch": 0.22452614171225035, "ewc_loss": 0.018767565488815308, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.438463606173173e-05, "grad_norm": 3.554703950881958, "learning_rate": 7.477744807121661e-07, "loss": 0.5005, "mean_token_accuracy": 0.8408304452896118, "num_tokens": 67483873.0, "step": 1765 }, { "epoch": 0.22465335199084085, "ewc_loss": 0.018790626898407936, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.461524753831327e-05, "grad_norm": 3.6147987842559814, "learning_rate": 7.48198389147944e-07, "loss": 0.4776, "mean_token_accuracy": 0.8515610098838806, "num_tokens": 67519524.0, "step": 1766 }, { "epoch": 0.22478056226943138, "ewc_loss": 0.018812576308846474, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.483474135166034e-05, "grad_norm": 3.5155930519104004, "learning_rate": 7.486222975837219e-07, "loss": 0.4625, "mean_token_accuracy": 0.850132167339325, "num_tokens": 67560920.0, "step": 1767 }, { "epoch": 0.22490777254802188, "ewc_loss": 0.018756655976176262, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.427554762922227e-05, "grad_norm": 3.5987141132354736, "learning_rate": 7.490462060194997e-07, "loss": 0.4342, "mean_token_accuracy": 0.8566250801086426, "num_tokens": 67594948.0, "step": 1768 }, { "epoch": 0.22503498282661238, "ewc_loss": 0.018828794360160828, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.49969297228381e-05, "grad_norm": 3.587092876434326, "learning_rate": 7.494701144552777e-07, "loss": 0.5196, "mean_token_accuracy": 0.834378182888031, "num_tokens": 67628867.0, "step": 1769 }, { "epoch": 0.2251621931052029, "ewc_loss": 0.018791504204273224, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.46240368951112e-05, "grad_norm": 3.53826642036438, "learning_rate": 7.498940228910555e-07, "loss": 0.4823, "mean_token_accuracy": 0.8463756442070007, "num_tokens": 67669762.0, "step": 1770 }, { "epoch": 0.2252894033837934, "ewc_loss": 0.018803473562002182, "ewc_loss_diag": 1.233816146850586e-05, "ewc_loss_parallel": 6.474371184594929e-05, "grad_norm": 3.6082444190979004, "learning_rate": 7.503179313268335e-07, "loss": 0.4683, "mean_token_accuracy": 0.8481304049491882, "num_tokens": 67704750.0, "step": 1771 }, { "epoch": 0.2254166136623839, "ewc_loss": 0.01896023564040661, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 6.509063678095117e-05, "grad_norm": 4.264225006103516, "learning_rate": 7.507418397626113e-07, "loss": 0.505, "mean_token_accuracy": 0.8385688066482544, "num_tokens": 67745181.0, "step": 1772 }, { "epoch": 0.22554382394097444, "ewc_loss": 0.01919942907989025, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.809292972320691e-05, "grad_norm": 3.5407028198242188, "learning_rate": 7.51165748198389e-07, "loss": 0.4933, "mean_token_accuracy": 0.8406457304954529, "num_tokens": 67785929.0, "step": 1773 }, { "epoch": 0.22567103421956494, "ewc_loss": 0.01865869201719761, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.268555443966761e-05, "grad_norm": 3.4817392826080322, "learning_rate": 7.51589656634167e-07, "loss": 0.4496, "mean_token_accuracy": 0.8522579073905945, "num_tokens": 67821887.0, "step": 1774 }, { "epoch": 0.22579824449815544, "ewc_loss": 0.018878718838095665, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.488582585006952e-05, "grad_norm": 3.5264933109283447, "learning_rate": 7.520135650699448e-07, "loss": 0.4841, "mean_token_accuracy": 0.8451547622680664, "num_tokens": 67859608.0, "step": 1775 }, { "epoch": 0.22592545477674597, "ewc_loss": 0.018834933638572693, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.444795872084796e-05, "grad_norm": 3.480830192565918, "learning_rate": 7.524374735057227e-07, "loss": 0.4918, "mean_token_accuracy": 0.8440499305725098, "num_tokens": 67895287.0, "step": 1776 }, { "epoch": 0.22605266505533647, "ewc_loss": 0.01884327456355095, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.453137029893696e-05, "grad_norm": 3.568686008453369, "learning_rate": 7.528613819415006e-07, "loss": 0.4525, "mean_token_accuracy": 0.8597569465637207, "num_tokens": 67930358.0, "step": 1777 }, { "epoch": 0.22617987533392697, "ewc_loss": 0.018906941637396812, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.516805296996608e-05, "grad_norm": 3.5743563175201416, "learning_rate": 7.532852903772785e-07, "loss": 0.4921, "mean_token_accuracy": 0.8426505923271179, "num_tokens": 67966112.0, "step": 1778 }, { "epoch": 0.2263070856125175, "ewc_loss": 0.018904205411672592, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.514069536933675e-05, "grad_norm": 3.5768187046051025, "learning_rate": 7.537091988130564e-07, "loss": 0.4986, "mean_token_accuracy": 0.839991569519043, "num_tokens": 68001297.0, "step": 1779 }, { "epoch": 0.226434295891108, "ewc_loss": 0.018917132169008255, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.526995275635272e-05, "grad_norm": 3.558420419692993, "learning_rate": 7.541331072488342e-07, "loss": 0.4805, "mean_token_accuracy": 0.8473303318023682, "num_tokens": 68037346.0, "step": 1780 }, { "epoch": 0.2265615061696985, "ewc_loss": 0.018910860642790794, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.520724127767608e-05, "grad_norm": 3.549610137939453, "learning_rate": 7.54557015684612e-07, "loss": 0.4796, "mean_token_accuracy": 0.8452543020248413, "num_tokens": 68076566.0, "step": 1781 }, { "epoch": 0.22668871644828903, "ewc_loss": 0.018928546458482742, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.538409070344642e-05, "grad_norm": 3.5368008613586426, "learning_rate": 7.5498092412039e-07, "loss": 0.4586, "mean_token_accuracy": 0.854506254196167, "num_tokens": 68116834.0, "step": 1782 }, { "epoch": 0.22681592672687953, "ewc_loss": 0.01892842911183834, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.538292655022815e-05, "grad_norm": 3.5085110664367676, "learning_rate": 7.554048325561678e-07, "loss": 0.4307, "mean_token_accuracy": 0.8629519939422607, "num_tokens": 68155573.0, "step": 1783 }, { "epoch": 0.22694313700547003, "ewc_loss": 0.018926460295915604, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.536323780892417e-05, "grad_norm": 3.5404157638549805, "learning_rate": 7.558287409919457e-07, "loss": 0.5232, "mean_token_accuracy": 0.8371297121047974, "num_tokens": 68193378.0, "step": 1784 }, { "epoch": 0.22707034728406056, "ewc_loss": 0.019013039767742157, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 6.561868940480053e-05, "grad_norm": 3.5555269718170166, "learning_rate": 7.562526494277236e-07, "loss": 0.4994, "mean_token_accuracy": 0.8386334180831909, "num_tokens": 68230183.0, "step": 1785 }, { "epoch": 0.22719755756265106, "ewc_loss": 0.019018305465579033, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 6.567133823409677e-05, "grad_norm": 3.519420862197876, "learning_rate": 7.566765578635015e-07, "loss": 0.4669, "mean_token_accuracy": 0.853041410446167, "num_tokens": 68270526.0, "step": 1786 }, { "epoch": 0.22732476784124156, "ewc_loss": 0.019010817632079124, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 6.55964613542892e-05, "grad_norm": 3.5781991481781006, "learning_rate": 7.571004662992794e-07, "loss": 0.4936, "mean_token_accuracy": 0.8467745780944824, "num_tokens": 68308690.0, "step": 1787 }, { "epoch": 0.2274519781198321, "ewc_loss": 0.01903524622321129, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 6.584073707927018e-05, "grad_norm": 3.513737916946411, "learning_rate": 7.575243747350572e-07, "loss": 0.5357, "mean_token_accuracy": 0.8331595063209534, "num_tokens": 68348660.0, "step": 1788 }, { "epoch": 0.2275791883984226, "ewc_loss": 0.019000032916665077, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 6.548860983457416e-05, "grad_norm": 3.528970956802368, "learning_rate": 7.57948283170835e-07, "loss": 0.4842, "mean_token_accuracy": 0.84930419921875, "num_tokens": 68389129.0, "step": 1789 }, { "epoch": 0.2277063986770131, "ewc_loss": 0.019041359424591064, "ewc_loss_diag": 1.245737075805664e-05, "ewc_loss_parallel": 6.590188422705978e-05, "grad_norm": 3.5819196701049805, "learning_rate": 7.58372191606613e-07, "loss": 0.4774, "mean_token_accuracy": 0.8491363525390625, "num_tokens": 68423234.0, "step": 1790 }, { "epoch": 0.22783360895560362, "ewc_loss": 0.019006675109267235, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.616539030801505e-05, "grad_norm": 3.555285692214966, "learning_rate": 7.587961000423908e-07, "loss": 0.4841, "mean_token_accuracy": 0.8475081324577332, "num_tokens": 68460834.0, "step": 1791 }, { "epoch": 0.22796081923419412, "ewc_loss": 0.018984606489539146, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.594470323761925e-05, "grad_norm": 3.623119592666626, "learning_rate": 7.592200084781686e-07, "loss": 0.5282, "mean_token_accuracy": 0.8375477194786072, "num_tokens": 68495605.0, "step": 1792 }, { "epoch": 0.22808802951278465, "ewc_loss": 0.01905045285820961, "ewc_loss_diag": 1.239776611328125e-05, "ewc_loss_parallel": 6.660317012574524e-05, "grad_norm": 3.5734171867370605, "learning_rate": 7.596439169139466e-07, "loss": 0.4749, "mean_token_accuracy": 0.8485724925994873, "num_tokens": 68530838.0, "step": 1793 }, { "epoch": 0.22821523979137515, "ewc_loss": 0.019130246713757515, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 6.618040060857311e-05, "grad_norm": 3.491285800933838, "learning_rate": 7.600678253497244e-07, "loss": 0.4686, "mean_token_accuracy": 0.8532567620277405, "num_tokens": 68571225.0, "step": 1794 }, { "epoch": 0.22834245006996565, "ewc_loss": 0.01911003142595291, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 6.597824540222064e-05, "grad_norm": 3.5977835655212402, "learning_rate": 7.604917337855023e-07, "loss": 0.4494, "mean_token_accuracy": 0.8583379983901978, "num_tokens": 68608284.0, "step": 1795 }, { "epoch": 0.22846966034855618, "ewc_loss": 0.019195953384041786, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 6.683746323687956e-05, "grad_norm": 3.5211215019226074, "learning_rate": 7.609156422212801e-07, "loss": 0.4319, "mean_token_accuracy": 0.8610211610794067, "num_tokens": 68645943.0, "step": 1796 }, { "epoch": 0.22859687062714668, "ewc_loss": 0.019110698252916336, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 6.598491745535284e-05, "grad_norm": 3.4862003326416016, "learning_rate": 7.61339550657058e-07, "loss": 0.4546, "mean_token_accuracy": 0.854222297668457, "num_tokens": 68684245.0, "step": 1797 }, { "epoch": 0.22872408090573718, "ewc_loss": 0.019151415675878525, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 6.639208731940016e-05, "grad_norm": 3.658205509185791, "learning_rate": 7.617634590928359e-07, "loss": 0.4689, "mean_token_accuracy": 0.8535648584365845, "num_tokens": 68717532.0, "step": 1798 }, { "epoch": 0.2288512911843277, "ewc_loss": 0.019229847937822342, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 6.717639917042106e-05, "grad_norm": 3.4893691539764404, "learning_rate": 7.621873675286138e-07, "loss": 0.4876, "mean_token_accuracy": 0.8467211723327637, "num_tokens": 68759515.0, "step": 1799 }, { "epoch": 0.2289785014629182, "ewc_loss": 0.019091639667749405, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 6.579432374564931e-05, "grad_norm": 3.609576940536499, "learning_rate": 7.626112759643916e-07, "loss": 0.4933, "mean_token_accuracy": 0.8420759439468384, "num_tokens": 68796969.0, "step": 1800 }, { "epoch": 0.2291057117415087, "ewc_loss": 0.019238632172346115, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 6.726425635861233e-05, "grad_norm": 3.6432878971099854, "learning_rate": 7.630351844001696e-07, "loss": 0.4752, "mean_token_accuracy": 0.8483133316040039, "num_tokens": 68831138.0, "step": 1801 }, { "epoch": 0.22923292202009923, "ewc_loss": 0.019195150583982468, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 6.682943785563111e-05, "grad_norm": 3.5253164768218994, "learning_rate": 7.634590928359474e-07, "loss": 0.463, "mean_token_accuracy": 0.8546661734580994, "num_tokens": 68868741.0, "step": 1802 }, { "epoch": 0.22936013229868973, "ewc_loss": 0.01914256066083908, "ewc_loss_diag": 1.2516975402832031e-05, "ewc_loss_parallel": 6.630353891523555e-05, "grad_norm": 3.503938674926758, "learning_rate": 7.638830012717253e-07, "loss": 0.4861, "mean_token_accuracy": 0.8442665934562683, "num_tokens": 68909191.0, "step": 1803 }, { "epoch": 0.22948734257728023, "ewc_loss": 0.019226400181651115, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.653157470282167e-05, "grad_norm": 3.5384833812713623, "learning_rate": 7.643069097075031e-07, "loss": 0.4612, "mean_token_accuracy": 0.852046549320221, "num_tokens": 68949261.0, "step": 1804 }, { "epoch": 0.22961455285587076, "ewc_loss": 0.01923147216439247, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.658229540335014e-05, "grad_norm": 3.560253858566284, "learning_rate": 7.64730818143281e-07, "loss": 0.4351, "mean_token_accuracy": 0.8573649525642395, "num_tokens": 68990434.0, "step": 1805 }, { "epoch": 0.22974176313446126, "ewc_loss": 0.019246093928813934, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.672851304756477e-05, "grad_norm": 3.579035997390747, "learning_rate": 7.651547265790589e-07, "loss": 0.517, "mean_token_accuracy": 0.8384479880332947, "num_tokens": 69029482.0, "step": 1806 }, { "epoch": 0.22986897341305176, "ewc_loss": 0.01923363469541073, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.660392682533711e-05, "grad_norm": 3.528521776199341, "learning_rate": 7.655786350148368e-07, "loss": 0.4973, "mean_token_accuracy": 0.8416849970817566, "num_tokens": 69069766.0, "step": 1807 }, { "epoch": 0.2299961836916423, "ewc_loss": 0.019286803901195526, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 6.652526644757017e-05, "grad_norm": 3.495279312133789, "learning_rate": 7.660025434506146e-07, "loss": 0.484, "mean_token_accuracy": 0.8463261723518372, "num_tokens": 69114592.0, "step": 1808 }, { "epoch": 0.2301233939702328, "ewc_loss": 0.019227564334869385, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.65432307869196e-05, "grad_norm": 3.557255983352661, "learning_rate": 7.664264518863926e-07, "loss": 0.4731, "mean_token_accuracy": 0.8504990339279175, "num_tokens": 69152599.0, "step": 1809 }, { "epoch": 0.2302506042488233, "ewc_loss": 0.019269302487373352, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.6960601543542e-05, "grad_norm": 3.5423872470855713, "learning_rate": 7.668503603221704e-07, "loss": 0.5257, "mean_token_accuracy": 0.8312575817108154, "num_tokens": 69193157.0, "step": 1810 }, { "epoch": 0.23037781452741382, "ewc_loss": 0.01925276778638363, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.679525540675968e-05, "grad_norm": 3.5402281284332275, "learning_rate": 7.672742687579483e-07, "loss": 0.4367, "mean_token_accuracy": 0.8582139611244202, "num_tokens": 69230681.0, "step": 1811 }, { "epoch": 0.23050502480600432, "ewc_loss": 0.019258756190538406, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.685514381388202e-05, "grad_norm": 3.6063106060028076, "learning_rate": 7.676981771937261e-07, "loss": 0.4165, "mean_token_accuracy": 0.8624222278594971, "num_tokens": 69261292.0, "step": 1812 }, { "epoch": 0.23063223508459482, "ewc_loss": 0.019291456788778305, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.718213990097865e-05, "grad_norm": 3.608476161956787, "learning_rate": 7.681220856295039e-07, "loss": 0.4463, "mean_token_accuracy": 0.8579905033111572, "num_tokens": 69301795.0, "step": 1813 }, { "epoch": 0.23075944536318535, "ewc_loss": 0.019258862361311913, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.68562061036937e-05, "grad_norm": 3.5640926361083984, "learning_rate": 7.685459940652819e-07, "loss": 0.47, "mean_token_accuracy": 0.8473000526428223, "num_tokens": 69337038.0, "step": 1814 }, { "epoch": 0.23088665564177585, "ewc_loss": 0.019236955791711807, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.663713429588825e-05, "grad_norm": 3.51900315284729, "learning_rate": 7.689699025010597e-07, "loss": 0.466, "mean_token_accuracy": 0.850031852722168, "num_tokens": 69379871.0, "step": 1815 }, { "epoch": 0.23101386592036638, "ewc_loss": 0.019245583564043045, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.672340532531962e-05, "grad_norm": 3.6490793228149414, "learning_rate": 7.693938109368376e-07, "loss": 0.4876, "mean_token_accuracy": 0.8432396650314331, "num_tokens": 69418021.0, "step": 1816 }, { "epoch": 0.23114107619895688, "ewc_loss": 0.01933346688747406, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.76022536936216e-05, "grad_norm": 3.565483808517456, "learning_rate": 7.698177193726155e-07, "loss": 0.468, "mean_token_accuracy": 0.8500598669052124, "num_tokens": 69455574.0, "step": 1817 }, { "epoch": 0.23126828647754738, "ewc_loss": 0.019241223111748695, "ewc_loss_diag": 1.2576580047607422e-05, "ewc_loss_parallel": 6.667980778729543e-05, "grad_norm": 3.643218755722046, "learning_rate": 7.702416278083933e-07, "loss": 0.4914, "mean_token_accuracy": 0.8436911702156067, "num_tokens": 69489822.0, "step": 1818 }, { "epoch": 0.2313954967561379, "ewc_loss": 0.019390905275940895, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 6.756627408321947e-05, "grad_norm": 3.5248987674713135, "learning_rate": 7.706655362441712e-07, "loss": 0.4616, "mean_token_accuracy": 0.8516119122505188, "num_tokens": 69532821.0, "step": 1819 }, { "epoch": 0.2315227070347284, "ewc_loss": 0.019298307597637177, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 6.664029933745041e-05, "grad_norm": 3.6503853797912598, "learning_rate": 7.710894446799491e-07, "loss": 0.4457, "mean_token_accuracy": 0.8561000227928162, "num_tokens": 69565674.0, "step": 1820 }, { "epoch": 0.2316499173133189, "ewc_loss": 0.019412800669670105, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 6.778522947570309e-05, "grad_norm": 3.6070075035095215, "learning_rate": 7.715133531157269e-07, "loss": 0.4474, "mean_token_accuracy": 0.8592989444732666, "num_tokens": 69606118.0, "step": 1821 }, { "epoch": 0.23177712759190944, "ewc_loss": 0.019336795434355736, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 6.702517566736788e-05, "grad_norm": 3.5621767044067383, "learning_rate": 7.719372615515049e-07, "loss": 0.5155, "mean_token_accuracy": 0.8371456265449524, "num_tokens": 69650102.0, "step": 1822 }, { "epoch": 0.23190433787049994, "ewc_loss": 0.019363276660442352, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 6.728999869665131e-05, "grad_norm": 3.5385520458221436, "learning_rate": 7.723611699872827e-07, "loss": 0.4612, "mean_token_accuracy": 0.851064145565033, "num_tokens": 69693440.0, "step": 1823 }, { "epoch": 0.23203154814909044, "ewc_loss": 0.019359124824404716, "ewc_loss_diag": 1.2636184692382812e-05, "ewc_loss_parallel": 6.724847480654716e-05, "grad_norm": 3.5869860649108887, "learning_rate": 7.727850784230606e-07, "loss": 0.4807, "mean_token_accuracy": 0.8431075215339661, "num_tokens": 69732651.0, "step": 1824 }, { "epoch": 0.23215875842768097, "ewc_loss": 0.01945822685956955, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.762914563296363e-05, "grad_norm": 3.778731107711792, "learning_rate": 7.732089868588385e-07, "loss": 0.4971, "mean_token_accuracy": 0.8445786237716675, "num_tokens": 69769915.0, "step": 1825 }, { "epoch": 0.23228596870627147, "ewc_loss": 0.019536344334483147, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.84103142702952e-05, "grad_norm": 3.5229392051696777, "learning_rate": 7.736328952946163e-07, "loss": 0.4187, "mean_token_accuracy": 0.8667604327201843, "num_tokens": 69813943.0, "step": 1826 }, { "epoch": 0.23241317898486197, "ewc_loss": 0.01935531385242939, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.660001236014068e-05, "grad_norm": 3.6356539726257324, "learning_rate": 7.740568037303942e-07, "loss": 0.4482, "mean_token_accuracy": 0.8548154234886169, "num_tokens": 69847039.0, "step": 1827 }, { "epoch": 0.2325403892634525, "ewc_loss": 0.019522719085216522, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.827405741205439e-05, "grad_norm": 3.541358232498169, "learning_rate": 7.744807121661721e-07, "loss": 0.4125, "mean_token_accuracy": 0.8698019981384277, "num_tokens": 69884588.0, "step": 1828 }, { "epoch": 0.232667599542043, "ewc_loss": 0.01941017061471939, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.714859046041965e-05, "grad_norm": 3.541856288909912, "learning_rate": 7.749046206019499e-07, "loss": 0.4765, "mean_token_accuracy": 0.8492757081985474, "num_tokens": 69925552.0, "step": 1829 }, { "epoch": 0.2327948098206335, "ewc_loss": 0.01947782188653946, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.78250944474712e-05, "grad_norm": 3.5964274406433105, "learning_rate": 7.753285290377279e-07, "loss": 0.4114, "mean_token_accuracy": 0.8667187094688416, "num_tokens": 69961776.0, "step": 1830 }, { "epoch": 0.23292202009922403, "ewc_loss": 0.019484825432300568, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.789514009142295e-05, "grad_norm": 3.6317667961120605, "learning_rate": 7.757524374735057e-07, "loss": 0.5184, "mean_token_accuracy": 0.834641695022583, "num_tokens": 69999775.0, "step": 1831 }, { "epoch": 0.23304923037781453, "ewc_loss": 0.019470486789941788, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.775175279472023e-05, "grad_norm": 3.5476012229919434, "learning_rate": 7.761763459092836e-07, "loss": 0.4283, "mean_token_accuracy": 0.864540159702301, "num_tokens": 70037294.0, "step": 1832 }, { "epoch": 0.23317644065640503, "ewc_loss": 0.019453147426247597, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.757835217285901e-05, "grad_norm": 3.6427903175354004, "learning_rate": 7.766002543450614e-07, "loss": 0.4634, "mean_token_accuracy": 0.8529769778251648, "num_tokens": 70075014.0, "step": 1833 }, { "epoch": 0.23330365093499555, "ewc_loss": 0.019516654312610626, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.821341958129779e-05, "grad_norm": 3.5497798919677734, "learning_rate": 7.770241627808392e-07, "loss": 0.421, "mean_token_accuracy": 0.8647183179855347, "num_tokens": 70112186.0, "step": 1834 }, { "epoch": 0.23343086121358606, "ewc_loss": 0.019446346908807755, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.751033652108163e-05, "grad_norm": 3.566553831100464, "learning_rate": 7.774480712166172e-07, "loss": 0.4967, "mean_token_accuracy": 0.8421038389205933, "num_tokens": 70153285.0, "step": 1835 }, { "epoch": 0.23355807149217656, "ewc_loss": 0.0195250753313303, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.829763151472434e-05, "grad_norm": 3.653857707977295, "learning_rate": 7.77871979652395e-07, "loss": 0.4802, "mean_token_accuracy": 0.8474130034446716, "num_tokens": 70193382.0, "step": 1836 }, { "epoch": 0.23368528177076708, "ewc_loss": 0.01954437606036663, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.849063356639817e-05, "grad_norm": 3.6457080841064453, "learning_rate": 7.782958880881729e-07, "loss": 0.4695, "mean_token_accuracy": 0.8499224781990051, "num_tokens": 70227279.0, "step": 1837 }, { "epoch": 0.23381249204935758, "ewc_loss": 0.019513316452503204, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.818004476372153e-05, "grad_norm": 3.5926506519317627, "learning_rate": 7.787197965239508e-07, "loss": 0.4328, "mean_token_accuracy": 0.8587912917137146, "num_tokens": 70264395.0, "step": 1838 }, { "epoch": 0.23393970232794808, "ewc_loss": 0.019501350820064545, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.806039164075628e-05, "grad_norm": 3.733849048614502, "learning_rate": 7.791437049597287e-07, "loss": 0.5461, "mean_token_accuracy": 0.8276908993721008, "num_tokens": 70295016.0, "step": 1839 }, { "epoch": 0.2340669126065386, "ewc_loss": 0.019589755684137344, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.894442049087957e-05, "grad_norm": 3.6108133792877197, "learning_rate": 7.795676133955065e-07, "loss": 0.49, "mean_token_accuracy": 0.8452256917953491, "num_tokens": 70332774.0, "step": 1840 }, { "epoch": 0.2341941228851291, "ewc_loss": 0.019506439566612244, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.811126513639465e-05, "grad_norm": 3.617948293685913, "learning_rate": 7.799915218312844e-07, "loss": 0.476, "mean_token_accuracy": 0.8489394783973694, "num_tokens": 70366880.0, "step": 1841 }, { "epoch": 0.23432133316371964, "ewc_loss": 0.0196259543299675, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 6.869607750559226e-05, "grad_norm": 3.6288299560546875, "learning_rate": 7.804154302670622e-07, "loss": 0.4516, "mean_token_accuracy": 0.8553858995437622, "num_tokens": 70405235.0, "step": 1842 }, { "epoch": 0.23444854344231014, "ewc_loss": 0.01961902715265751, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 6.862678856123239e-05, "grad_norm": 3.566518783569336, "learning_rate": 7.808393387028402e-07, "loss": 0.4606, "mean_token_accuracy": 0.8524911403656006, "num_tokens": 70446503.0, "step": 1843 }, { "epoch": 0.23457575372090064, "ewc_loss": 0.01958901435136795, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 6.832666258560494e-05, "grad_norm": 3.6108500957489014, "learning_rate": 7.81263247138618e-07, "loss": 0.5119, "mean_token_accuracy": 0.8370452523231506, "num_tokens": 70483514.0, "step": 1844 }, { "epoch": 0.23470296399949117, "ewc_loss": 0.01958978921175003, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.894477701280266e-05, "grad_norm": 3.5800678730010986, "learning_rate": 7.816871555743959e-07, "loss": 0.473, "mean_token_accuracy": 0.8487594127655029, "num_tokens": 70520944.0, "step": 1845 }, { "epoch": 0.23483017427808167, "ewc_loss": 0.01954510249197483, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.849789497209713e-05, "grad_norm": 3.6011414527893066, "learning_rate": 7.821110640101738e-07, "loss": 0.4961, "mean_token_accuracy": 0.8393306136131287, "num_tokens": 70564399.0, "step": 1846 }, { "epoch": 0.23495738455667217, "ewc_loss": 0.01957646757364273, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.881156150484458e-05, "grad_norm": 3.612942695617676, "learning_rate": 7.825349724459517e-07, "loss": 0.4703, "mean_token_accuracy": 0.8511171936988831, "num_tokens": 70604000.0, "step": 1847 }, { "epoch": 0.2350845948352627, "ewc_loss": 0.019582586362957954, "ewc_loss_diag": 1.2695789337158203e-05, "ewc_loss_parallel": 6.887274503242224e-05, "grad_norm": 3.623976945877075, "learning_rate": 7.829588808817294e-07, "loss": 0.4631, "mean_token_accuracy": 0.8514880537986755, "num_tokens": 70645133.0, "step": 1848 }, { "epoch": 0.2352118051138532, "ewc_loss": 0.01965205930173397, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 6.895711703691632e-05, "grad_norm": 3.686023473739624, "learning_rate": 7.833827893175074e-07, "loss": 0.4968, "mean_token_accuracy": 0.8362826108932495, "num_tokens": 70675824.0, "step": 1849 }, { "epoch": 0.2353390153924437, "ewc_loss": 0.01968434825539589, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 6.927999493200332e-05, "grad_norm": 3.5914294719696045, "learning_rate": 7.838066977532852e-07, "loss": 0.4598, "mean_token_accuracy": 0.8524188995361328, "num_tokens": 70714103.0, "step": 1850 }, { "epoch": 0.23546622567103423, "ewc_loss": 0.019621698185801506, "ewc_loss_diag": 1.2755393981933594e-05, "ewc_loss_parallel": 6.865350587759167e-05, "grad_norm": 3.596550226211548, "learning_rate": 7.842306061890632e-07, "loss": 0.5071, "mean_token_accuracy": 0.840623140335083, "num_tokens": 70754265.0, "step": 1851 }, { "epoch": 0.23559343594962473, "ewc_loss": 0.019736338406801224, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 6.91895475029014e-05, "grad_norm": 3.6274771690368652, "learning_rate": 7.84654514624841e-07, "loss": 0.4781, "mean_token_accuracy": 0.8448072671890259, "num_tokens": 70791368.0, "step": 1852 }, { "epoch": 0.23572064622821523, "ewc_loss": 0.019740702584385872, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 6.92332032485865e-05, "grad_norm": 3.643998861312866, "learning_rate": 7.850784230606188e-07, "loss": 0.5209, "mean_token_accuracy": 0.8378887176513672, "num_tokens": 70827397.0, "step": 1853 }, { "epoch": 0.23584785650680576, "ewc_loss": 0.019744550809264183, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 6.927167851245031e-05, "grad_norm": 3.5114998817443848, "learning_rate": 7.855023314963968e-07, "loss": 0.4915, "mean_token_accuracy": 0.8452508449554443, "num_tokens": 70872503.0, "step": 1854 }, { "epoch": 0.23597506678539626, "ewc_loss": 0.01974516734480858, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.866750482004136e-05, "grad_norm": 3.6223249435424805, "learning_rate": 7.859262399321746e-07, "loss": 0.4922, "mean_token_accuracy": 0.8418801426887512, "num_tokens": 70914376.0, "step": 1855 }, { "epoch": 0.23610227706398676, "ewc_loss": 0.01987173967063427, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.993321585468948e-05, "grad_norm": 3.603285551071167, "learning_rate": 7.863501483679524e-07, "loss": 0.4498, "mean_token_accuracy": 0.8558545112609863, "num_tokens": 70954004.0, "step": 1856 }, { "epoch": 0.2362294873425773, "ewc_loss": 0.0197917427867651, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.913325341884047e-05, "grad_norm": 3.626479148864746, "learning_rate": 7.867740568037303e-07, "loss": 0.4492, "mean_token_accuracy": 0.852851927280426, "num_tokens": 70991765.0, "step": 1857 }, { "epoch": 0.2363566976211678, "ewc_loss": 0.01984361559152603, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.965197826502845e-05, "grad_norm": 3.6492340564727783, "learning_rate": 7.871979652395082e-07, "loss": 0.4784, "mean_token_accuracy": 0.8442214727401733, "num_tokens": 71025478.0, "step": 1858 }, { "epoch": 0.2364839078997583, "ewc_loss": 0.019846171140670776, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.967753870412707e-05, "grad_norm": 3.5536422729492188, "learning_rate": 7.876218736752861e-07, "loss": 0.4331, "mean_token_accuracy": 0.8642735481262207, "num_tokens": 71067287.0, "step": 1859 }, { "epoch": 0.23661111817834882, "ewc_loss": 0.019802067428827286, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.923649198142812e-05, "grad_norm": 3.6986255645751953, "learning_rate": 7.88045782111064e-07, "loss": 0.5011, "mean_token_accuracy": 0.8415451645851135, "num_tokens": 71108095.0, "step": 1860 }, { "epoch": 0.23673832845693932, "ewc_loss": 0.01989508979022503, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 7.016671588644385e-05, "grad_norm": 3.5710818767547607, "learning_rate": 7.884696905468418e-07, "loss": 0.4362, "mean_token_accuracy": 0.8605928421020508, "num_tokens": 71150297.0, "step": 1861 }, { "epoch": 0.23686553873552982, "ewc_loss": 0.01978279836475849, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.904381007188931e-05, "grad_norm": 3.6165566444396973, "learning_rate": 7.888935989826198e-07, "loss": 0.4842, "mean_token_accuracy": 0.843148946762085, "num_tokens": 71189205.0, "step": 1862 }, { "epoch": 0.23699274901412035, "ewc_loss": 0.019788116216659546, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 6.970732647459954e-05, "grad_norm": 3.615438461303711, "learning_rate": 7.893175074183976e-07, "loss": 0.4726, "mean_token_accuracy": 0.8498004674911499, "num_tokens": 71228141.0, "step": 1863 }, { "epoch": 0.23711995929271085, "ewc_loss": 0.01982821151614189, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.949794624233618e-05, "grad_norm": 3.5992908477783203, "learning_rate": 7.897414158541754e-07, "loss": 0.4723, "mean_token_accuracy": 0.8488181829452515, "num_tokens": 71267854.0, "step": 1864 }, { "epoch": 0.23724716957130135, "ewc_loss": 0.019753897562623024, "ewc_loss_diag": 1.2814998626708984e-05, "ewc_loss_parallel": 6.936515273991972e-05, "grad_norm": 3.627060651779175, "learning_rate": 7.901653242899533e-07, "loss": 0.5078, "mean_token_accuracy": 0.8430553078651428, "num_tokens": 71305030.0, "step": 1865 }, { "epoch": 0.23737437984989188, "ewc_loss": 0.019853509962558746, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.975092401262373e-05, "grad_norm": 3.643597364425659, "learning_rate": 7.905892327257312e-07, "loss": 0.4649, "mean_token_accuracy": 0.850278377532959, "num_tokens": 71342202.0, "step": 1866 }, { "epoch": 0.23750159012848238, "ewc_loss": 0.019841499626636505, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.963080522837117e-05, "grad_norm": 3.636145830154419, "learning_rate": 7.910131411615091e-07, "loss": 0.4655, "mean_token_accuracy": 0.8540281057357788, "num_tokens": 71379702.0, "step": 1867 }, { "epoch": 0.2376288004070729, "ewc_loss": 0.019842471927404404, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.964054045965895e-05, "grad_norm": 3.6125874519348145, "learning_rate": 7.91437049597287e-07, "loss": 0.4661, "mean_token_accuracy": 0.8499804735183716, "num_tokens": 71417785.0, "step": 1868 }, { "epoch": 0.2377560106856634, "ewc_loss": 0.019853033125400543, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.974615098442882e-05, "grad_norm": 3.6291306018829346, "learning_rate": 7.918609580330648e-07, "loss": 0.5102, "mean_token_accuracy": 0.837540864944458, "num_tokens": 71461358.0, "step": 1869 }, { "epoch": 0.2378832209642539, "ewc_loss": 0.01991475187242031, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 6.975299038458616e-05, "grad_norm": 3.677976369857788, "learning_rate": 7.922848664688428e-07, "loss": 0.5073, "mean_token_accuracy": 0.8411716222763062, "num_tokens": 71500059.0, "step": 1870 }, { "epoch": 0.23801043124284443, "ewc_loss": 0.019874513149261475, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.996095908107236e-05, "grad_norm": 3.6113152503967285, "learning_rate": 7.927087749046205e-07, "loss": 0.4191, "mean_token_accuracy": 0.8665347099304199, "num_tokens": 71537043.0, "step": 1871 }, { "epoch": 0.23813764152143493, "ewc_loss": 0.019836444407701492, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.958026642678306e-05, "grad_norm": 3.731765031814575, "learning_rate": 7.931326833403983e-07, "loss": 0.4538, "mean_token_accuracy": 0.8536233901977539, "num_tokens": 71570671.0, "step": 1872 }, { "epoch": 0.23826485180002543, "ewc_loss": 0.019912337884306908, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 7.033919246168807e-05, "grad_norm": 3.6641886234283447, "learning_rate": 7.935565917761763e-07, "loss": 0.4627, "mean_token_accuracy": 0.8525947332382202, "num_tokens": 71614813.0, "step": 1873 }, { "epoch": 0.23839206207861596, "ewc_loss": 0.01984863169491291, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 6.970213871682063e-05, "grad_norm": 3.703568696975708, "learning_rate": 7.939805002119541e-07, "loss": 0.4782, "mean_token_accuracy": 0.8468068838119507, "num_tokens": 71650441.0, "step": 1874 }, { "epoch": 0.23851927235720646, "ewc_loss": 0.01989293284714222, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 7.014514994807541e-05, "grad_norm": 3.6950571537017822, "learning_rate": 7.944044086477321e-07, "loss": 0.4854, "mean_token_accuracy": 0.8461837768554688, "num_tokens": 71690425.0, "step": 1875 }, { "epoch": 0.23864648263579696, "ewc_loss": 0.019926825538277626, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 6.987372762523592e-05, "grad_norm": 3.6445538997650146, "learning_rate": 7.948283170835099e-07, "loss": 0.4329, "mean_token_accuracy": 0.8595880270004272, "num_tokens": 71727349.0, "step": 1876 }, { "epoch": 0.2387736929143875, "ewc_loss": 0.019905155524611473, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 6.965702050365508e-05, "grad_norm": 3.559890031814575, "learning_rate": 7.952522255192878e-07, "loss": 0.4006, "mean_token_accuracy": 0.870829701423645, "num_tokens": 71770386.0, "step": 1877 }, { "epoch": 0.238900903192978, "ewc_loss": 0.01990503817796707, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 6.96558563504368e-05, "grad_norm": 3.6865246295928955, "learning_rate": 7.956761339550657e-07, "loss": 0.4294, "mean_token_accuracy": 0.8624931573867798, "num_tokens": 71808946.0, "step": 1878 }, { "epoch": 0.2390281134715685, "ewc_loss": 0.019933871924877167, "ewc_loss_diag": 1.2874603271484375e-05, "ewc_loss_parallel": 7.055454625515267e-05, "grad_norm": 3.787320375442505, "learning_rate": 7.961000423908435e-07, "loss": 0.4722, "mean_token_accuracy": 0.8547173738479614, "num_tokens": 71848605.0, "step": 1879 }, { "epoch": 0.23915532375015902, "ewc_loss": 0.01998227834701538, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.042824290692806e-05, "grad_norm": 3.6188225746154785, "learning_rate": 7.965239508266214e-07, "loss": 0.4505, "mean_token_accuracy": 0.8550348281860352, "num_tokens": 71886771.0, "step": 1880 }, { "epoch": 0.23928253402874952, "ewc_loss": 0.019895661622285843, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 6.956208380870521e-05, "grad_norm": 3.5988495349884033, "learning_rate": 7.969478592623993e-07, "loss": 0.4279, "mean_token_accuracy": 0.8607767224311829, "num_tokens": 71924905.0, "step": 1881 }, { "epoch": 0.23940974430734002, "ewc_loss": 0.019949475303292274, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.010022818576545e-05, "grad_norm": 3.6224911212921143, "learning_rate": 7.973717676981771e-07, "loss": 0.455, "mean_token_accuracy": 0.855420708656311, "num_tokens": 71964513.0, "step": 1882 }, { "epoch": 0.23953695458593055, "ewc_loss": 0.019958559423685074, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.01910539646633e-05, "grad_norm": 3.6376216411590576, "learning_rate": 7.977956761339551e-07, "loss": 0.4269, "mean_token_accuracy": 0.8620023727416992, "num_tokens": 72000918.0, "step": 1883 }, { "epoch": 0.23966416486452105, "ewc_loss": 0.0199607964605093, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.021343481028453e-05, "grad_norm": 3.667123794555664, "learning_rate": 7.982195845697329e-07, "loss": 0.5552, "mean_token_accuracy": 0.8242529034614563, "num_tokens": 72040840.0, "step": 1884 }, { "epoch": 0.23979137514311155, "ewc_loss": 0.019983313977718353, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.043861842248589e-05, "grad_norm": 3.676271677017212, "learning_rate": 7.986434930055108e-07, "loss": 0.4288, "mean_token_accuracy": 0.8605855107307434, "num_tokens": 72076757.0, "step": 1885 }, { "epoch": 0.23991858542170208, "ewc_loss": 0.01998516544699669, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.045711390674114e-05, "grad_norm": 3.6051013469696045, "learning_rate": 7.990674014412886e-07, "loss": 0.4833, "mean_token_accuracy": 0.8470567464828491, "num_tokens": 72120760.0, "step": 1886 }, { "epoch": 0.24004579570029258, "ewc_loss": 0.019940122961997986, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.000670302659273e-05, "grad_norm": 3.637420415878296, "learning_rate": 7.994913098770665e-07, "loss": 0.4712, "mean_token_accuracy": 0.8490802645683289, "num_tokens": 72159985.0, "step": 1887 }, { "epoch": 0.24017300597888308, "ewc_loss": 0.020002536475658417, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.063082739477977e-05, "grad_norm": 3.6714606285095215, "learning_rate": 7.999152183128444e-07, "loss": 0.5042, "mean_token_accuracy": 0.8382083177566528, "num_tokens": 72196377.0, "step": 1888 }, { "epoch": 0.2403002162574736, "ewc_loss": 0.02001134306192398, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.071888830978423e-05, "grad_norm": 3.608968496322632, "learning_rate": 8.003391267486223e-07, "loss": 0.451, "mean_token_accuracy": 0.8554834127426147, "num_tokens": 72236585.0, "step": 1889 }, { "epoch": 0.2404274265360641, "ewc_loss": 0.019972607493400574, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.033154543023556e-05, "grad_norm": 3.759766101837158, "learning_rate": 8.007630351844001e-07, "loss": 0.4644, "mean_token_accuracy": 0.8503821492195129, "num_tokens": 72268450.0, "step": 1890 }, { "epoch": 0.24055463681465464, "ewc_loss": 0.020090920850634575, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.151468162192032e-05, "grad_norm": 3.6348702907562256, "learning_rate": 8.011869436201781e-07, "loss": 0.45, "mean_token_accuracy": 0.8558470010757446, "num_tokens": 72304092.0, "step": 1891 }, { "epoch": 0.24068184709324514, "ewc_loss": 0.01996772363781929, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.028270192677155e-05, "grad_norm": 3.640226125717163, "learning_rate": 8.016108520559559e-07, "loss": 0.4755, "mean_token_accuracy": 0.8480489253997803, "num_tokens": 72347312.0, "step": 1892 }, { "epoch": 0.24080905737183564, "ewc_loss": 0.020043885335326195, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.104432006599382e-05, "grad_norm": 3.697704792022705, "learning_rate": 8.020347604917338e-07, "loss": 0.4398, "mean_token_accuracy": 0.8585414886474609, "num_tokens": 72382135.0, "step": 1893 }, { "epoch": 0.24093626765042617, "ewc_loss": 0.02005033567547798, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.110883598215878e-05, "grad_norm": 3.6417922973632812, "learning_rate": 8.024586689275116e-07, "loss": 0.4645, "mean_token_accuracy": 0.851353645324707, "num_tokens": 72421526.0, "step": 1894 }, { "epoch": 0.24106347792901667, "ewc_loss": 0.02002066746354103, "ewc_loss_diag": 1.2934207916259766e-05, "ewc_loss_parallel": 7.081215153448284e-05, "grad_norm": 3.7003653049468994, "learning_rate": 8.028825773632894e-07, "loss": 0.4679, "mean_token_accuracy": 0.8530167937278748, "num_tokens": 72458592.0, "step": 1895 }, { "epoch": 0.24119068820760717, "ewc_loss": 0.020210664719343185, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 7.149141310947016e-05, "grad_norm": 3.67014479637146, "learning_rate": 8.033064857990674e-07, "loss": 0.3996, "mean_token_accuracy": 0.8713433742523193, "num_tokens": 72494734.0, "step": 1896 }, { "epoch": 0.2413178984861977, "ewc_loss": 0.02017161250114441, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 7.110088336048648e-05, "grad_norm": 3.6330819129943848, "learning_rate": 8.037303942348452e-07, "loss": 0.4487, "mean_token_accuracy": 0.8575983047485352, "num_tokens": 72531560.0, "step": 1897 }, { "epoch": 0.2414451087647882, "ewc_loss": 0.020238012075424194, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 7.115454354789108e-05, "grad_norm": 3.65246319770813, "learning_rate": 8.041543026706231e-07, "loss": 0.4459, "mean_token_accuracy": 0.8577075004577637, "num_tokens": 72570144.0, "step": 1898 }, { "epoch": 0.2415723190433787, "ewc_loss": 0.02025013603270054, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 7.12757755536586e-05, "grad_norm": 3.6092162132263184, "learning_rate": 8.04578211106401e-07, "loss": 0.4548, "mean_token_accuracy": 0.8528503179550171, "num_tokens": 72606738.0, "step": 1899 }, { "epoch": 0.24169952932196923, "ewc_loss": 0.02024148963391781, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 7.118930807337165e-05, "grad_norm": 3.629671573638916, "learning_rate": 8.050021195421789e-07, "loss": 0.4987, "mean_token_accuracy": 0.8418755531311035, "num_tokens": 72646230.0, "step": 1900 }, { "epoch": 0.24182673960055973, "ewc_loss": 0.02027742937207222, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 7.154871127568185e-05, "grad_norm": 3.6634232997894287, "learning_rate": 8.054260279779567e-07, "loss": 0.4683, "mean_token_accuracy": 0.8477305173873901, "num_tokens": 72681811.0, "step": 1901 }, { "epoch": 0.24195394987915023, "ewc_loss": 0.02031455561518669, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 7.191998156486079e-05, "grad_norm": 3.6585512161254883, "learning_rate": 8.058499364137346e-07, "loss": 0.5038, "mean_token_accuracy": 0.8403576016426086, "num_tokens": 72721805.0, "step": 1902 }, { "epoch": 0.24208116015774075, "ewc_loss": 0.020159486681222916, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 7.158998778322712e-05, "grad_norm": 3.581902027130127, "learning_rate": 8.062738448495124e-07, "loss": 0.4192, "mean_token_accuracy": 0.865373432636261, "num_tokens": 72763142.0, "step": 1903 }, { "epoch": 0.24220837043633126, "ewc_loss": 0.02027473971247673, "ewc_loss_diag": 1.3113021850585938e-05, "ewc_loss_parallel": 7.15218047844246e-05, "grad_norm": 3.6249899864196777, "learning_rate": 8.066977532852904e-07, "loss": 0.4101, "mean_token_accuracy": 0.8728716373443604, "num_tokens": 72802047.0, "step": 1904 }, { "epoch": 0.24233558071492176, "ewc_loss": 0.020202098414301872, "ewc_loss_diag": 1.2993812561035156e-05, "ewc_loss_parallel": 7.201609696494415e-05, "grad_norm": 3.7343640327453613, "learning_rate": 8.071216617210682e-07, "loss": 0.4858, "mean_token_accuracy": 0.8461506366729736, "num_tokens": 72834470.0, "step": 1905 }, { "epoch": 0.24246279099351228, "ewc_loss": 0.020290223881602287, "ewc_loss_diag": 1.3053417205810547e-05, "ewc_loss_parallel": 7.228700997075066e-05, "grad_norm": 3.634253978729248, "learning_rate": 8.075455701568461e-07, "loss": 0.457, "mean_token_accuracy": 0.8546128273010254, "num_tokens": 72873101.0, "step": 1906 }, { "epoch": 0.24259000127210278, "ewc_loss": 0.02034190110862255, "ewc_loss_diag": 1.3172626495361328e-05, "ewc_loss_parallel": 7.158306834753603e-05, "grad_norm": 3.6048080921173096, "learning_rate": 8.07969478592624e-07, "loss": 0.4382, "mean_token_accuracy": 0.8623783588409424, "num_tokens": 72915504.0, "step": 1907 }, { "epoch": 0.24271721155069328, "ewc_loss": 0.020377114415168762, "ewc_loss_diag": 1.3172626495361328e-05, "ewc_loss_parallel": 7.193519559223205e-05, "grad_norm": 3.6874582767486572, "learning_rate": 8.083933870284019e-07, "loss": 0.412, "mean_token_accuracy": 0.8697307109832764, "num_tokens": 72947179.0, "step": 1908 }, { "epoch": 0.2428444218292838, "ewc_loss": 0.020413033664226532, "ewc_loss_diag": 1.3172626495361328e-05, "ewc_loss_parallel": 7.229439506772906e-05, "grad_norm": 3.6715705394744873, "learning_rate": 8.088172954641796e-07, "loss": 0.4514, "mean_token_accuracy": 0.8554095029830933, "num_tokens": 72983313.0, "step": 1909 }, { "epoch": 0.2429716321078743, "ewc_loss": 0.02039152942597866, "ewc_loss_diag": 1.3172626495361328e-05, "ewc_loss_parallel": 7.207936141639948e-05, "grad_norm": 3.760650157928467, "learning_rate": 8.092412038999576e-07, "loss": 0.4997, "mean_token_accuracy": 0.8423085808753967, "num_tokens": 73014218.0, "step": 1910 }, { "epoch": 0.24309884238646481, "ewc_loss": 0.020444586873054504, "ewc_loss_diag": 1.3172626495361328e-05, "ewc_loss_parallel": 7.260993152158335e-05, "grad_norm": 3.7163288593292236, "learning_rate": 8.096651123357354e-07, "loss": 0.51, "mean_token_accuracy": 0.8345142602920532, "num_tokens": 73048282.0, "step": 1911 }, { "epoch": 0.24322605266505534, "ewc_loss": 0.02041386067867279, "ewc_loss_diag": 1.3172626495361328e-05, "ewc_loss_parallel": 7.2302675107494e-05, "grad_norm": 3.742595911026001, "learning_rate": 8.100890207715134e-07, "loss": 0.4973, "mean_token_accuracy": 0.8420407772064209, "num_tokens": 73084390.0, "step": 1912 }, { "epoch": 0.24335326294364584, "ewc_loss": 0.020495861768722534, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.251232455018908e-05, "grad_norm": 3.681124687194824, "learning_rate": 8.105129292072912e-07, "loss": 0.4644, "mean_token_accuracy": 0.8522320985794067, "num_tokens": 73123296.0, "step": 1913 }, { "epoch": 0.24348047322223634, "ewc_loss": 0.0205227043479681, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.217039819806814e-05, "grad_norm": 3.7377641201019287, "learning_rate": 8.10936837643069e-07, "loss": 0.5072, "mean_token_accuracy": 0.8401477336883545, "num_tokens": 73157533.0, "step": 1914 }, { "epoch": 0.24360768350082687, "ewc_loss": 0.020523210987448692, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.278582052094862e-05, "grad_norm": 3.6612708568573, "learning_rate": 8.11360746078847e-07, "loss": 0.4438, "mean_token_accuracy": 0.857085108757019, "num_tokens": 73195314.0, "step": 1915 }, { "epoch": 0.24373489377941737, "ewc_loss": 0.02052871324121952, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.223049033200368e-05, "grad_norm": 3.636107921600342, "learning_rate": 8.117846545146248e-07, "loss": 0.4124, "mean_token_accuracy": 0.8654029369354248, "num_tokens": 73236763.0, "step": 1916 }, { "epoch": 0.2438621040580079, "ewc_loss": 0.02054043859243393, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.234775694087148e-05, "grad_norm": 3.697446346282959, "learning_rate": 8.122085629504026e-07, "loss": 0.4993, "mean_token_accuracy": 0.8390320539474487, "num_tokens": 73276719.0, "step": 1917 }, { "epoch": 0.2439893143365984, "ewc_loss": 0.020582716912031174, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.27705264580436e-05, "grad_norm": 3.651430368423462, "learning_rate": 8.126324713861805e-07, "loss": 0.4665, "mean_token_accuracy": 0.8528737425804138, "num_tokens": 73318594.0, "step": 1918 }, { "epoch": 0.2441165246151889, "ewc_loss": 0.020549794659018517, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.244131120387465e-05, "grad_norm": 3.6339282989501953, "learning_rate": 8.130563798219584e-07, "loss": 0.4424, "mean_token_accuracy": 0.8607444167137146, "num_tokens": 73358760.0, "step": 1919 }, { "epoch": 0.24424373489377943, "ewc_loss": 0.02056543156504631, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.259767153300345e-05, "grad_norm": 3.690502643585205, "learning_rate": 8.134802882577363e-07, "loss": 0.4847, "mean_token_accuracy": 0.850072979927063, "num_tokens": 73398623.0, "step": 1920 }, { "epoch": 0.24437094517236993, "ewc_loss": 0.020597361028194427, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.291697693290189e-05, "grad_norm": 3.6488306522369385, "learning_rate": 8.139041966935142e-07, "loss": 0.4673, "mean_token_accuracy": 0.8529161214828491, "num_tokens": 73446369.0, "step": 1921 }, { "epoch": 0.24449815545096043, "ewc_loss": 0.02053855173289776, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.232888310682029e-05, "grad_norm": 3.7048025131225586, "learning_rate": 8.14328105129292e-07, "loss": 0.5267, "mean_token_accuracy": 0.8383501768112183, "num_tokens": 73483152.0, "step": 1922 }, { "epoch": 0.24462536572955096, "ewc_loss": 0.02061256393790245, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.306900806725025e-05, "grad_norm": 3.7292182445526123, "learning_rate": 8.1475201356507e-07, "loss": 0.4802, "mean_token_accuracy": 0.8471797704696655, "num_tokens": 73519179.0, "step": 1923 }, { "epoch": 0.24475257600814146, "ewc_loss": 0.02058735303580761, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.281688885996118e-05, "grad_norm": 3.689356803894043, "learning_rate": 8.151759220008477e-07, "loss": 0.4529, "mean_token_accuracy": 0.8539470434188843, "num_tokens": 73561947.0, "step": 1924 }, { "epoch": 0.24487978628673196, "ewc_loss": 0.020552482455968857, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.246817403938621e-05, "grad_norm": 3.7175087928771973, "learning_rate": 8.155998304366256e-07, "loss": 0.4676, "mean_token_accuracy": 0.85535728931427, "num_tokens": 73595841.0, "step": 1925 }, { "epoch": 0.2450069965653225, "ewc_loss": 0.020534569397568703, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.289940549526364e-05, "grad_norm": 3.770181655883789, "learning_rate": 8.160237388724035e-07, "loss": 0.4934, "mean_token_accuracy": 0.8418065309524536, "num_tokens": 73629716.0, "step": 1926 }, { "epoch": 0.245134206843913, "ewc_loss": 0.020550131797790527, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.305503822863102e-05, "grad_norm": 3.707207202911377, "learning_rate": 8.164476473081814e-07, "loss": 0.4809, "mean_token_accuracy": 0.8483375310897827, "num_tokens": 73667285.0, "step": 1927 }, { "epoch": 0.2452614171225035, "ewc_loss": 0.020494986325502396, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.250357157317922e-05, "grad_norm": 3.628448724746704, "learning_rate": 8.168715557439593e-07, "loss": 0.4254, "mean_token_accuracy": 0.8639601469039917, "num_tokens": 73708059.0, "step": 1928 }, { "epoch": 0.24538862740109402, "ewc_loss": 0.020499542355537415, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.254914089571685e-05, "grad_norm": 3.670210123062134, "learning_rate": 8.172954641797372e-07, "loss": 0.4385, "mean_token_accuracy": 0.8591117858886719, "num_tokens": 73748553.0, "step": 1929 }, { "epoch": 0.24551583767968452, "ewc_loss": 0.020525116473436356, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.280487625394017e-05, "grad_norm": 3.6647181510925293, "learning_rate": 8.17719372615515e-07, "loss": 0.5379, "mean_token_accuracy": 0.8295499086380005, "num_tokens": 73796295.0, "step": 1930 }, { "epoch": 0.24564304795827502, "ewc_loss": 0.020502407103776932, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.257777178892866e-05, "grad_norm": 3.706885576248169, "learning_rate": 8.18143281051293e-07, "loss": 0.5161, "mean_token_accuracy": 0.8383682370185852, "num_tokens": 73835960.0, "step": 1931 }, { "epoch": 0.24577025823686555, "ewc_loss": 0.020546870306134224, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.302241283468902e-05, "grad_norm": 3.689810037612915, "learning_rate": 8.185671894870707e-07, "loss": 0.5318, "mean_token_accuracy": 0.833008885383606, "num_tokens": 73875302.0, "step": 1932 }, { "epoch": 0.24589746851545605, "ewc_loss": 0.02051534876227379, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.270720379892737e-05, "grad_norm": 3.726483106613159, "learning_rate": 8.189910979228485e-07, "loss": 0.4242, "mean_token_accuracy": 0.862821102142334, "num_tokens": 73907703.0, "step": 1933 }, { "epoch": 0.24602467879404655, "ewc_loss": 0.020565474405884743, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.320845907088369e-05, "grad_norm": 3.7267050743103027, "learning_rate": 8.194150063586265e-07, "loss": 0.5193, "mean_token_accuracy": 0.8383136987686157, "num_tokens": 73945449.0, "step": 1934 }, { "epoch": 0.24615188907263708, "ewc_loss": 0.020539812743663788, "ewc_loss_diag": 1.3232231140136719e-05, "ewc_loss_parallel": 7.295183604583144e-05, "grad_norm": 3.7296242713928223, "learning_rate": 8.198389147944043e-07, "loss": 0.5057, "mean_token_accuracy": 0.8370054364204407, "num_tokens": 73982461.0, "step": 1935 }, { "epoch": 0.24627909935122758, "ewc_loss": 0.0206160731613636, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.310409273486584e-05, "grad_norm": 3.683140993118286, "learning_rate": 8.202628232301823e-07, "loss": 0.4788, "mean_token_accuracy": 0.8515331149101257, "num_tokens": 74018252.0, "step": 1936 }, { "epoch": 0.24640630962981808, "ewc_loss": 0.020603707060217857, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.298042328329757e-05, "grad_norm": 3.6155576705932617, "learning_rate": 8.206867316659601e-07, "loss": 0.5009, "mean_token_accuracy": 0.8386850357055664, "num_tokens": 74065843.0, "step": 1937 }, { "epoch": 0.2465335199084086, "ewc_loss": 0.02059594914317131, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.290283974725753e-05, "grad_norm": 3.645707368850708, "learning_rate": 8.21110640101738e-07, "loss": 0.3914, "mean_token_accuracy": 0.8719752430915833, "num_tokens": 74106389.0, "step": 1938 }, { "epoch": 0.2466607301869991, "ewc_loss": 0.020624756813049316, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.319092401303351e-05, "grad_norm": 3.6897642612457275, "learning_rate": 8.215345485375159e-07, "loss": 0.5117, "mean_token_accuracy": 0.8383891582489014, "num_tokens": 74150786.0, "step": 1939 }, { "epoch": 0.2467879404655896, "ewc_loss": 0.020643971860408783, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.338308932958171e-05, "grad_norm": 3.7091755867004395, "learning_rate": 8.219584569732937e-07, "loss": 0.4814, "mean_token_accuracy": 0.8435405492782593, "num_tokens": 74190474.0, "step": 1940 }, { "epoch": 0.24691515074418013, "ewc_loss": 0.020657401531934738, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.351736712735146e-05, "grad_norm": 3.702255964279175, "learning_rate": 8.223823654090715e-07, "loss": 0.4321, "mean_token_accuracy": 0.8621881604194641, "num_tokens": 74226437.0, "step": 1941 }, { "epoch": 0.24704236102277063, "ewc_loss": 0.02062338963150978, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.317724521271884e-05, "grad_norm": 3.7213656902313232, "learning_rate": 8.228062738448495e-07, "loss": 0.4592, "mean_token_accuracy": 0.8511735200881958, "num_tokens": 74267292.0, "step": 1942 }, { "epoch": 0.24716957130136116, "ewc_loss": 0.0206461139023304, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.340448792092502e-05, "grad_norm": 3.6751787662506104, "learning_rate": 8.232301822806273e-07, "loss": 0.4855, "mean_token_accuracy": 0.8457046747207642, "num_tokens": 74306738.0, "step": 1943 }, { "epoch": 0.24729678157995166, "ewc_loss": 0.02061578258872032, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.310118962777779e-05, "grad_norm": 3.7914953231811523, "learning_rate": 8.236540907164053e-07, "loss": 0.4736, "mean_token_accuracy": 0.8506534099578857, "num_tokens": 74343151.0, "step": 1944 }, { "epoch": 0.24742399185854216, "ewc_loss": 0.02068919688463211, "ewc_loss_diag": 1.329183578491211e-05, "ewc_loss_parallel": 7.383533375104889e-05, "grad_norm": 3.7021777629852295, "learning_rate": 8.240779991521831e-07, "loss": 0.4511, "mean_token_accuracy": 0.8549516797065735, "num_tokens": 74382755.0, "step": 1945 }, { "epoch": 0.2475512021371327, "ewc_loss": 0.020674560219049454, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 7.307861960725859e-05, "grad_norm": 3.6700377464294434, "learning_rate": 8.24501907587961e-07, "loss": 0.487, "mean_token_accuracy": 0.8450083136558533, "num_tokens": 74426129.0, "step": 1946 }, { "epoch": 0.2476784124157232, "ewc_loss": 0.02079245075583458, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 7.36471702111885e-05, "grad_norm": 3.764285087585449, "learning_rate": 8.249258160237388e-07, "loss": 0.497, "mean_token_accuracy": 0.8477840423583984, "num_tokens": 74460904.0, "step": 1947 }, { "epoch": 0.2478056226943137, "ewc_loss": 0.02081485651433468, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 7.387122604995966e-05, "grad_norm": 3.721465826034546, "learning_rate": 8.253497244595167e-07, "loss": 0.4872, "mean_token_accuracy": 0.8452145457267761, "num_tokens": 74497993.0, "step": 1948 }, { "epoch": 0.24793283297290422, "ewc_loss": 0.020785052329301834, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 7.357317372225225e-05, "grad_norm": 3.7130165100097656, "learning_rate": 8.257736328952945e-07, "loss": 0.5073, "mean_token_accuracy": 0.8406184911727905, "num_tokens": 74541108.0, "step": 1949 }, { "epoch": 0.24806004325149472, "ewc_loss": 0.02075650356709957, "ewc_loss_diag": 1.33514404296875e-05, "ewc_loss_parallel": 7.389804522972554e-05, "grad_norm": 3.718987226486206, "learning_rate": 8.261975413310725e-07, "loss": 0.4757, "mean_token_accuracy": 0.8504207134246826, "num_tokens": 74579389.0, "step": 1950 }, { "epoch": 0.24818725353008522, "ewc_loss": 0.02082567662000656, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 7.397941953968257e-05, "grad_norm": 3.6341540813446045, "learning_rate": 8.266214497668503e-07, "loss": 0.4301, "mean_token_accuracy": 0.8646566867828369, "num_tokens": 74621546.0, "step": 1951 }, { "epoch": 0.24831446380867575, "ewc_loss": 0.020786819979548454, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 7.359086157521233e-05, "grad_norm": 3.7381482124328613, "learning_rate": 8.270453582026283e-07, "loss": 0.5091, "mean_token_accuracy": 0.8361477851867676, "num_tokens": 74660178.0, "step": 1952 }, { "epoch": 0.24844167408726625, "ewc_loss": 0.02088293619453907, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 7.455201557604596e-05, "grad_norm": 3.745589256286621, "learning_rate": 8.274692666384061e-07, "loss": 0.4561, "mean_token_accuracy": 0.8550292253494263, "num_tokens": 74693867.0, "step": 1953 }, { "epoch": 0.24856888436585675, "ewc_loss": 0.020827749744057655, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 7.400015601888299e-05, "grad_norm": 3.7493324279785156, "learning_rate": 8.27893175074184e-07, "loss": 0.4692, "mean_token_accuracy": 0.8489383459091187, "num_tokens": 74727561.0, "step": 1954 }, { "epoch": 0.24869609464444728, "ewc_loss": 0.020862963050603867, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 7.435229053953663e-05, "grad_norm": 3.7625346183776855, "learning_rate": 8.283170835099618e-07, "loss": 0.4725, "mean_token_accuracy": 0.8503892421722412, "num_tokens": 74763719.0, "step": 1955 }, { "epoch": 0.24882330492303778, "ewc_loss": 0.0208536759018898, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 7.425941294059157e-05, "grad_norm": 3.7238388061523438, "learning_rate": 8.287409919457396e-07, "loss": 0.4358, "mean_token_accuracy": 0.8595488667488098, "num_tokens": 74802666.0, "step": 1956 }, { "epoch": 0.24895051520162828, "ewc_loss": 0.020839212462306023, "ewc_loss_diag": 1.341104507446289e-05, "ewc_loss_parallel": 7.411477417917922e-05, "grad_norm": 3.7727696895599365, "learning_rate": 8.291649003815175e-07, "loss": 0.4872, "mean_token_accuracy": 0.8458899259567261, "num_tokens": 74840307.0, "step": 1957 }, { "epoch": 0.2490777254802188, "ewc_loss": 0.02093372493982315, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 7.444954826496542e-05, "grad_norm": 3.7089180946350098, "learning_rate": 8.295888088172954e-07, "loss": 0.4985, "mean_token_accuracy": 0.8439825773239136, "num_tokens": 74880181.0, "step": 1958 }, { "epoch": 0.2492049357588093, "ewc_loss": 0.020874518901109695, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 7.38574963179417e-05, "grad_norm": 3.690316915512085, "learning_rate": 8.300127172530733e-07, "loss": 0.4065, "mean_token_accuracy": 0.8680696487426758, "num_tokens": 74917148.0, "step": 1959 }, { "epoch": 0.2493321460373998, "ewc_loss": 0.020932340994477272, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 7.443571666954085e-05, "grad_norm": 3.760638952255249, "learning_rate": 8.304366256888512e-07, "loss": 0.4441, "mean_token_accuracy": 0.8563442826271057, "num_tokens": 74953124.0, "step": 1960 }, { "epoch": 0.24945935631599034, "ewc_loss": 0.020964795723557472, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 7.47602607589215e-05, "grad_norm": 3.8183627128601074, "learning_rate": 8.308605341246291e-07, "loss": 0.456, "mean_token_accuracy": 0.8541314005851746, "num_tokens": 74989726.0, "step": 1961 }, { "epoch": 0.24958656659458084, "ewc_loss": 0.020954225212335587, "ewc_loss_diag": 1.3470649719238281e-05, "ewc_loss_parallel": 7.465454837074503e-05, "grad_norm": 3.722459077835083, "learning_rate": 8.312844425604068e-07, "loss": 0.4584, "mean_token_accuracy": 0.8596965074539185, "num_tokens": 75024849.0, "step": 1962 }, { "epoch": 0.24971377687317134, "ewc_loss": 0.020963961258530617, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.414156425511464e-05, "grad_norm": 3.6593735218048096, "learning_rate": 8.317083509961848e-07, "loss": 0.4851, "mean_token_accuracy": 0.8479447960853577, "num_tokens": 75066464.0, "step": 1963 }, { "epoch": 0.24984098715176187, "ewc_loss": 0.02097698114812374, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.427176024066284e-05, "grad_norm": 3.68491530418396, "learning_rate": 8.321322594319626e-07, "loss": 0.4374, "mean_token_accuracy": 0.8592438697814941, "num_tokens": 75106829.0, "step": 1964 }, { "epoch": 0.24996819743035237, "ewc_loss": 0.021028263494372368, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.478459156118333e-05, "grad_norm": 3.720856189727783, "learning_rate": 8.325561678677405e-07, "loss": 0.492, "mean_token_accuracy": 0.8455315828323364, "num_tokens": 75148227.0, "step": 1965 }, { "epoch": 0.2500954077089429, "ewc_loss": 0.02103770337998867, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.487898255931213e-05, "grad_norm": 3.754596471786499, "learning_rate": 8.329800763035184e-07, "loss": 0.5039, "mean_token_accuracy": 0.8387046456336975, "num_tokens": 75188719.0, "step": 1966 }, { "epoch": 0.25022261798753337, "ewc_loss": 0.021048609167337418, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.498804916394874e-05, "grad_norm": 3.7819626331329346, "learning_rate": 8.334039847392963e-07, "loss": 0.4549, "mean_token_accuracy": 0.8524141311645508, "num_tokens": 75222260.0, "step": 1967 }, { "epoch": 0.2503498282661239, "ewc_loss": 0.021042823791503906, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.493018347304314e-05, "grad_norm": 3.732327938079834, "learning_rate": 8.338278931750742e-07, "loss": 0.4305, "mean_token_accuracy": 0.8624724745750427, "num_tokens": 75255489.0, "step": 1968 }, { "epoch": 0.2504770385447144, "ewc_loss": 0.021019699051976204, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.469893898814917e-05, "grad_norm": 3.731356620788574, "learning_rate": 8.342518016108521e-07, "loss": 0.4561, "mean_token_accuracy": 0.8511038422584534, "num_tokens": 75290548.0, "step": 1969 }, { "epoch": 0.2506042488233049, "ewc_loss": 0.02105526626110077, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.50546096242033e-05, "grad_norm": 3.798248767852783, "learning_rate": 8.346757100466298e-07, "loss": 0.458, "mean_token_accuracy": 0.8553021550178528, "num_tokens": 75323912.0, "step": 1970 }, { "epoch": 0.2507314591018954, "ewc_loss": 0.02109384536743164, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.544040272478014e-05, "grad_norm": 3.7936720848083496, "learning_rate": 8.350996184824078e-07, "loss": 0.4973, "mean_token_accuracy": 0.8393828272819519, "num_tokens": 75361206.0, "step": 1971 }, { "epoch": 0.25085866938048595, "ewc_loss": 0.021063197404146194, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.513393211411312e-05, "grad_norm": 3.7147107124328613, "learning_rate": 8.355235269181856e-07, "loss": 0.4775, "mean_token_accuracy": 0.846435546875, "num_tokens": 75397292.0, "step": 1972 }, { "epoch": 0.2509858796590764, "ewc_loss": 0.021048694849014282, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.49889004509896e-05, "grad_norm": 3.7050793170928955, "learning_rate": 8.359474353539635e-07, "loss": 0.4394, "mean_token_accuracy": 0.860836386680603, "num_tokens": 75437616.0, "step": 1973 }, { "epoch": 0.25111308993766696, "ewc_loss": 0.021075204014778137, "ewc_loss_diag": 1.3530254364013672e-05, "ewc_loss_parallel": 7.525398541474715e-05, "grad_norm": 3.699040412902832, "learning_rate": 8.363713437897414e-07, "loss": 0.5271, "mean_token_accuracy": 0.8347579836845398, "num_tokens": 75486172.0, "step": 1974 }, { "epoch": 0.2512403002162575, "ewc_loss": 0.021206559613347054, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.534684118581936e-05, "grad_norm": 3.725567102432251, "learning_rate": 8.367952522255193e-07, "loss": 0.4632, "mean_token_accuracy": 0.849098265171051, "num_tokens": 75526637.0, "step": 1975 }, { "epoch": 0.25136751049484796, "ewc_loss": 0.021222252398729324, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.550376903964207e-05, "grad_norm": 3.7691211700439453, "learning_rate": 8.372191606612972e-07, "loss": 0.4656, "mean_token_accuracy": 0.85411536693573, "num_tokens": 75564632.0, "step": 1976 }, { "epoch": 0.2514947207734385, "ewc_loss": 0.021237147971987724, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.565272971987724e-05, "grad_norm": 3.7795650959014893, "learning_rate": 8.376430690970749e-07, "loss": 0.4325, "mean_token_accuracy": 0.8632538914680481, "num_tokens": 75595553.0, "step": 1977 }, { "epoch": 0.251621931052029, "ewc_loss": 0.021213360130786896, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.541484956163913e-05, "grad_norm": 3.6786577701568604, "learning_rate": 8.380669775328528e-07, "loss": 0.4274, "mean_token_accuracy": 0.8638966083526611, "num_tokens": 75635847.0, "step": 1978 }, { "epoch": 0.25174914133061954, "ewc_loss": 0.021189652383327484, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.517777703469619e-05, "grad_norm": 3.7063660621643066, "learning_rate": 8.384908859686307e-07, "loss": 0.4256, "mean_token_accuracy": 0.8649804592132568, "num_tokens": 75677403.0, "step": 1979 }, { "epoch": 0.25187635160921, "ewc_loss": 0.02124529331922531, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.573419134132564e-05, "grad_norm": 3.7162058353424072, "learning_rate": 8.389147944044086e-07, "loss": 0.4667, "mean_token_accuracy": 0.8509351015090942, "num_tokens": 75720703.0, "step": 1980 }, { "epoch": 0.25200356188780054, "ewc_loss": 0.021234944462776184, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.563070539617911e-05, "grad_norm": 3.7761635780334473, "learning_rate": 8.393387028401864e-07, "loss": 0.4914, "mean_token_accuracy": 0.8441861271858215, "num_tokens": 75758000.0, "step": 1981 }, { "epoch": 0.25213077216639107, "ewc_loss": 0.02126089669764042, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.589021697640419e-05, "grad_norm": 3.8033173084259033, "learning_rate": 8.397626112759644e-07, "loss": 0.5167, "mean_token_accuracy": 0.8378827571868896, "num_tokens": 75791113.0, "step": 1982 }, { "epoch": 0.25225798244498154, "ewc_loss": 0.021248875185847282, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.577000360470265e-05, "grad_norm": 3.8339786529541016, "learning_rate": 8.401865197117422e-07, "loss": 0.463, "mean_token_accuracy": 0.8531385064125061, "num_tokens": 75821733.0, "step": 1983 }, { "epoch": 0.25238519272357207, "ewc_loss": 0.021274521946907043, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.602648111060262e-05, "grad_norm": 3.671114444732666, "learning_rate": 8.406104281475202e-07, "loss": 0.4415, "mean_token_accuracy": 0.8568891286849976, "num_tokens": 75861404.0, "step": 1984 }, { "epoch": 0.2525124030021626, "ewc_loss": 0.02119402401149273, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.52214909880422e-05, "grad_norm": 3.7031760215759277, "learning_rate": 8.410343365832979e-07, "loss": 0.4428, "mean_token_accuracy": 0.8575602769851685, "num_tokens": 75903059.0, "step": 1985 }, { "epoch": 0.2526396132807531, "ewc_loss": 0.021263975650072098, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.592100882902741e-05, "grad_norm": 3.6687769889831543, "learning_rate": 8.414582450190758e-07, "loss": 0.4546, "mean_token_accuracy": 0.8559460639953613, "num_tokens": 75947125.0, "step": 1986 }, { "epoch": 0.2527668235593436, "ewc_loss": 0.021247483789920807, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.575608469778672e-05, "grad_norm": 3.7789595127105713, "learning_rate": 8.418821534548537e-07, "loss": 0.5389, "mean_token_accuracy": 0.8310814499855042, "num_tokens": 75987519.0, "step": 1987 }, { "epoch": 0.25289403383793413, "ewc_loss": 0.021303629502654076, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.63175412430428e-05, "grad_norm": 3.7477872371673584, "learning_rate": 8.423060618906316e-07, "loss": 0.4648, "mean_token_accuracy": 0.8528885245323181, "num_tokens": 76027728.0, "step": 1988 }, { "epoch": 0.2530212441165246, "ewc_loss": 0.021244974806904793, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.573100447189063e-05, "grad_norm": 3.7164697647094727, "learning_rate": 8.427299703264095e-07, "loss": 0.4643, "mean_token_accuracy": 0.854374349117279, "num_tokens": 76070942.0, "step": 1989 }, { "epoch": 0.25314845439511513, "ewc_loss": 0.021256858482956886, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.584982813568786e-05, "grad_norm": 3.711625576019287, "learning_rate": 8.431538787621874e-07, "loss": 0.4881, "mean_token_accuracy": 0.8461430072784424, "num_tokens": 76113330.0, "step": 1990 }, { "epoch": 0.25327566467370566, "ewc_loss": 0.02127506583929062, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.603191625094041e-05, "grad_norm": 3.731153964996338, "learning_rate": 8.435777871979652e-07, "loss": 0.4433, "mean_token_accuracy": 0.8600254654884338, "num_tokens": 76153511.0, "step": 1991 }, { "epoch": 0.25340287495229613, "ewc_loss": 0.02128320001065731, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.611324690515175e-05, "grad_norm": 3.7591214179992676, "learning_rate": 8.440016956337432e-07, "loss": 0.4689, "mean_token_accuracy": 0.8530056476593018, "num_tokens": 76191505.0, "step": 1992 }, { "epoch": 0.25353008523088666, "ewc_loss": 0.021300768479704857, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.628893217770383e-05, "grad_norm": 3.7701616287231445, "learning_rate": 8.444256040695209e-07, "loss": 0.4842, "mean_token_accuracy": 0.8479595184326172, "num_tokens": 76232479.0, "step": 1993 }, { "epoch": 0.2536572955094772, "ewc_loss": 0.02129628136754036, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.624406862305477e-05, "grad_norm": 3.7357516288757324, "learning_rate": 8.448495125052988e-07, "loss": 0.4777, "mean_token_accuracy": 0.8499966859817505, "num_tokens": 76270192.0, "step": 1994 }, { "epoch": 0.25378450578806766, "ewc_loss": 0.02128329686820507, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.611421460751444e-05, "grad_norm": 3.7484538555145264, "learning_rate": 8.452734209410767e-07, "loss": 0.4176, "mean_token_accuracy": 0.8674191236495972, "num_tokens": 76303981.0, "step": 1995 }, { "epoch": 0.2539117160666582, "ewc_loss": 0.021370897069573402, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 7.63798743719235e-05, "grad_norm": 3.7723464965820312, "learning_rate": 8.456973293768545e-07, "loss": 0.442, "mean_token_accuracy": 0.8574059009552002, "num_tokens": 76338496.0, "step": 1996 }, { "epoch": 0.2540389263452487, "ewc_loss": 0.02138228341937065, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 7.649374310858548e-05, "grad_norm": 3.782928705215454, "learning_rate": 8.461212378126325e-07, "loss": 0.4959, "mean_token_accuracy": 0.8433310985565186, "num_tokens": 76377904.0, "step": 1997 }, { "epoch": 0.2541661366238392, "ewc_loss": 0.021312586963176727, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.640712283318862e-05, "grad_norm": 3.7595813274383545, "learning_rate": 8.465451462484103e-07, "loss": 0.5187, "mean_token_accuracy": 0.8375678658485413, "num_tokens": 76413905.0, "step": 1998 }, { "epoch": 0.2542933469024297, "ewc_loss": 0.02137519232928753, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 7.642281707376242e-05, "grad_norm": 3.8187599182128906, "learning_rate": 8.469690546841882e-07, "loss": 0.4682, "mean_token_accuracy": 0.8504416942596436, "num_tokens": 76451591.0, "step": 1999 }, { "epoch": 0.25442055718102025, "ewc_loss": 0.021362856030464172, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.690980419283733e-05, "grad_norm": 3.7448220252990723, "learning_rate": 8.47392963119966e-07, "loss": 0.4483, "mean_token_accuracy": 0.8569817543029785, "num_tokens": 76486191.0, "step": 2000 }, { "epoch": 0.2545477674596107, "ewc_loss": 0.021310066804289818, "ewc_loss_diag": 1.3649463653564453e-05, "ewc_loss_parallel": 7.638191891601309e-05, "grad_norm": 3.767131805419922, "learning_rate": 8.478168715557439e-07, "loss": 0.4675, "mean_token_accuracy": 0.8532847166061401, "num_tokens": 76523317.0, "step": 2001 }, { "epoch": 0.25467497773820125, "ewc_loss": 0.021422002464532852, "ewc_loss_diag": 1.3709068298339844e-05, "ewc_loss_parallel": 7.689093035878614e-05, "grad_norm": 3.698911666870117, "learning_rate": 8.482407799915217e-07, "loss": 0.4745, "mean_token_accuracy": 0.8487797975540161, "num_tokens": 76562661.0, "step": 2002 }, { "epoch": 0.2548021880167918, "ewc_loss": 0.021419638767838478, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 7.625693979207426e-05, "grad_norm": 3.716140031814575, "learning_rate": 8.486646884272997e-07, "loss": 0.4691, "mean_token_accuracy": 0.8489786386489868, "num_tokens": 76600914.0, "step": 2003 }, { "epoch": 0.25492939829538225, "ewc_loss": 0.021490804851055145, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 7.696859393035993e-05, "grad_norm": 3.781747817993164, "learning_rate": 8.490885968630775e-07, "loss": 0.4896, "mean_token_accuracy": 0.8424465656280518, "num_tokens": 76636310.0, "step": 2004 }, { "epoch": 0.2550566085739728, "ewc_loss": 0.021518917754292488, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 7.724972965661436e-05, "grad_norm": 3.722834587097168, "learning_rate": 8.495125052988555e-07, "loss": 0.4747, "mean_token_accuracy": 0.8524453043937683, "num_tokens": 76680889.0, "step": 2005 }, { "epoch": 0.2551838188525633, "ewc_loss": 0.02148093283176422, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 7.686987373745069e-05, "grad_norm": 3.723426580429077, "learning_rate": 8.499364137346333e-07, "loss": 0.4664, "mean_token_accuracy": 0.8559396266937256, "num_tokens": 76722469.0, "step": 2006 }, { "epoch": 0.2553110291311538, "ewc_loss": 0.02149292454123497, "ewc_loss_diag": 1.3768672943115234e-05, "ewc_loss_parallel": 7.698980334680527e-05, "grad_norm": 3.79524827003479, "learning_rate": 8.503603221704112e-07, "loss": 0.5013, "mean_token_accuracy": 0.84041428565979, "num_tokens": 76757668.0, "step": 2007 }, { "epoch": 0.2554382394097443, "ewc_loss": 0.02160746604204178, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 7.75248627178371e-05, "grad_norm": 3.7267708778381348, "learning_rate": 8.50784230606189e-07, "loss": 0.4627, "mean_token_accuracy": 0.8543096780776978, "num_tokens": 76795607.0, "step": 2008 }, { "epoch": 0.25556544968833483, "ewc_loss": 0.02156089060008526, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 7.705910684308037e-05, "grad_norm": 3.7379984855651855, "learning_rate": 8.512081390419669e-07, "loss": 0.4902, "mean_token_accuracy": 0.853082537651062, "num_tokens": 76837161.0, "step": 2009 }, { "epoch": 0.2556926599669253, "ewc_loss": 0.0216023288667202, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 7.747348718112335e-05, "grad_norm": 3.796152114868164, "learning_rate": 8.516320474777447e-07, "loss": 0.4708, "mean_token_accuracy": 0.8527523875236511, "num_tokens": 76872198.0, "step": 2010 }, { "epoch": 0.25581987024551583, "ewc_loss": 0.021628983318805695, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 7.774003461236134e-05, "grad_norm": 3.751124382019043, "learning_rate": 8.520559559135227e-07, "loss": 0.4364, "mean_token_accuracy": 0.8639516234397888, "num_tokens": 76908976.0, "step": 2011 }, { "epoch": 0.25594708052410636, "ewc_loss": 0.021585620939731598, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 7.730640936642885e-05, "grad_norm": 3.786404609680176, "learning_rate": 8.524798643493005e-07, "loss": 0.4866, "mean_token_accuracy": 0.8497755527496338, "num_tokens": 76947298.0, "step": 2012 }, { "epoch": 0.25607429080269684, "ewc_loss": 0.021639475598931313, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 7.784494664520025e-05, "grad_norm": 3.7242307662963867, "learning_rate": 8.529037727850785e-07, "loss": 0.443, "mean_token_accuracy": 0.8581398129463196, "num_tokens": 76985112.0, "step": 2013 }, { "epoch": 0.25620150108128736, "ewc_loss": 0.021598104387521744, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 7.743122841930017e-05, "grad_norm": 3.7934606075286865, "learning_rate": 8.533276812208563e-07, "loss": 0.4161, "mean_token_accuracy": 0.8666632175445557, "num_tokens": 77020309.0, "step": 2014 }, { "epoch": 0.2563287113598779, "ewc_loss": 0.021633388474583626, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 7.778408325975761e-05, "grad_norm": 3.8047537803649902, "learning_rate": 8.53751589656634e-07, "loss": 0.4547, "mean_token_accuracy": 0.8537817597389221, "num_tokens": 77063245.0, "step": 2015 }, { "epoch": 0.25645592163846836, "ewc_loss": 0.021591320633888245, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 7.736340921837837e-05, "grad_norm": 3.837256669998169, "learning_rate": 8.54175498092412e-07, "loss": 0.4527, "mean_token_accuracy": 0.8561350107192993, "num_tokens": 77096791.0, "step": 2016 }, { "epoch": 0.2565831319170589, "ewc_loss": 0.021632282063364983, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 7.777300925226882e-05, "grad_norm": 3.832249164581299, "learning_rate": 8.545994065281898e-07, "loss": 0.4324, "mean_token_accuracy": 0.8632935285568237, "num_tokens": 77129281.0, "step": 2017 }, { "epoch": 0.2567103421956494, "ewc_loss": 0.021689405664801598, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 7.773390098009259e-05, "grad_norm": 3.799008846282959, "learning_rate": 8.550233149639677e-07, "loss": 0.5191, "mean_token_accuracy": 0.8345320820808411, "num_tokens": 77168785.0, "step": 2018 }, { "epoch": 0.2568375524742399, "ewc_loss": 0.02161143720149994, "ewc_loss_diag": 1.3828277587890625e-05, "ewc_loss_parallel": 7.756457489449531e-05, "grad_norm": 3.8042564392089844, "learning_rate": 8.554472233997456e-07, "loss": 0.4874, "mean_token_accuracy": 0.8421798348426819, "num_tokens": 77208360.0, "step": 2019 }, { "epoch": 0.2569647627528304, "ewc_loss": 0.021680409088730812, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 7.764393376419321e-05, "grad_norm": 3.836918592453003, "learning_rate": 8.558711318355235e-07, "loss": 0.52, "mean_token_accuracy": 0.8339192867279053, "num_tokens": 77245539.0, "step": 2020 }, { "epoch": 0.25709197303142095, "ewc_loss": 0.021701956167817116, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 7.785941124893725e-05, "grad_norm": 3.834444046020508, "learning_rate": 8.562950402713014e-07, "loss": 0.4229, "mean_token_accuracy": 0.8692971467971802, "num_tokens": 77279118.0, "step": 2021 }, { "epoch": 0.2572191833100114, "ewc_loss": 0.021692195907235146, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 7.776179700158536e-05, "grad_norm": 3.8390417098999023, "learning_rate": 8.567189487070793e-07, "loss": 0.5095, "mean_token_accuracy": 0.8414968252182007, "num_tokens": 77316259.0, "step": 2022 }, { "epoch": 0.25734639358860195, "ewc_loss": 0.021763626486063004, "ewc_loss_diag": 1.3947486877441406e-05, "ewc_loss_parallel": 7.78657486080192e-05, "grad_norm": 3.743772029876709, "learning_rate": 8.57142857142857e-07, "loss": 0.5257, "mean_token_accuracy": 0.8351868987083435, "num_tokens": 77359712.0, "step": 2023 }, { "epoch": 0.2574736038671925, "ewc_loss": 0.02174551784992218, "ewc_loss_diag": 1.3947486877441406e-05, "ewc_loss_parallel": 7.768467912683263e-05, "grad_norm": 3.7866568565368652, "learning_rate": 8.57566765578635e-07, "loss": 0.4306, "mean_token_accuracy": 0.8657442331314087, "num_tokens": 77402269.0, "step": 2024 }, { "epoch": 0.25760081414578295, "ewc_loss": 0.02177763357758522, "ewc_loss_diag": 1.3947486877441406e-05, "ewc_loss_parallel": 7.800581806804985e-05, "grad_norm": 3.7249202728271484, "learning_rate": 8.579906740144128e-07, "loss": 0.4597, "mean_token_accuracy": 0.8550251126289368, "num_tokens": 77445791.0, "step": 2025 }, { "epoch": 0.2577280244243735, "ewc_loss": 0.021672571077942848, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 7.756554987281561e-05, "grad_norm": 3.7990283966064453, "learning_rate": 8.584145824501907e-07, "loss": 0.493, "mean_token_accuracy": 0.8414345979690552, "num_tokens": 77485376.0, "step": 2026 }, { "epoch": 0.257855234702964, "ewc_loss": 0.0217520073056221, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 7.835991709725931e-05, "grad_norm": 3.771160125732422, "learning_rate": 8.588384908859686e-07, "loss": 0.4897, "mean_token_accuracy": 0.8473149538040161, "num_tokens": 77524344.0, "step": 2027 }, { "epoch": 0.25798244498155454, "ewc_loss": 0.02170294150710106, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 7.786927017150447e-05, "grad_norm": 3.802759885787964, "learning_rate": 8.592623993217465e-07, "loss": 0.4827, "mean_token_accuracy": 0.8503041863441467, "num_tokens": 77562746.0, "step": 2028 }, { "epoch": 0.258109655260145, "ewc_loss": 0.021753448992967606, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 7.837433076929301e-05, "grad_norm": 3.8537936210632324, "learning_rate": 8.596863077575244e-07, "loss": 0.5048, "mean_token_accuracy": 0.8435417413711548, "num_tokens": 77601261.0, "step": 2029 }, { "epoch": 0.25823686553873554, "ewc_loss": 0.021768078207969666, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 7.852062117308378e-05, "grad_norm": 3.788280963897705, "learning_rate": 8.601102161933023e-07, "loss": 0.4853, "mean_token_accuracy": 0.8469334244728088, "num_tokens": 77640054.0, "step": 2030 }, { "epoch": 0.25836407581732607, "ewc_loss": 0.021719595417380333, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 7.803579501342028e-05, "grad_norm": 3.7972474098205566, "learning_rate": 8.6053412462908e-07, "loss": 0.4597, "mean_token_accuracy": 0.8543341755867004, "num_tokens": 77680098.0, "step": 2031 }, { "epoch": 0.25849128609591654, "ewc_loss": 0.021751057356595993, "ewc_loss_diag": 1.3887882232666016e-05, "ewc_loss_parallel": 7.835041469661519e-05, "grad_norm": 3.877680778503418, "learning_rate": 8.60958033064858e-07, "loss": 0.4548, "mean_token_accuracy": 0.8580957651138306, "num_tokens": 77712517.0, "step": 2032 }, { "epoch": 0.25861849637450707, "ewc_loss": 0.021842321380972862, "ewc_loss_diag": 1.3947486877441406e-05, "ewc_loss_parallel": 7.865270890761167e-05, "grad_norm": 3.8170320987701416, "learning_rate": 8.613819415006358e-07, "loss": 0.5167, "mean_token_accuracy": 0.8443797826766968, "num_tokens": 77751335.0, "step": 2033 }, { "epoch": 0.2587457066530976, "ewc_loss": 0.021803615614771843, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 7.826564979040995e-05, "grad_norm": 3.7975926399230957, "learning_rate": 8.618058499364137e-07, "loss": 0.5041, "mean_token_accuracy": 0.8408324718475342, "num_tokens": 77792300.0, "step": 2034 }, { "epoch": 0.25887291693168807, "ewc_loss": 0.02181241661310196, "ewc_loss_diag": 1.3947486877441406e-05, "ewc_loss_parallel": 7.83536524977535e-05, "grad_norm": 3.7682838439941406, "learning_rate": 8.622297583721916e-07, "loss": 0.4633, "mean_token_accuracy": 0.8565749526023865, "num_tokens": 77836391.0, "step": 2035 }, { "epoch": 0.2590001272102786, "ewc_loss": 0.021804694086313248, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 7.827643275959417e-05, "grad_norm": 3.843454122543335, "learning_rate": 8.626536668079695e-07, "loss": 0.4735, "mean_token_accuracy": 0.8481113910675049, "num_tokens": 77871600.0, "step": 2036 }, { "epoch": 0.2591273374888691, "ewc_loss": 0.0219234861433506, "ewc_loss_diag": 1.4066696166992188e-05, "ewc_loss_parallel": 7.885399827500805e-05, "grad_norm": 4.227754592895508, "learning_rate": 8.630775752437474e-07, "loss": 0.4606, "mean_token_accuracy": 0.853507399559021, "num_tokens": 77904623.0, "step": 2037 }, { "epoch": 0.2592545477674596, "ewc_loss": 0.02201325073838234, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 8.036199142225087e-05, "grad_norm": 3.836561918258667, "learning_rate": 8.635014836795251e-07, "loss": 0.4992, "mean_token_accuracy": 0.8434784412384033, "num_tokens": 77941145.0, "step": 2038 }, { "epoch": 0.2593817580460501, "ewc_loss": 0.0216988455504179, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 7.7217940997798e-05, "grad_norm": 3.818683385848999, "learning_rate": 8.63925392115303e-07, "loss": 0.4366, "mean_token_accuracy": 0.8595172166824341, "num_tokens": 77975087.0, "step": 2039 }, { "epoch": 0.25950896832464065, "ewc_loss": 0.02184484899044037, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 7.867797103244811e-05, "grad_norm": 3.916151762008667, "learning_rate": 8.643493005510809e-07, "loss": 0.4443, "mean_token_accuracy": 0.857455313205719, "num_tokens": 78009526.0, "step": 2040 }, { "epoch": 0.2596361786032311, "ewc_loss": 0.021846938878297806, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 7.869888941058889e-05, "grad_norm": 3.836352586746216, "learning_rate": 8.647732089868588e-07, "loss": 0.522, "mean_token_accuracy": 0.8347944021224976, "num_tokens": 78050918.0, "step": 2041 }, { "epoch": 0.25976338888182166, "ewc_loss": 0.021780669689178467, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 7.803618791513145e-05, "grad_norm": 3.8085391521453857, "learning_rate": 8.651971174226366e-07, "loss": 0.4397, "mean_token_accuracy": 0.8609036207199097, "num_tokens": 78085547.0, "step": 2042 }, { "epoch": 0.2598905991604122, "ewc_loss": 0.021824825555086136, "ewc_loss_diag": 1.4007091522216797e-05, "ewc_loss_parallel": 7.847775123082101e-05, "grad_norm": 3.8615829944610596, "learning_rate": 8.656210258584146e-07, "loss": 0.5079, "mean_token_accuracy": 0.8382959961891174, "num_tokens": 78122916.0, "step": 2043 }, { "epoch": 0.26001780943900266, "ewc_loss": 0.0219741053879261, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 7.874984294176102e-05, "grad_norm": 3.8174023628234863, "learning_rate": 8.660449342941924e-07, "loss": 0.5004, "mean_token_accuracy": 0.8396806120872498, "num_tokens": 78161800.0, "step": 2044 }, { "epoch": 0.2601450197175932, "ewc_loss": 0.02195601537823677, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 7.856894808355719e-05, "grad_norm": 3.7944176197052, "learning_rate": 8.664688427299704e-07, "loss": 0.4793, "mean_token_accuracy": 0.848232090473175, "num_tokens": 78199961.0, "step": 2045 }, { "epoch": 0.2602722299961837, "ewc_loss": 0.021978285163640976, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 7.87916433182545e-05, "grad_norm": 3.8570590019226074, "learning_rate": 8.668927511657481e-07, "loss": 0.4376, "mean_token_accuracy": 0.8596183061599731, "num_tokens": 78231332.0, "step": 2046 }, { "epoch": 0.2603994402747742, "ewc_loss": 0.022007327526807785, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 7.908207044238225e-05, "grad_norm": 3.8494231700897217, "learning_rate": 8.67316659601526e-07, "loss": 0.4936, "mean_token_accuracy": 0.8440585732460022, "num_tokens": 78265054.0, "step": 2047 }, { "epoch": 0.2605266505533647, "ewc_loss": 0.022001653909683228, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 7.902533252490684e-05, "grad_norm": 3.7985591888427734, "learning_rate": 8.677405680373039e-07, "loss": 0.4501, "mean_token_accuracy": 0.859882116317749, "num_tokens": 78305445.0, "step": 2048 }, { "epoch": 0.26065386083195524, "ewc_loss": 0.02199694514274597, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 7.897824980318546e-05, "grad_norm": 3.8826370239257812, "learning_rate": 8.681644764730818e-07, "loss": 0.478, "mean_token_accuracy": 0.8478409647941589, "num_tokens": 78339174.0, "step": 2049 }, { "epoch": 0.2607810711105457, "ewc_loss": 0.022074343636631966, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 7.975222251843661e-05, "grad_norm": 3.762779712677002, "learning_rate": 8.685883849088596e-07, "loss": 0.4219, "mean_token_accuracy": 0.8659641742706299, "num_tokens": 78377630.0, "step": 2050 }, { "epoch": 0.26090828138913624, "ewc_loss": 0.0219790767878294, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 7.879955956013873e-05, "grad_norm": 3.8075270652770996, "learning_rate": 8.690122933446376e-07, "loss": 0.4793, "mean_token_accuracy": 0.8514420986175537, "num_tokens": 78419908.0, "step": 2051 }, { "epoch": 0.26103549166772677, "ewc_loss": 0.022069722414016724, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 7.970601291162893e-05, "grad_norm": 3.8066086769104004, "learning_rate": 8.694362017804154e-07, "loss": 0.4466, "mean_token_accuracy": 0.8592254519462585, "num_tokens": 78459561.0, "step": 2052 }, { "epoch": 0.26116270194631724, "ewc_loss": 0.022028841078281403, "ewc_loss_diag": 1.4126300811767578e-05, "ewc_loss_parallel": 7.929721323307604e-05, "grad_norm": 3.7935352325439453, "learning_rate": 8.698601102161933e-07, "loss": 0.4769, "mean_token_accuracy": 0.847700834274292, "num_tokens": 78502500.0, "step": 2053 }, { "epoch": 0.26128991222490777, "ewc_loss": 0.022110357880592346, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 7.950201688800007e-05, "grad_norm": 3.796067953109741, "learning_rate": 8.702840186519711e-07, "loss": 0.4351, "mean_token_accuracy": 0.8621711134910583, "num_tokens": 78545411.0, "step": 2054 }, { "epoch": 0.2614171225034983, "ewc_loss": 0.022102724760770798, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 7.942567754071206e-05, "grad_norm": 3.7950727939605713, "learning_rate": 8.70707927087749e-07, "loss": 0.4347, "mean_token_accuracy": 0.862309455871582, "num_tokens": 78585878.0, "step": 2055 }, { "epoch": 0.2615443327820888, "ewc_loss": 0.022114146500825882, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 7.953989552333951e-05, "grad_norm": 3.897636890411377, "learning_rate": 8.711318355235269e-07, "loss": 0.5226, "mean_token_accuracy": 0.8331184983253479, "num_tokens": 78621233.0, "step": 2056 }, { "epoch": 0.2616715430606793, "ewc_loss": 0.022179821506142616, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 8.019665256142616e-05, "grad_norm": 3.8733479976654053, "learning_rate": 8.715557439593047e-07, "loss": 0.4435, "mean_token_accuracy": 0.8575609922409058, "num_tokens": 78654644.0, "step": 2057 }, { "epoch": 0.26179875333926983, "ewc_loss": 0.022133316844701767, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 7.973160973051563e-05, "grad_norm": 3.8820745944976807, "learning_rate": 8.719796523950826e-07, "loss": 0.5194, "mean_token_accuracy": 0.8345886468887329, "num_tokens": 78692851.0, "step": 2058 }, { "epoch": 0.2619259636178603, "ewc_loss": 0.022152770310640335, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 7.992614700924605e-05, "grad_norm": 3.812331199645996, "learning_rate": 8.724035608308605e-07, "loss": 0.4142, "mean_token_accuracy": 0.8674956560134888, "num_tokens": 78725329.0, "step": 2059 }, { "epoch": 0.26205317389645083, "ewc_loss": 0.022114217281341553, "ewc_loss_diag": 1.4185905456542969e-05, "ewc_loss_parallel": 7.954061584314331e-05, "grad_norm": 3.847151279449463, "learning_rate": 8.728274692666384e-07, "loss": 0.4702, "mean_token_accuracy": 0.8516594171524048, "num_tokens": 78766645.0, "step": 2060 }, { "epoch": 0.26218038417504136, "ewc_loss": 0.02222302556037903, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 8.001834794413298e-05, "grad_norm": 3.9118518829345703, "learning_rate": 8.732513777024162e-07, "loss": 0.4656, "mean_token_accuracy": 0.853130042552948, "num_tokens": 78801576.0, "step": 2061 }, { "epoch": 0.26230759445363183, "ewc_loss": 0.022231753915548325, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 8.010563033167273e-05, "grad_norm": 3.919050931930542, "learning_rate": 8.736752861381941e-07, "loss": 0.5084, "mean_token_accuracy": 0.8392823338508606, "num_tokens": 78845577.0, "step": 2062 }, { "epoch": 0.26243480473222236, "ewc_loss": 0.022208064794540405, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 7.986873242771253e-05, "grad_norm": 3.789443016052246, "learning_rate": 8.740991945739719e-07, "loss": 0.4725, "mean_token_accuracy": 0.851523220539093, "num_tokens": 78884229.0, "step": 2063 }, { "epoch": 0.2625620150108129, "ewc_loss": 0.0221759881824255, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 7.954797183629125e-05, "grad_norm": 3.8788669109344482, "learning_rate": 8.745231030097499e-07, "loss": 0.4601, "mean_token_accuracy": 0.8541865944862366, "num_tokens": 78926948.0, "step": 2064 }, { "epoch": 0.26268922528940336, "ewc_loss": 0.022231563925743103, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 8.010372403077781e-05, "grad_norm": 3.8192341327667236, "learning_rate": 8.749470114455277e-07, "loss": 0.4365, "mean_token_accuracy": 0.8601830005645752, "num_tokens": 78963582.0, "step": 2065 }, { "epoch": 0.2628164355679939, "ewc_loss": 0.022182483226060867, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 7.961290975799784e-05, "grad_norm": 3.8812878131866455, "learning_rate": 8.753709198813056e-07, "loss": 0.5436, "mean_token_accuracy": 0.8293482661247253, "num_tokens": 79007345.0, "step": 2066 }, { "epoch": 0.2629436458465844, "ewc_loss": 0.022229358553886414, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 8.008167060324922e-05, "grad_norm": 3.7732598781585693, "learning_rate": 8.757948283170835e-07, "loss": 0.4415, "mean_token_accuracy": 0.8619871139526367, "num_tokens": 79049505.0, "step": 2067 }, { "epoch": 0.2630708561251749, "ewc_loss": 0.022162601351737976, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 7.941410876810551e-05, "grad_norm": 3.9352023601531982, "learning_rate": 8.762187367528613e-07, "loss": 0.4366, "mean_token_accuracy": 0.8589944243431091, "num_tokens": 79087419.0, "step": 2068 }, { "epoch": 0.2631980664037654, "ewc_loss": 0.022292446345090866, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 8.071253978414461e-05, "grad_norm": 3.822028875350952, "learning_rate": 8.766426451886392e-07, "loss": 0.4844, "mean_token_accuracy": 0.8446317911148071, "num_tokens": 79131444.0, "step": 2069 }, { "epoch": 0.26332527668235595, "ewc_loss": 0.022161802276968956, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 7.94061052147299e-05, "grad_norm": 3.9069652557373047, "learning_rate": 8.770665536244171e-07, "loss": 0.5108, "mean_token_accuracy": 0.8419607281684875, "num_tokens": 79171976.0, "step": 2070 }, { "epoch": 0.2634524869609464, "ewc_loss": 0.0222613662481308, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 8.040174725465477e-05, "grad_norm": 3.8567304611206055, "learning_rate": 8.774904620601949e-07, "loss": 0.4408, "mean_token_accuracy": 0.8573610186576843, "num_tokens": 79206273.0, "step": 2071 }, { "epoch": 0.26357969723953695, "ewc_loss": 0.022174589335918427, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 7.953398016979918e-05, "grad_norm": 3.862912893295288, "learning_rate": 8.779143704959729e-07, "loss": 0.4397, "mean_token_accuracy": 0.8604657649993896, "num_tokens": 79245564.0, "step": 2072 }, { "epoch": 0.2637069075181275, "ewc_loss": 0.0222189798951149, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 7.997789361979812e-05, "grad_norm": 3.8573498725891113, "learning_rate": 8.783382789317507e-07, "loss": 0.4732, "mean_token_accuracy": 0.8520289659500122, "num_tokens": 79281857.0, "step": 2073 }, { "epoch": 0.26383411779671795, "ewc_loss": 0.022182215005159378, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 7.961022492963821e-05, "grad_norm": 3.862767219543457, "learning_rate": 8.787621873675286e-07, "loss": 0.5047, "mean_token_accuracy": 0.8363315463066101, "num_tokens": 79312482.0, "step": 2074 }, { "epoch": 0.2639613280753085, "ewc_loss": 0.02223106287419796, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 8.009871817193925e-05, "grad_norm": 3.878970146179199, "learning_rate": 8.791860958033065e-07, "loss": 0.4447, "mean_token_accuracy": 0.8597589731216431, "num_tokens": 79348157.0, "step": 2075 }, { "epoch": 0.264088538353899, "ewc_loss": 0.02223421260714531, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 8.013021579245105e-05, "grad_norm": 3.7812306880950928, "learning_rate": 8.796100042390842e-07, "loss": 0.4315, "mean_token_accuracy": 0.863528847694397, "num_tokens": 79387540.0, "step": 2076 }, { "epoch": 0.2642157486324895, "ewc_loss": 0.022199511528015137, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 7.97832035459578e-05, "grad_norm": 3.9171857833862305, "learning_rate": 8.800339126748622e-07, "loss": 0.4472, "mean_token_accuracy": 0.8567633628845215, "num_tokens": 79421424.0, "step": 2077 }, { "epoch": 0.26434295891108, "ewc_loss": 0.022314395755529404, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 8.09320408734493e-05, "grad_norm": 3.7961878776550293, "learning_rate": 8.8045782111064e-07, "loss": 0.5172, "mean_token_accuracy": 0.8365920782089233, "num_tokens": 79458733.0, "step": 2078 }, { "epoch": 0.26447016918967053, "ewc_loss": 0.02221551537513733, "ewc_loss_diag": 1.424551010131836e-05, "ewc_loss_parallel": 7.994324550963938e-05, "grad_norm": 3.8355634212493896, "learning_rate": 8.808817295464179e-07, "loss": 0.4699, "mean_token_accuracy": 0.8520718812942505, "num_tokens": 79495394.0, "step": 2079 }, { "epoch": 0.26459737946826106, "ewc_loss": 0.022445783019065857, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.102520951069891e-05, "grad_norm": 3.9731831550598145, "learning_rate": 8.813056379821958e-07, "loss": 0.442, "mean_token_accuracy": 0.8608818054199219, "num_tokens": 79529799.0, "step": 2080 }, { "epoch": 0.26472458974685154, "ewc_loss": 0.022473860532045364, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.130598871503025e-05, "grad_norm": 3.818678617477417, "learning_rate": 8.817295464179737e-07, "loss": 0.4676, "mean_token_accuracy": 0.8516666293144226, "num_tokens": 79564664.0, "step": 2081 }, { "epoch": 0.26485180002544206, "ewc_loss": 0.022375069558620453, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.031808829400688e-05, "grad_norm": 3.895378589630127, "learning_rate": 8.821534548537515e-07, "loss": 0.5448, "mean_token_accuracy": 0.8266408443450928, "num_tokens": 79606104.0, "step": 2082 }, { "epoch": 0.2649790103040326, "ewc_loss": 0.022441888228058815, "ewc_loss_diag": 1.430511474609375e-05, "ewc_loss_parallel": 8.159661229001358e-05, "grad_norm": 3.8008222579956055, "learning_rate": 8.825773632895295e-07, "loss": 0.4416, "mean_token_accuracy": 0.8602941036224365, "num_tokens": 79647688.0, "step": 2083 }, { "epoch": 0.26510622058262306, "ewc_loss": 0.022400468587875366, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.05720774224028e-05, "grad_norm": 3.866617441177368, "learning_rate": 8.830012717253072e-07, "loss": 0.459, "mean_token_accuracy": 0.857244610786438, "num_tokens": 79685500.0, "step": 2084 }, { "epoch": 0.2652334308612136, "ewc_loss": 0.02250487171113491, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.161610458046198e-05, "grad_norm": 3.7873826026916504, "learning_rate": 8.834251801610852e-07, "loss": 0.4868, "mean_token_accuracy": 0.8457978963851929, "num_tokens": 79729912.0, "step": 2085 }, { "epoch": 0.2653606411398041, "ewc_loss": 0.022426126524806023, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.082864951575175e-05, "grad_norm": 3.8344693183898926, "learning_rate": 8.83849088596863e-07, "loss": 0.4424, "mean_token_accuracy": 0.8600050210952759, "num_tokens": 79767487.0, "step": 2086 }, { "epoch": 0.2654878514183946, "ewc_loss": 0.022512460127472878, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.16919855424203e-05, "grad_norm": 3.815694570541382, "learning_rate": 8.842729970326409e-07, "loss": 0.4255, "mean_token_accuracy": 0.8668599128723145, "num_tokens": 79804802.0, "step": 2087 }, { "epoch": 0.2656150616969851, "ewc_loss": 0.02245963178575039, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.116370008792728e-05, "grad_norm": 3.862992525100708, "learning_rate": 8.846969054684188e-07, "loss": 0.5412, "mean_token_accuracy": 0.8315348029136658, "num_tokens": 79845685.0, "step": 2088 }, { "epoch": 0.26574227197557565, "ewc_loss": 0.02252158336341381, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.178321877494454e-05, "grad_norm": 3.8469619750976562, "learning_rate": 8.851208139041967e-07, "loss": 0.4302, "mean_token_accuracy": 0.8623889684677124, "num_tokens": 79884692.0, "step": 2089 }, { "epoch": 0.2658694822541661, "ewc_loss": 0.022488202899694443, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.144941966747865e-05, "grad_norm": 3.9373764991760254, "learning_rate": 8.855447223399745e-07, "loss": 0.489, "mean_token_accuracy": 0.8429102897644043, "num_tokens": 79919780.0, "step": 2090 }, { "epoch": 0.26599669253275665, "ewc_loss": 0.022577518597245216, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.234256529249251e-05, "grad_norm": 3.8820080757141113, "learning_rate": 8.859686307757524e-07, "loss": 0.5151, "mean_token_accuracy": 0.8374239206314087, "num_tokens": 79958317.0, "step": 2091 }, { "epoch": 0.2661239028113472, "ewc_loss": 0.022488676011562347, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.145414176397026e-05, "grad_norm": 3.8812742233276367, "learning_rate": 8.863925392115302e-07, "loss": 0.4488, "mean_token_accuracy": 0.8555718660354614, "num_tokens": 79994126.0, "step": 2092 }, { "epoch": 0.26625111308993765, "ewc_loss": 0.022534560412168503, "ewc_loss_diag": 1.436471939086914e-05, "ewc_loss_parallel": 8.19129854789935e-05, "grad_norm": 3.80366849899292, "learning_rate": 8.868164476473082e-07, "loss": 0.4173, "mean_token_accuracy": 0.8675848841667175, "num_tokens": 80033157.0, "step": 2093 }, { "epoch": 0.2663783233685282, "ewc_loss": 0.022611163556575775, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.14583181636408e-05, "grad_norm": 3.8789632320404053, "learning_rate": 8.87240356083086e-07, "loss": 0.4472, "mean_token_accuracy": 0.8586791157722473, "num_tokens": 80073354.0, "step": 2094 }, { "epoch": 0.2665055336471187, "ewc_loss": 0.022660546004772186, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.195213013095781e-05, "grad_norm": 3.8789284229278564, "learning_rate": 8.876642645188639e-07, "loss": 0.5082, "mean_token_accuracy": 0.837816596031189, "num_tokens": 80111819.0, "step": 2095 }, { "epoch": 0.2666327439257092, "ewc_loss": 0.022635992616415024, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.170659566530958e-05, "grad_norm": 3.839923858642578, "learning_rate": 8.880881729546418e-07, "loss": 0.4214, "mean_token_accuracy": 0.8657884001731873, "num_tokens": 80147951.0, "step": 2096 }, { "epoch": 0.2667599542042997, "ewc_loss": 0.022641614079475403, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.176280971383676e-05, "grad_norm": 3.8834991455078125, "learning_rate": 8.885120813904197e-07, "loss": 0.437, "mean_token_accuracy": 0.8610535860061646, "num_tokens": 80191319.0, "step": 2097 }, { "epoch": 0.26688716448289024, "ewc_loss": 0.02267095446586609, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.205621998058632e-05, "grad_norm": 3.9255599975585938, "learning_rate": 8.889359898261976e-07, "loss": 0.5235, "mean_token_accuracy": 0.8378096222877502, "num_tokens": 80226300.0, "step": 2098 }, { "epoch": 0.2670143747614807, "ewc_loss": 0.022637151181697845, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.171818626578897e-05, "grad_norm": 3.8282783031463623, "learning_rate": 8.893598982619753e-07, "loss": 0.4489, "mean_token_accuracy": 0.8574725389480591, "num_tokens": 80265465.0, "step": 2099 }, { "epoch": 0.26714158504007124, "ewc_loss": 0.022605065256357193, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.139734563883394e-05, "grad_norm": 3.8453786373138428, "learning_rate": 8.897838066977532e-07, "loss": 0.4523, "mean_token_accuracy": 0.8570343255996704, "num_tokens": 80312696.0, "step": 2100 }, { "epoch": 0.26726879531866177, "ewc_loss": 0.022647617384791374, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.1822850916069e-05, "grad_norm": 3.8194472789764404, "learning_rate": 8.902077151335311e-07, "loss": 0.4706, "mean_token_accuracy": 0.8490503430366516, "num_tokens": 80353507.0, "step": 2101 }, { "epoch": 0.26739600559725224, "ewc_loss": 0.022592755034565926, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.127422916004434e-05, "grad_norm": 3.8975143432617188, "learning_rate": 8.90631623569309e-07, "loss": 0.5541, "mean_token_accuracy": 0.8230056762695312, "num_tokens": 80390795.0, "step": 2102 }, { "epoch": 0.26752321587584277, "ewc_loss": 0.02267533168196678, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.210000669350848e-05, "grad_norm": 3.853947877883911, "learning_rate": 8.910555320050868e-07, "loss": 0.4642, "mean_token_accuracy": 0.8532116413116455, "num_tokens": 80427599.0, "step": 2103 }, { "epoch": 0.2676504261544333, "ewc_loss": 0.022634610533714294, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.169278589775786e-05, "grad_norm": 3.87447190284729, "learning_rate": 8.914794404408648e-07, "loss": 0.4668, "mean_token_accuracy": 0.8525928258895874, "num_tokens": 80468084.0, "step": 2104 }, { "epoch": 0.26777763643302377, "ewc_loss": 0.022669605910778046, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.204273035516962e-05, "grad_norm": 3.825944662094116, "learning_rate": 8.919033488766426e-07, "loss": 0.4102, "mean_token_accuracy": 0.8698329925537109, "num_tokens": 80507218.0, "step": 2105 }, { "epoch": 0.2679048467116143, "ewc_loss": 0.022629886865615845, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.16455576568842e-05, "grad_norm": 3.9205777645111084, "learning_rate": 8.923272573124204e-07, "loss": 0.4949, "mean_token_accuracy": 0.8447911739349365, "num_tokens": 80541730.0, "step": 2106 }, { "epoch": 0.2680320569902048, "ewc_loss": 0.02272326499223709, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.257933950517327e-05, "grad_norm": 3.8284404277801514, "learning_rate": 8.927511657481983e-07, "loss": 0.5058, "mean_token_accuracy": 0.8392128348350525, "num_tokens": 80584677.0, "step": 2107 }, { "epoch": 0.2681592672687953, "ewc_loss": 0.02263512834906578, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.169795910362154e-05, "grad_norm": 3.8702919483184814, "learning_rate": 8.931750741839762e-07, "loss": 0.4395, "mean_token_accuracy": 0.8621385097503662, "num_tokens": 80616332.0, "step": 2108 }, { "epoch": 0.2682864775473858, "ewc_loss": 0.022731052711606026, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.265720680356026e-05, "grad_norm": 3.8520476818084717, "learning_rate": 8.935989826197541e-07, "loss": 0.4499, "mean_token_accuracy": 0.859650194644928, "num_tokens": 80654435.0, "step": 2109 }, { "epoch": 0.26841368782597635, "ewc_loss": 0.022670861333608627, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.205528138205409e-05, "grad_norm": 3.867424249649048, "learning_rate": 8.94022891055532e-07, "loss": 0.4259, "mean_token_accuracy": 0.8676767945289612, "num_tokens": 80691246.0, "step": 2110 }, { "epoch": 0.2685408981045668, "ewc_loss": 0.022703221067786217, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.237889414886013e-05, "grad_norm": 3.832871675491333, "learning_rate": 8.944467994913098e-07, "loss": 0.4772, "mean_token_accuracy": 0.8453466296195984, "num_tokens": 80730510.0, "step": 2111 }, { "epoch": 0.26866810838315736, "ewc_loss": 0.022690128535032272, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.22479632915929e-05, "grad_norm": 3.831672191619873, "learning_rate": 8.948707079270878e-07, "loss": 0.3958, "mean_token_accuracy": 0.8739468455314636, "num_tokens": 80769369.0, "step": 2112 }, { "epoch": 0.2687953186617479, "ewc_loss": 0.022708814591169357, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.243483898695558e-05, "grad_norm": 3.9144022464752197, "learning_rate": 8.952946163628656e-07, "loss": 0.4503, "mean_token_accuracy": 0.8569314479827881, "num_tokens": 80802168.0, "step": 2113 }, { "epoch": 0.26892252894033836, "ewc_loss": 0.02275652438402176, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.291192352771759e-05, "grad_norm": 3.822873830795288, "learning_rate": 8.957185247986434e-07, "loss": 0.4017, "mean_token_accuracy": 0.8719989061355591, "num_tokens": 80839389.0, "step": 2114 }, { "epoch": 0.2690497392189289, "ewc_loss": 0.02269400842487812, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.228676597354934e-05, "grad_norm": 3.8852834701538086, "learning_rate": 8.961424332344213e-07, "loss": 0.4571, "mean_token_accuracy": 0.8562874794006348, "num_tokens": 80881139.0, "step": 2115 }, { "epoch": 0.2691769494975194, "ewc_loss": 0.022755052894353867, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.289721881737933e-05, "grad_norm": 3.8903565406799316, "learning_rate": 8.965663416701992e-07, "loss": 0.458, "mean_token_accuracy": 0.852860689163208, "num_tokens": 80919809.0, "step": 2116 }, { "epoch": 0.2693041597761099, "ewc_loss": 0.022749487310647964, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.284156501758844e-05, "grad_norm": 3.873537540435791, "learning_rate": 8.969902501059771e-07, "loss": 0.4334, "mean_token_accuracy": 0.8642832636833191, "num_tokens": 80956860.0, "step": 2117 }, { "epoch": 0.2694313700547004, "ewc_loss": 0.0227481909096241, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.282859926111996e-05, "grad_norm": 3.8519961833953857, "learning_rate": 8.97414158541755e-07, "loss": 0.4016, "mean_token_accuracy": 0.8724185228347778, "num_tokens": 80994322.0, "step": 2118 }, { "epoch": 0.26955858033329094, "ewc_loss": 0.022868763655424118, "ewc_loss_diag": 1.4603137969970703e-05, "ewc_loss_parallel": 8.281361078843474e-05, "grad_norm": 4.2006940841674805, "learning_rate": 8.978380669775328e-07, "loss": 0.4845, "mean_token_accuracy": 0.846515417098999, "num_tokens": 81032792.0, "step": 2119 }, { "epoch": 0.2696857906118814, "ewc_loss": 0.022920861840248108, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.455530041828752e-05, "grad_norm": 3.8774356842041016, "learning_rate": 8.982619754133107e-07, "loss": 0.4839, "mean_token_accuracy": 0.8480483293533325, "num_tokens": 81070011.0, "step": 2120 }, { "epoch": 0.26981300089047194, "ewc_loss": 0.02260671555995941, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.141383295878768e-05, "grad_norm": 3.846879720687866, "learning_rate": 8.986858838490886e-07, "loss": 0.4864, "mean_token_accuracy": 0.8462806344032288, "num_tokens": 81113145.0, "step": 2121 }, { "epoch": 0.26994021116906247, "ewc_loss": 0.02273612655699253, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.270794933196157e-05, "grad_norm": 3.817748546600342, "learning_rate": 8.991097922848663e-07, "loss": 0.4539, "mean_token_accuracy": 0.8567226529121399, "num_tokens": 81154175.0, "step": 2122 }, { "epoch": 0.27006742144765294, "ewc_loss": 0.022673852741718292, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.208521467167884e-05, "grad_norm": 3.8830056190490723, "learning_rate": 8.995337007206443e-07, "loss": 0.4789, "mean_token_accuracy": 0.8502870798110962, "num_tokens": 81193319.0, "step": 2123 }, { "epoch": 0.2701946317262435, "ewc_loss": 0.022731952369213104, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.266621443908662e-05, "grad_norm": 3.855724573135376, "learning_rate": 8.999576091564221e-07, "loss": 0.4466, "mean_token_accuracy": 0.859062671661377, "num_tokens": 81229057.0, "step": 2124 }, { "epoch": 0.270321842004834, "ewc_loss": 0.02269759029150009, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.232258551288396e-05, "grad_norm": 3.9053473472595215, "learning_rate": 9.003815175922001e-07, "loss": 0.4545, "mean_token_accuracy": 0.8558582067489624, "num_tokens": 81262380.0, "step": 2125 }, { "epoch": 0.2704490522834245, "ewc_loss": 0.022755209356546402, "ewc_loss_diag": 1.4483928680419922e-05, "ewc_loss_parallel": 8.289876859635115e-05, "grad_norm": 3.8154447078704834, "learning_rate": 9.008054260279779e-07, "loss": 0.4594, "mean_token_accuracy": 0.8551337122917175, "num_tokens": 81304080.0, "step": 2126 }, { "epoch": 0.270576262562015, "ewc_loss": 0.022768860682845116, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 8.242493640864268e-05, "grad_norm": 3.9496729373931885, "learning_rate": 9.012293344637558e-07, "loss": 0.4885, "mean_token_accuracy": 0.8461148738861084, "num_tokens": 81340226.0, "step": 2127 }, { "epoch": 0.27070347284060553, "ewc_loss": 0.02287915349006653, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 8.352786971954629e-05, "grad_norm": 3.8536195755004883, "learning_rate": 9.016532428995337e-07, "loss": 0.4357, "mean_token_accuracy": 0.8612312078475952, "num_tokens": 81378046.0, "step": 2128 }, { "epoch": 0.27083068311919606, "ewc_loss": 0.022788910195231438, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 8.262543269665912e-05, "grad_norm": 3.8161983489990234, "learning_rate": 9.020771513353115e-07, "loss": 0.4803, "mean_token_accuracy": 0.8483039140701294, "num_tokens": 81419486.0, "step": 2129 }, { "epoch": 0.27095789339778653, "ewc_loss": 0.022836629301309586, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 8.310263365274295e-05, "grad_norm": 3.938307762145996, "learning_rate": 9.025010597710894e-07, "loss": 0.4786, "mean_token_accuracy": 0.8485695123672485, "num_tokens": 81457488.0, "step": 2130 }, { "epoch": 0.27108510367637706, "ewc_loss": 0.022870786488056183, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 8.344418165506795e-05, "grad_norm": 3.8293275833129883, "learning_rate": 9.029249682068673e-07, "loss": 0.4472, "mean_token_accuracy": 0.8579704165458679, "num_tokens": 81494812.0, "step": 2131 }, { "epoch": 0.2712123139549676, "ewc_loss": 0.02283013053238392, "ewc_loss_diag": 1.4543533325195312e-05, "ewc_loss_parallel": 8.303763752337545e-05, "grad_norm": 3.890385150909424, "learning_rate": 9.033488766426451e-07, "loss": 0.4631, "mean_token_accuracy": 0.8516029715538025, "num_tokens": 81531421.0, "step": 2132 }, { "epoch": 0.27133952423355806, "ewc_loss": 0.023156102746725082, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 8.385594992432743e-05, "grad_norm": 3.908942222595215, "learning_rate": 9.037727850784231e-07, "loss": 0.4305, "mean_token_accuracy": 0.8613125085830688, "num_tokens": 81567221.0, "step": 2133 }, { "epoch": 0.2714667345121486, "ewc_loss": 0.022995194420218468, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 8.346756658283994e-05, "grad_norm": 3.9060044288635254, "learning_rate": 9.041966935142009e-07, "loss": 0.4657, "mean_token_accuracy": 0.8527487516403198, "num_tokens": 81605832.0, "step": 2134 }, { "epoch": 0.2715939447907391, "ewc_loss": 0.023008663207292557, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 8.360225183423609e-05, "grad_norm": 3.8334717750549316, "learning_rate": 9.046206019499788e-07, "loss": 0.42, "mean_token_accuracy": 0.8688420653343201, "num_tokens": 81646919.0, "step": 2135 }, { "epoch": 0.2717211550693296, "ewc_loss": 0.022986408323049545, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 8.337970211869106e-05, "grad_norm": 3.906717300415039, "learning_rate": 9.050445103857567e-07, "loss": 0.4974, "mean_token_accuracy": 0.8469276428222656, "num_tokens": 81687214.0, "step": 2136 }, { "epoch": 0.2718483653479201, "ewc_loss": 0.023035209625959396, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 8.386770787183195e-05, "grad_norm": 3.8894612789154053, "learning_rate": 9.054684188215344e-07, "loss": 0.4034, "mean_token_accuracy": 0.8736447095870972, "num_tokens": 81721437.0, "step": 2137 }, { "epoch": 0.27197557562651065, "ewc_loss": 0.023030992597341537, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 8.382555097341537e-05, "grad_norm": 3.982391119003296, "learning_rate": 9.058923272573124e-07, "loss": 0.4779, "mean_token_accuracy": 0.8482301235198975, "num_tokens": 81756595.0, "step": 2138 }, { "epoch": 0.2721027859051011, "ewc_loss": 0.02309029921889305, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 8.441860700258985e-05, "grad_norm": 3.8178677558898926, "learning_rate": 9.063162356930902e-07, "loss": 0.4414, "mean_token_accuracy": 0.8609009981155396, "num_tokens": 81796198.0, "step": 2139 }, { "epoch": 0.27222999618369165, "ewc_loss": 0.023002609610557556, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 8.354171586688608e-05, "grad_norm": 3.9293456077575684, "learning_rate": 9.067401441288681e-07, "loss": 0.4729, "mean_token_accuracy": 0.8537094593048096, "num_tokens": 81833002.0, "step": 2140 }, { "epoch": 0.2723572064622822, "ewc_loss": 0.023132018744945526, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 8.483581768814474e-05, "grad_norm": 3.905874490737915, "learning_rate": 9.07164052564646e-07, "loss": 0.445, "mean_token_accuracy": 0.8549309968948364, "num_tokens": 81872089.0, "step": 2141 }, { "epoch": 0.27248441674087265, "ewc_loss": 0.023064177483320236, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 8.415740012424067e-05, "grad_norm": 3.891820192337036, "learning_rate": 9.075879610004239e-07, "loss": 0.4813, "mean_token_accuracy": 0.8468801379203796, "num_tokens": 81908356.0, "step": 2142 }, { "epoch": 0.2726116270194632, "ewc_loss": 0.02310204692184925, "ewc_loss_diag": 1.4662742614746094e-05, "ewc_loss_parallel": 8.453609189018607e-05, "grad_norm": 3.868046283721924, "learning_rate": 9.080118694362017e-07, "loss": 0.4848, "mean_token_accuracy": 0.8452328443527222, "num_tokens": 81950251.0, "step": 2143 }, { "epoch": 0.2727388372980537, "ewc_loss": 0.02321801707148552, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 8.447509753750637e-05, "grad_norm": 3.9291584491729736, "learning_rate": 9.084357778719796e-07, "loss": 0.4765, "mean_token_accuracy": 0.8472824692726135, "num_tokens": 81987157.0, "step": 2144 }, { "epoch": 0.2728660475766442, "ewc_loss": 0.02327200025320053, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 8.501491538481787e-05, "grad_norm": 3.9542529582977295, "learning_rate": 9.088596863077574e-07, "loss": 0.4613, "mean_token_accuracy": 0.8571749329566956, "num_tokens": 82023633.0, "step": 2145 }, { "epoch": 0.2729932578552347, "ewc_loss": 0.023255743086338043, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 8.48523632157594e-05, "grad_norm": 3.9404139518737793, "learning_rate": 9.092835947435354e-07, "loss": 0.463, "mean_token_accuracy": 0.8496716022491455, "num_tokens": 82061311.0, "step": 2146 }, { "epoch": 0.27312046813382523, "ewc_loss": 0.02325398102402687, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 8.483474084641784e-05, "grad_norm": 3.926835298538208, "learning_rate": 9.097075031793132e-07, "loss": 0.4994, "mean_token_accuracy": 0.8425883650779724, "num_tokens": 82098370.0, "step": 2147 }, { "epoch": 0.2732476784124157, "ewc_loss": 0.023257222026586533, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 8.486715523758903e-05, "grad_norm": 3.920835018157959, "learning_rate": 9.101314116150911e-07, "loss": 0.4267, "mean_token_accuracy": 0.8690526485443115, "num_tokens": 82134931.0, "step": 2148 }, { "epoch": 0.27337488869100623, "ewc_loss": 0.023261377587914467, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 8.49086936796084e-05, "grad_norm": 3.920774459838867, "learning_rate": 9.10555320050869e-07, "loss": 0.4363, "mean_token_accuracy": 0.8621522784233093, "num_tokens": 82171391.0, "step": 2149 }, { "epoch": 0.27350209896959676, "ewc_loss": 0.023265302181243896, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 8.494794747093692e-05, "grad_norm": 3.8533382415771484, "learning_rate": 9.109792284866469e-07, "loss": 0.4552, "mean_token_accuracy": 0.8592942953109741, "num_tokens": 82214064.0, "step": 2150 }, { "epoch": 0.27362930924818724, "ewc_loss": 0.023347988724708557, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.45541144371964e-05, "grad_norm": 3.898993492126465, "learning_rate": 9.114031369224247e-07, "loss": 0.47, "mean_token_accuracy": 0.8513337969779968, "num_tokens": 82252774.0, "step": 2151 }, { "epoch": 0.27375651952677776, "ewc_loss": 0.02340751886367798, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.514940418535843e-05, "grad_norm": 3.955472707748413, "learning_rate": 9.118270453582026e-07, "loss": 0.5256, "mean_token_accuracy": 0.8385179042816162, "num_tokens": 82291645.0, "step": 2152 }, { "epoch": 0.2738837298053683, "ewc_loss": 0.02341117337346077, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.518595132045448e-05, "grad_norm": 3.9083364009857178, "learning_rate": 9.122509537939804e-07, "loss": 0.4585, "mean_token_accuracy": 0.8499693870544434, "num_tokens": 82333643.0, "step": 2153 }, { "epoch": 0.27401094008395877, "ewc_loss": 0.023367103189229965, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.474523929180577e-05, "grad_norm": 3.9267194271087646, "learning_rate": 9.126748622297584e-07, "loss": 0.4966, "mean_token_accuracy": 0.8407802581787109, "num_tokens": 82370792.0, "step": 2154 }, { "epoch": 0.2741381503625493, "ewc_loss": 0.023421162739396095, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.528585021849722e-05, "grad_norm": 3.935349702835083, "learning_rate": 9.130987706655362e-07, "loss": 0.4853, "mean_token_accuracy": 0.8455802202224731, "num_tokens": 82408626.0, "step": 2155 }, { "epoch": 0.2742653606411398, "ewc_loss": 0.023279406130313873, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 8.508898463333026e-05, "grad_norm": 3.9062352180480957, "learning_rate": 9.135226791013141e-07, "loss": 0.5051, "mean_token_accuracy": 0.8462569117546082, "num_tokens": 82448401.0, "step": 2156 }, { "epoch": 0.2743925709197303, "ewc_loss": 0.02326372265815735, "ewc_loss_diag": 1.4781951904296875e-05, "ewc_loss_parallel": 8.493215136695653e-05, "grad_norm": 4.043966770172119, "learning_rate": 9.13946587537092e-07, "loss": 0.4633, "mean_token_accuracy": 0.8492324948310852, "num_tokens": 82476644.0, "step": 2157 }, { "epoch": 0.2745197811983208, "ewc_loss": 0.023477505892515182, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.584927854826674e-05, "grad_norm": 3.9335062503814697, "learning_rate": 9.143704959728699e-07, "loss": 0.4771, "mean_token_accuracy": 0.8454188108444214, "num_tokens": 82512722.0, "step": 2158 }, { "epoch": 0.27464699147691135, "ewc_loss": 0.023387014865875244, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.494437497574836e-05, "grad_norm": 3.985196113586426, "learning_rate": 9.147944044086476e-07, "loss": 0.4959, "mean_token_accuracy": 0.843684196472168, "num_tokens": 82545393.0, "step": 2159 }, { "epoch": 0.2747742017555018, "ewc_loss": 0.023461708799004555, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.569130295654759e-05, "grad_norm": 3.883601188659668, "learning_rate": 9.152183128444255e-07, "loss": 0.4944, "mean_token_accuracy": 0.8491201996803284, "num_tokens": 82586329.0, "step": 2160 }, { "epoch": 0.27490141203409235, "ewc_loss": 0.023368945345282555, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.476367656840011e-05, "grad_norm": 3.9585657119750977, "learning_rate": 9.156422212802034e-07, "loss": 0.5017, "mean_token_accuracy": 0.8381038308143616, "num_tokens": 82625563.0, "step": 2161 }, { "epoch": 0.2750286223126829, "ewc_loss": 0.02349291555583477, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.600337605457753e-05, "grad_norm": 3.9484856128692627, "learning_rate": 9.160661297159813e-07, "loss": 0.4399, "mean_token_accuracy": 0.8606629967689514, "num_tokens": 82660030.0, "step": 2162 }, { "epoch": 0.27515583259127335, "ewc_loss": 0.02342657372355461, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.533995423931628e-05, "grad_norm": 3.937272310256958, "learning_rate": 9.164900381517592e-07, "loss": 0.4609, "mean_token_accuracy": 0.8541624546051025, "num_tokens": 82698794.0, "step": 2163 }, { "epoch": 0.2752830428698639, "ewc_loss": 0.023446425795555115, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.553846419090405e-05, "grad_norm": 3.8895413875579834, "learning_rate": 9.16913946587537e-07, "loss": 0.4613, "mean_token_accuracy": 0.8513514995574951, "num_tokens": 82740015.0, "step": 2164 }, { "epoch": 0.2754102531484544, "ewc_loss": 0.023435387760400772, "ewc_loss_diag": 1.4901161193847656e-05, "ewc_loss_parallel": 8.542810974176973e-05, "grad_norm": 4.00154447555542, "learning_rate": 9.17337855023315e-07, "loss": 0.4738, "mean_token_accuracy": 0.8494440317153931, "num_tokens": 82775765.0, "step": 2165 }, { "epoch": 0.2755374634270449, "ewc_loss": 0.023569881916046143, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 8.616269042249769e-05, "grad_norm": 3.9270057678222656, "learning_rate": 9.177617634590928e-07, "loss": 0.4258, "mean_token_accuracy": 0.8651382923126221, "num_tokens": 82814324.0, "step": 2166 }, { "epoch": 0.2756646737056354, "ewc_loss": 0.02349073253571987, "ewc_loss_diag": 1.4960765838623047e-05, "ewc_loss_parallel": 8.537119720131159e-05, "grad_norm": 3.935455560684204, "learning_rate": 9.181856718948706e-07, "loss": 0.4127, "mean_token_accuracy": 0.8677849769592285, "num_tokens": 82851007.0, "step": 2167 }, { "epoch": 0.27579188398422594, "ewc_loss": 0.023652108386158943, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.576424443162978e-05, "grad_norm": 3.950770378112793, "learning_rate": 9.186095803306485e-07, "loss": 0.455, "mean_token_accuracy": 0.8558679819107056, "num_tokens": 82886846.0, "step": 2168 }, { "epoch": 0.2759190942628164, "ewc_loss": 0.023643698543310165, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.568014163756743e-05, "grad_norm": 3.944934606552124, "learning_rate": 9.190334887664264e-07, "loss": 0.4947, "mean_token_accuracy": 0.8441987037658691, "num_tokens": 82929607.0, "step": 2169 }, { "epoch": 0.27604630454140694, "ewc_loss": 0.023639172315597534, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.563489245716482e-05, "grad_norm": 4.015190124511719, "learning_rate": 9.194573972022043e-07, "loss": 0.5508, "mean_token_accuracy": 0.833034098148346, "num_tokens": 82965089.0, "step": 2170 }, { "epoch": 0.27617351481999747, "ewc_loss": 0.02369193360209465, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.616250852355734e-05, "grad_norm": 3.9120988845825195, "learning_rate": 9.198813056379822e-07, "loss": 0.5258, "mean_token_accuracy": 0.8335393071174622, "num_tokens": 83008345.0, "step": 2171 }, { "epoch": 0.27630072509858794, "ewc_loss": 0.023621857166290283, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.546172466594726e-05, "grad_norm": 3.968395948410034, "learning_rate": 9.2030521407376e-07, "loss": 0.4807, "mean_token_accuracy": 0.8452327251434326, "num_tokens": 83043763.0, "step": 2172 }, { "epoch": 0.27642793537717847, "ewc_loss": 0.023699482902884483, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.623799658380449e-05, "grad_norm": 3.893094301223755, "learning_rate": 9.20729122509538e-07, "loss": 0.4242, "mean_token_accuracy": 0.8649394512176514, "num_tokens": 83081389.0, "step": 2173 }, { "epoch": 0.276555145655769, "ewc_loss": 0.02364177815616131, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.566094766138121e-05, "grad_norm": 3.9561078548431396, "learning_rate": 9.211530309453158e-07, "loss": 0.4636, "mean_token_accuracy": 0.8517906665802002, "num_tokens": 83120547.0, "step": 2174 }, { "epoch": 0.27668235593435947, "ewc_loss": 0.023714136332273483, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.638451981823891e-05, "grad_norm": 3.9791789054870605, "learning_rate": 9.215769393810936e-07, "loss": 0.4683, "mean_token_accuracy": 0.8536033630371094, "num_tokens": 83157229.0, "step": 2175 }, { "epoch": 0.27680956621295, "ewc_loss": 0.023695718497037888, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.620035805506632e-05, "grad_norm": 3.9558372497558594, "learning_rate": 9.220008478168715e-07, "loss": 0.5493, "mean_token_accuracy": 0.8274590373039246, "num_tokens": 83198217.0, "step": 2176 }, { "epoch": 0.2769367764915405, "ewc_loss": 0.023679934442043304, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.604252070654184e-05, "grad_norm": 4.004638671875, "learning_rate": 9.224247562526494e-07, "loss": 0.4696, "mean_token_accuracy": 0.8490028381347656, "num_tokens": 83238750.0, "step": 2177 }, { "epoch": 0.277063986770131, "ewc_loss": 0.02372456155717373, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.648878429085016e-05, "grad_norm": 3.927096366882324, "learning_rate": 9.228486646884273e-07, "loss": 0.4587, "mean_token_accuracy": 0.8530899286270142, "num_tokens": 83280138.0, "step": 2178 }, { "epoch": 0.2771911970487215, "ewc_loss": 0.02366524375975132, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.589559729443863e-05, "grad_norm": 3.994079113006592, "learning_rate": 9.232725731242052e-07, "loss": 0.5102, "mean_token_accuracy": 0.8372288942337036, "num_tokens": 83320633.0, "step": 2179 }, { "epoch": 0.27731840732731206, "ewc_loss": 0.02373933419585228, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.663650805829093e-05, "grad_norm": 3.981241226196289, "learning_rate": 9.23696481559983e-07, "loss": 0.4935, "mean_token_accuracy": 0.8427645564079285, "num_tokens": 83358787.0, "step": 2180 }, { "epoch": 0.2774456176059026, "ewc_loss": 0.023700051009655, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.624367183074355e-05, "grad_norm": 3.917752981185913, "learning_rate": 9.24120389995761e-07, "loss": 0.4372, "mean_token_accuracy": 0.8611600995063782, "num_tokens": 83400609.0, "step": 2181 }, { "epoch": 0.27757282788449306, "ewc_loss": 0.023678049445152283, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.602366870036349e-05, "grad_norm": 3.989215612411499, "learning_rate": 9.245442984315387e-07, "loss": 0.4539, "mean_token_accuracy": 0.8559904098510742, "num_tokens": 83436058.0, "step": 2182 }, { "epoch": 0.2777000381630836, "ewc_loss": 0.023800427094101906, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 8.663708285894245e-05, "grad_norm": 3.9686665534973145, "learning_rate": 9.249682068673165e-07, "loss": 0.4868, "mean_token_accuracy": 0.8446367383003235, "num_tokens": 83474542.0, "step": 2183 }, { "epoch": 0.2778272484416741, "ewc_loss": 0.023767346516251564, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 8.630627417005599e-05, "grad_norm": 3.988370656967163, "learning_rate": 9.253921153030945e-07, "loss": 0.5224, "mean_token_accuracy": 0.834506094455719, "num_tokens": 83515551.0, "step": 2184 }, { "epoch": 0.2779544587202646, "ewc_loss": 0.02378806099295616, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 8.65134279592894e-05, "grad_norm": 3.990997314453125, "learning_rate": 9.258160237388723e-07, "loss": 0.4395, "mean_token_accuracy": 0.8628710508346558, "num_tokens": 83551826.0, "step": 2185 }, { "epoch": 0.2780816689988551, "ewc_loss": 0.023791268467903137, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 8.654550765641034e-05, "grad_norm": 4.06690788269043, "learning_rate": 9.262399321746503e-07, "loss": 0.4564, "mean_token_accuracy": 0.855744481086731, "num_tokens": 83586598.0, "step": 2186 }, { "epoch": 0.27820887927744564, "ewc_loss": 0.02383173070847988, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 8.695012365933508e-05, "grad_norm": 4.001754283905029, "learning_rate": 9.266638406104281e-07, "loss": 0.505, "mean_token_accuracy": 0.8406411409378052, "num_tokens": 83625996.0, "step": 2187 }, { "epoch": 0.2783360895560361, "ewc_loss": 0.023748140782117844, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 8.6114217992872e-05, "grad_norm": 4.032044887542725, "learning_rate": 9.27087749046206e-07, "loss": 0.4739, "mean_token_accuracy": 0.8514793515205383, "num_tokens": 83657322.0, "step": 2188 }, { "epoch": 0.27846329983462664, "ewc_loss": 0.02371871843934059, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.643033652333543e-05, "grad_norm": 3.999180316925049, "learning_rate": 9.275116574819839e-07, "loss": 0.47, "mean_token_accuracy": 0.8501118421554565, "num_tokens": 83693578.0, "step": 2189 }, { "epoch": 0.27859051011321717, "ewc_loss": 0.023711171001195908, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.63548630150035e-05, "grad_norm": 4.021758079528809, "learning_rate": 9.279355659177617e-07, "loss": 0.4752, "mean_token_accuracy": 0.8515517711639404, "num_tokens": 83727672.0, "step": 2190 }, { "epoch": 0.27871772039180764, "ewc_loss": 0.023718934506177902, "ewc_loss_diag": 1.5079975128173828e-05, "ewc_loss_parallel": 8.643249748274684e-05, "grad_norm": 4.008718967437744, "learning_rate": 9.283594743535395e-07, "loss": 0.518, "mean_token_accuracy": 0.838092565536499, "num_tokens": 83761405.0, "step": 2191 }, { "epoch": 0.2788449306703982, "ewc_loss": 0.023774107918143272, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 8.637388964416459e-05, "grad_norm": 3.9661388397216797, "learning_rate": 9.287833827893175e-07, "loss": 0.4598, "mean_token_accuracy": 0.8509202003479004, "num_tokens": 83801564.0, "step": 2192 }, { "epoch": 0.2789721409489887, "ewc_loss": 0.0237632654607296, "ewc_loss_diag": 1.5139579772949219e-05, "ewc_loss_parallel": 8.626547787571326e-05, "grad_norm": 4.010391712188721, "learning_rate": 9.292072912250953e-07, "loss": 0.4866, "mean_token_accuracy": 0.845424234867096, "num_tokens": 83834260.0, "step": 2193 }, { "epoch": 0.2790993512275792, "ewc_loss": 0.023871738463640213, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.673985576024279e-05, "grad_norm": 3.9010684490203857, "learning_rate": 9.296311996608733e-07, "loss": 0.4164, "mean_token_accuracy": 0.8671528100967407, "num_tokens": 83873518.0, "step": 2194 }, { "epoch": 0.2792265615061697, "ewc_loss": 0.0238246601074934, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.626906492281705e-05, "grad_norm": 3.9958174228668213, "learning_rate": 9.300551080966511e-07, "loss": 0.5298, "mean_token_accuracy": 0.8338264226913452, "num_tokens": 83913902.0, "step": 2195 }, { "epoch": 0.27935377178476023, "ewc_loss": 0.023911572992801666, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.713817805983126e-05, "grad_norm": 4.009346961975098, "learning_rate": 9.30479016532429e-07, "loss": 0.4558, "mean_token_accuracy": 0.8520387411117554, "num_tokens": 83947952.0, "step": 2196 }, { "epoch": 0.2794809820633507, "ewc_loss": 0.023874353617429733, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.676598372403532e-05, "grad_norm": 3.94954514503479, "learning_rate": 9.309029249682068e-07, "loss": 0.4649, "mean_token_accuracy": 0.8534188270568848, "num_tokens": 83986057.0, "step": 2197 }, { "epoch": 0.27960819234194123, "ewc_loss": 0.02388029918074608, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.682545740157366e-05, "grad_norm": 3.97255802154541, "learning_rate": 9.313268334039847e-07, "loss": 0.4755, "mean_token_accuracy": 0.8492085933685303, "num_tokens": 84023881.0, "step": 2198 }, { "epoch": 0.27973540262053176, "ewc_loss": 0.023906603455543518, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.708850509719923e-05, "grad_norm": 3.9533658027648926, "learning_rate": 9.317507418397625e-07, "loss": 0.4102, "mean_token_accuracy": 0.8706086874008179, "num_tokens": 84061480.0, "step": 2199 }, { "epoch": 0.27986261289912223, "ewc_loss": 0.023906605318188667, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.708851237315685e-05, "grad_norm": 3.9566662311553955, "learning_rate": 9.321746502755404e-07, "loss": 0.5113, "mean_token_accuracy": 0.840893566608429, "num_tokens": 84101782.0, "step": 2200 }, { "epoch": 0.27998982317771276, "ewc_loss": 0.02392340637743473, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.725652878638357e-05, "grad_norm": 4.044288158416748, "learning_rate": 9.325985587113183e-07, "loss": 0.4702, "mean_token_accuracy": 0.8514553308486938, "num_tokens": 84136125.0, "step": 2201 }, { "epoch": 0.2801170334563033, "ewc_loss": 0.023952782154083252, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.755029557505623e-05, "grad_norm": 4.033306121826172, "learning_rate": 9.330224671470962e-07, "loss": 0.4819, "mean_token_accuracy": 0.8526401519775391, "num_tokens": 84169748.0, "step": 2202 }, { "epoch": 0.28024424373489376, "ewc_loss": 0.023934027180075645, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.73627359396778e-05, "grad_norm": 4.050825119018555, "learning_rate": 9.334463755828741e-07, "loss": 0.4962, "mean_token_accuracy": 0.8390503525733948, "num_tokens": 84201292.0, "step": 2203 }, { "epoch": 0.2803714540134843, "ewc_loss": 0.02397441491484642, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.776661707088351e-05, "grad_norm": 3.9654061794281006, "learning_rate": 9.338702840186519e-07, "loss": 0.4961, "mean_token_accuracy": 0.8410319089889526, "num_tokens": 84240652.0, "step": 2204 }, { "epoch": 0.2804986642920748, "ewc_loss": 0.02392786182463169, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.730107947485521e-05, "grad_norm": 3.967522144317627, "learning_rate": 9.342941924544298e-07, "loss": 0.4883, "mean_token_accuracy": 0.8456151485443115, "num_tokens": 84280252.0, "step": 2205 }, { "epoch": 0.2806258745706653, "ewc_loss": 0.023986931890249252, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.789177081780508e-05, "grad_norm": 4.007686614990234, "learning_rate": 9.347181008902076e-07, "loss": 0.4281, "mean_token_accuracy": 0.8669983148574829, "num_tokens": 84316505.0, "step": 2206 }, { "epoch": 0.2807530848492558, "ewc_loss": 0.023991385474801064, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.793631423031911e-05, "grad_norm": 3.9960391521453857, "learning_rate": 9.351420093259855e-07, "loss": 0.4304, "mean_token_accuracy": 0.8647598028182983, "num_tokens": 84349990.0, "step": 2207 }, { "epoch": 0.28088029512784635, "ewc_loss": 0.023982390761375427, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.784636156633496e-05, "grad_norm": 4.002085208892822, "learning_rate": 9.355659177617634e-07, "loss": 0.4209, "mean_token_accuracy": 0.8640888929367065, "num_tokens": 84381066.0, "step": 2208 }, { "epoch": 0.2810075054064368, "ewc_loss": 0.023994039744138718, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.796284964773804e-05, "grad_norm": 3.933342218399048, "learning_rate": 9.359898261975413e-07, "loss": 0.4136, "mean_token_accuracy": 0.8686374425888062, "num_tokens": 84422540.0, "step": 2209 }, { "epoch": 0.28113471568502735, "ewc_loss": 0.023958932608366013, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.761178469285369e-05, "grad_norm": 4.015095233917236, "learning_rate": 9.364137346333192e-07, "loss": 0.54, "mean_token_accuracy": 0.8303694725036621, "num_tokens": 84462158.0, "step": 2210 }, { "epoch": 0.2812619259636179, "ewc_loss": 0.024021286517381668, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.823533426038921e-05, "grad_norm": 3.985318899154663, "learning_rate": 9.368376430690971e-07, "loss": 0.4293, "mean_token_accuracy": 0.8619610071182251, "num_tokens": 84495887.0, "step": 2211 }, { "epoch": 0.28138913624220835, "ewc_loss": 0.023970965296030045, "ewc_loss_diag": 1.519918441772461e-05, "ewc_loss_parallel": 8.773212175583467e-05, "grad_norm": 4.043377876281738, "learning_rate": 9.372615515048749e-07, "loss": 0.5024, "mean_token_accuracy": 0.8426067233085632, "num_tokens": 84530176.0, "step": 2212 }, { "epoch": 0.2815163465207989, "ewc_loss": 0.024105466902256012, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 8.846678247209638e-05, "grad_norm": 3.9775826930999756, "learning_rate": 9.376854599406528e-07, "loss": 0.5036, "mean_token_accuracy": 0.844014048576355, "num_tokens": 84574450.0, "step": 2213 }, { "epoch": 0.2816435567993894, "ewc_loss": 0.024034522473812103, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 8.775734022492543e-05, "grad_norm": 4.014721870422363, "learning_rate": 9.381093683764306e-07, "loss": 0.4718, "mean_token_accuracy": 0.8504537343978882, "num_tokens": 84610661.0, "step": 2214 }, { "epoch": 0.2817707670779799, "ewc_loss": 0.024087656289339066, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 8.828867430565879e-05, "grad_norm": 3.9269461631774902, "learning_rate": 9.385332768122085e-07, "loss": 0.453, "mean_token_accuracy": 0.8547933101654053, "num_tokens": 84651259.0, "step": 2215 }, { "epoch": 0.2818979773565704, "ewc_loss": 0.02402985654771328, "ewc_loss_diag": 1.52587890625e-05, "ewc_loss_parallel": 8.771067950874567e-05, "grad_norm": 3.979166269302368, "learning_rate": 9.389571852479864e-07, "loss": 0.4569, "mean_token_accuracy": 0.8564363718032837, "num_tokens": 84692389.0, "step": 2216 }, { "epoch": 0.28202518763516093, "ewc_loss": 0.024224258959293365, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 8.843398245517164e-05, "grad_norm": 3.9976329803466797, "learning_rate": 9.393810936837643e-07, "loss": 0.4609, "mean_token_accuracy": 0.8546355962753296, "num_tokens": 84731905.0, "step": 2217 }, { "epoch": 0.2821523979137514, "ewc_loss": 0.02418798953294754, "ewc_loss_diag": 1.537799835205078e-05, "ewc_loss_parallel": 8.807129779597744e-05, "grad_norm": 3.934537887573242, "learning_rate": 9.398050021195422e-07, "loss": 0.4888, "mean_token_accuracy": 0.8488021492958069, "num_tokens": 84773727.0, "step": 2218 }, { "epoch": 0.28227960819234194, "ewc_loss": 0.024301033467054367, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.79810395417735e-05, "grad_norm": 4.2746663093566895, "learning_rate": 9.402289105553201e-07, "loss": 0.4356, "mean_token_accuracy": 0.860835075378418, "num_tokens": 84816771.0, "step": 2219 }, { "epoch": 0.28240681847093246, "ewc_loss": 0.024462739005684853, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.959809201769531e-05, "grad_norm": 3.9680793285369873, "learning_rate": 9.406528189910978e-07, "loss": 0.4595, "mean_token_accuracy": 0.8548999428749084, "num_tokens": 84853172.0, "step": 2220 }, { "epoch": 0.28253402874952294, "ewc_loss": 0.024181023240089417, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.678094309289008e-05, "grad_norm": 3.977245330810547, "learning_rate": 9.410767274268757e-07, "loss": 0.44, "mean_token_accuracy": 0.8579140305519104, "num_tokens": 84891508.0, "step": 2221 }, { "epoch": 0.28266123902811346, "ewc_loss": 0.024325545877218246, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.822615927783772e-05, "grad_norm": 4.0040788650512695, "learning_rate": 9.415006358626536e-07, "loss": 0.5372, "mean_token_accuracy": 0.828865647315979, "num_tokens": 84931204.0, "step": 2222 }, { "epoch": 0.282788449306704, "ewc_loss": 0.024295806884765625, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.792877633823082e-05, "grad_norm": 4.05334997177124, "learning_rate": 9.419245442984314e-07, "loss": 0.5183, "mean_token_accuracy": 0.8363475799560547, "num_tokens": 84966782.0, "step": 2223 }, { "epoch": 0.28291565958529447, "ewc_loss": 0.024326320737600327, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.823392272461206e-05, "grad_norm": 4.046329498291016, "learning_rate": 9.423484527342094e-07, "loss": 0.4642, "mean_token_accuracy": 0.8508552312850952, "num_tokens": 85001512.0, "step": 2224 }, { "epoch": 0.283042869863885, "ewc_loss": 0.024301286786794662, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.798357885098085e-05, "grad_norm": 4.02895975112915, "learning_rate": 9.427723611699872e-07, "loss": 0.4738, "mean_token_accuracy": 0.8515117168426514, "num_tokens": 85036082.0, "step": 2225 }, { "epoch": 0.2831700801424755, "ewc_loss": 0.024325957521796227, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.823027746984735e-05, "grad_norm": 3.958059549331665, "learning_rate": 9.431962696057652e-07, "loss": 0.4796, "mean_token_accuracy": 0.8495185375213623, "num_tokens": 85079433.0, "step": 2226 }, { "epoch": 0.283297290421066, "ewc_loss": 0.024295253679156303, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.792323933448642e-05, "grad_norm": 4.022744178771973, "learning_rate": 9.43620178041543e-07, "loss": 0.4325, "mean_token_accuracy": 0.8653071522712708, "num_tokens": 85116940.0, "step": 2227 }, { "epoch": 0.2834245006996565, "ewc_loss": 0.024333011358976364, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.830080332700163e-05, "grad_norm": 3.9856925010681152, "learning_rate": 9.440440864773208e-07, "loss": 0.4847, "mean_token_accuracy": 0.8491050004959106, "num_tokens": 85153328.0, "step": 2228 }, { "epoch": 0.28355171097824705, "ewc_loss": 0.02429788187146187, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.794952736934647e-05, "grad_norm": 3.9685122966766357, "learning_rate": 9.444679949130987e-07, "loss": 0.4858, "mean_token_accuracy": 0.8447713851928711, "num_tokens": 85194059.0, "step": 2229 }, { "epoch": 0.2836789212568376, "ewc_loss": 0.024325724691152573, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.822794916341081e-05, "grad_norm": 4.079324245452881, "learning_rate": 9.448919033488766e-07, "loss": 0.5051, "mean_token_accuracy": 0.8431074619293213, "num_tokens": 85230820.0, "step": 2230 }, { "epoch": 0.28380613153542805, "ewc_loss": 0.024387769401073456, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.884839189704508e-05, "grad_norm": 3.964827060699463, "learning_rate": 9.453158117846544e-07, "loss": 0.4884, "mean_token_accuracy": 0.8459650278091431, "num_tokens": 85268864.0, "step": 2231 }, { "epoch": 0.2839333418140186, "ewc_loss": 0.024299748241901398, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.796819020062685e-05, "grad_norm": 4.000776767730713, "learning_rate": 9.457397202204324e-07, "loss": 0.5139, "mean_token_accuracy": 0.839593768119812, "num_tokens": 85311377.0, "step": 2232 }, { "epoch": 0.2840605520926091, "ewc_loss": 0.024406522512435913, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.903591515263543e-05, "grad_norm": 4.014707565307617, "learning_rate": 9.461636286562102e-07, "loss": 0.4289, "mean_token_accuracy": 0.8664026260375977, "num_tokens": 85347082.0, "step": 2233 }, { "epoch": 0.2841877623711996, "ewc_loss": 0.02437026984989643, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.867340511642396e-05, "grad_norm": 4.031045913696289, "learning_rate": 9.465875370919882e-07, "loss": 0.5169, "mean_token_accuracy": 0.8409106731414795, "num_tokens": 85384696.0, "step": 2234 }, { "epoch": 0.2843149726497901, "ewc_loss": 0.024384768679738045, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.881838584784418e-05, "grad_norm": 4.094951152801514, "learning_rate": 9.470114455277659e-07, "loss": 0.503, "mean_token_accuracy": 0.8411046266555786, "num_tokens": 85420192.0, "step": 2235 }, { "epoch": 0.28444218292838064, "ewc_loss": 0.02441463991999626, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.911711483960971e-05, "grad_norm": 3.9907212257385254, "learning_rate": 9.474353539635438e-07, "loss": 0.4528, "mean_token_accuracy": 0.8545621633529663, "num_tokens": 85462281.0, "step": 2236 }, { "epoch": 0.2845693932069711, "ewc_loss": 0.024355093017220497, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.852163591654971e-05, "grad_norm": 4.045661926269531, "learning_rate": 9.478592623993217e-07, "loss": 0.4581, "mean_token_accuracy": 0.8541620373725891, "num_tokens": 85495191.0, "step": 2237 }, { "epoch": 0.28469660348556164, "ewc_loss": 0.02441687136888504, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.91394229256548e-05, "grad_norm": 3.996269464492798, "learning_rate": 9.482831708350996e-07, "loss": 0.4561, "mean_token_accuracy": 0.8559638261795044, "num_tokens": 85533803.0, "step": 2238 }, { "epoch": 0.28482381376415217, "ewc_loss": 0.024362877011299133, "ewc_loss_diag": 1.5497207641601562e-05, "ewc_loss_parallel": 8.859946683514863e-05, "grad_norm": 4.005303859710693, "learning_rate": 9.487070792708775e-07, "loss": 0.5026, "mean_token_accuracy": 0.8441857695579529, "num_tokens": 85573424.0, "step": 2239 }, { "epoch": 0.28495102404274264, "ewc_loss": 0.024529660120606422, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.904660353437066e-05, "grad_norm": 3.9581410884857178, "learning_rate": 9.491309877066554e-07, "loss": 0.4862, "mean_token_accuracy": 0.847466766834259, "num_tokens": 85616423.0, "step": 2240 }, { "epoch": 0.28507823432133317, "ewc_loss": 0.02448856458067894, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.863563562044874e-05, "grad_norm": 4.006760597229004, "learning_rate": 9.495548961424332e-07, "loss": 0.5112, "mean_token_accuracy": 0.8395038843154907, "num_tokens": 85658410.0, "step": 2241 }, { "epoch": 0.2852054445999237, "ewc_loss": 0.024579863995313644, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.954863005783409e-05, "grad_norm": 4.002693176269531, "learning_rate": 9.499788045782111e-07, "loss": 0.47, "mean_token_accuracy": 0.8515297770500183, "num_tokens": 85695767.0, "step": 2242 }, { "epoch": 0.28533265487851417, "ewc_loss": 0.02451985701918602, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.894856000551954e-05, "grad_norm": 3.979954957962036, "learning_rate": 9.504027130139889e-07, "loss": 0.486, "mean_token_accuracy": 0.8462478518486023, "num_tokens": 85737771.0, "step": 2243 }, { "epoch": 0.2854598651571047, "ewc_loss": 0.024567916989326477, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.94291588338092e-05, "grad_norm": 4.0195441246032715, "learning_rate": 9.508266214497667e-07, "loss": 0.4335, "mean_token_accuracy": 0.8653833866119385, "num_tokens": 85774789.0, "step": 2244 }, { "epoch": 0.2855870754356952, "ewc_loss": 0.024542078375816345, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.917078230297193e-05, "grad_norm": 3.9743502140045166, "learning_rate": 9.512505298855447e-07, "loss": 0.472, "mean_token_accuracy": 0.8538817763328552, "num_tokens": 85817833.0, "step": 2245 }, { "epoch": 0.2857142857142857, "ewc_loss": 0.02455902472138405, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.934025390772149e-05, "grad_norm": 4.022239685058594, "learning_rate": 9.516744383213225e-07, "loss": 0.4166, "mean_token_accuracy": 0.8663850426673889, "num_tokens": 85854488.0, "step": 2246 }, { "epoch": 0.2858414959928762, "ewc_loss": 0.024579359218478203, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.954359509516507e-05, "grad_norm": 4.059125900268555, "learning_rate": 9.520983467571005e-07, "loss": 0.4947, "mean_token_accuracy": 0.8448400497436523, "num_tokens": 85893746.0, "step": 2247 }, { "epoch": 0.28596870627146675, "ewc_loss": 0.02460872009396553, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.98371945368126e-05, "grad_norm": 4.04183292388916, "learning_rate": 9.525222551928783e-07, "loss": 0.5318, "mean_token_accuracy": 0.8385196924209595, "num_tokens": 85932798.0, "step": 2248 }, { "epoch": 0.2860959165500572, "ewc_loss": 0.024567460641264915, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.942460408434272e-05, "grad_norm": 4.0692291259765625, "learning_rate": 9.529461636286562e-07, "loss": 0.4472, "mean_token_accuracy": 0.8590582609176636, "num_tokens": 85964446.0, "step": 2249 }, { "epoch": 0.28622312682864776, "ewc_loss": 0.024608056992292404, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.98305734153837e-05, "grad_norm": 4.054825782775879, "learning_rate": 9.533700720644341e-07, "loss": 0.4257, "mean_token_accuracy": 0.8661711812019348, "num_tokens": 86000587.0, "step": 2250 }, { "epoch": 0.2863503371072383, "ewc_loss": 0.02459792047739029, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.972921204986051e-05, "grad_norm": 4.05064058303833, "learning_rate": 9.537939805002118e-07, "loss": 0.4781, "mean_token_accuracy": 0.8490671515464783, "num_tokens": 86042891.0, "step": 2251 }, { "epoch": 0.28647754738582876, "ewc_loss": 0.024599412456154823, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.974412048701197e-05, "grad_norm": 4.018871307373047, "learning_rate": 9.542178889359898e-07, "loss": 0.4554, "mean_token_accuracy": 0.8560901880264282, "num_tokens": 86079973.0, "step": 2252 }, { "epoch": 0.2866047576644193, "ewc_loss": 0.024587996304035187, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.962996798800305e-05, "grad_norm": 4.0242180824279785, "learning_rate": 9.546417973717677e-07, "loss": 0.4785, "mean_token_accuracy": 0.8504679799079895, "num_tokens": 86119767.0, "step": 2253 }, { "epoch": 0.2867319679430098, "ewc_loss": 0.024598686024546623, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.973685908131301e-05, "grad_norm": 3.971395254135132, "learning_rate": 9.550657058075455e-07, "loss": 0.4978, "mean_token_accuracy": 0.8456088304519653, "num_tokens": 86163176.0, "step": 2254 }, { "epoch": 0.2868591782216003, "ewc_loss": 0.024594752117991447, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.969751797849312e-05, "grad_norm": 4.037683010101318, "learning_rate": 9.554896142433234e-07, "loss": 0.4932, "mean_token_accuracy": 0.8438398838043213, "num_tokens": 86201523.0, "step": 2255 }, { "epoch": 0.2869863885001908, "ewc_loss": 0.024637524038553238, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.012524969875813e-05, "grad_norm": 4.014577865600586, "learning_rate": 9.559135226791012e-07, "loss": 0.5118, "mean_token_accuracy": 0.8349227905273438, "num_tokens": 86240277.0, "step": 2256 }, { "epoch": 0.28711359877878134, "ewc_loss": 0.024616360664367676, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.991360664367676e-05, "grad_norm": 3.981802463531494, "learning_rate": 9.563374311148793e-07, "loss": 0.5019, "mean_token_accuracy": 0.8425683975219727, "num_tokens": 86289714.0, "step": 2257 }, { "epoch": 0.2872408090573718, "ewc_loss": 0.02461392618715763, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.98892612894997e-05, "grad_norm": 4.098331928253174, "learning_rate": 9.56761339550657e-07, "loss": 0.4537, "mean_token_accuracy": 0.8582965731620789, "num_tokens": 86320786.0, "step": 2258 }, { "epoch": 0.28736801933596234, "ewc_loss": 0.024682197719812393, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.057196439243853e-05, "grad_norm": 4.003024101257324, "learning_rate": 9.57185247986435e-07, "loss": 0.4423, "mean_token_accuracy": 0.8598589897155762, "num_tokens": 86358002.0, "step": 2259 }, { "epoch": 0.28749522961455287, "ewc_loss": 0.024606753140687943, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 8.981752034742385e-05, "grad_norm": 3.9999866485595703, "learning_rate": 9.576091564222128e-07, "loss": 0.411, "mean_token_accuracy": 0.868323802947998, "num_tokens": 86395334.0, "step": 2260 }, { "epoch": 0.28762243989314334, "ewc_loss": 0.024653460830450058, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.028461499838158e-05, "grad_norm": 4.042495250701904, "learning_rate": 9.580330648579906e-07, "loss": 0.4884, "mean_token_accuracy": 0.8437442779541016, "num_tokens": 86432616.0, "step": 2261 }, { "epoch": 0.2877496501717339, "ewc_loss": 0.024669766426086426, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.044767648447305e-05, "grad_norm": 4.043595314025879, "learning_rate": 9.584569732937685e-07, "loss": 0.4838, "mean_token_accuracy": 0.8473302721977234, "num_tokens": 86472293.0, "step": 2262 }, { "epoch": 0.2878768604503244, "ewc_loss": 0.02464105747640133, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.016057447297499e-05, "grad_norm": 4.057711601257324, "learning_rate": 9.588808817295463e-07, "loss": 0.4723, "mean_token_accuracy": 0.8467472791671753, "num_tokens": 86506304.0, "step": 2263 }, { "epoch": 0.2880040707289149, "ewc_loss": 0.024692676961421967, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.067677456187084e-05, "grad_norm": 4.04298210144043, "learning_rate": 9.593047901653242e-07, "loss": 0.4714, "mean_token_accuracy": 0.8492167592048645, "num_tokens": 86540520.0, "step": 2264 }, { "epoch": 0.2881312810075054, "ewc_loss": 0.024689268320798874, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.064269397640601e-05, "grad_norm": 4.017160415649414, "learning_rate": 9.597286986011022e-07, "loss": 0.4318, "mean_token_accuracy": 0.8627146482467651, "num_tokens": 86576956.0, "step": 2265 }, { "epoch": 0.28825849128609593, "ewc_loss": 0.024701524525880814, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.076525748241693e-05, "grad_norm": 4.090961933135986, "learning_rate": 9.601526070368799e-07, "loss": 0.5156, "mean_token_accuracy": 0.8394927382469177, "num_tokens": 86613799.0, "step": 2266 }, { "epoch": 0.2883857015646864, "ewc_loss": 0.024751197546720505, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.126198710873723e-05, "grad_norm": 4.0855536460876465, "learning_rate": 9.60576515472658e-07, "loss": 0.4576, "mean_token_accuracy": 0.8549596071243286, "num_tokens": 86650338.0, "step": 2267 }, { "epoch": 0.28851291184327693, "ewc_loss": 0.024719595909118652, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.094595588976517e-05, "grad_norm": 4.066219806671143, "learning_rate": 9.610004239084358e-07, "loss": 0.4686, "mean_token_accuracy": 0.8505526781082153, "num_tokens": 86691605.0, "step": 2268 }, { "epoch": 0.28864012212186746, "ewc_loss": 0.024722402915358543, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.097402653424069e-05, "grad_norm": 4.049037933349609, "learning_rate": 9.614243323442136e-07, "loss": 0.5375, "mean_token_accuracy": 0.8296729326248169, "num_tokens": 86728767.0, "step": 2269 }, { "epoch": 0.28876733240045793, "ewc_loss": 0.024706292897462845, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.081292228074744e-05, "grad_norm": 4.04664421081543, "learning_rate": 9.618482407799915e-07, "loss": 0.4479, "mean_token_accuracy": 0.8564197421073914, "num_tokens": 86765240.0, "step": 2270 }, { "epoch": 0.28889454267904846, "ewc_loss": 0.024725353345274925, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.100353781832382e-05, "grad_norm": 3.9967267513275146, "learning_rate": 9.622721492157693e-07, "loss": 0.417, "mean_token_accuracy": 0.8679818511009216, "num_tokens": 86804042.0, "step": 2271 }, { "epoch": 0.289021752957639, "ewc_loss": 0.024698365479707718, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.073365799849853e-05, "grad_norm": 4.148212432861328, "learning_rate": 9.626960576515472e-07, "loss": 0.4664, "mean_token_accuracy": 0.8525274991989136, "num_tokens": 86835220.0, "step": 2272 }, { "epoch": 0.28914896323622946, "ewc_loss": 0.024802837520837784, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.177836182061583e-05, "grad_norm": 4.060527801513672, "learning_rate": 9.63119966087325e-07, "loss": 0.4539, "mean_token_accuracy": 0.8536996841430664, "num_tokens": 86873414.0, "step": 2273 }, { "epoch": 0.28927617351482, "ewc_loss": 0.024718737229704857, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.093737025978044e-05, "grad_norm": 4.031476020812988, "learning_rate": 9.635438745231029e-07, "loss": 0.4447, "mean_token_accuracy": 0.860424280166626, "num_tokens": 86914272.0, "step": 2274 }, { "epoch": 0.2894033837934105, "ewc_loss": 0.024763522669672966, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.138522727880627e-05, "grad_norm": 4.049277305603027, "learning_rate": 9.63967782958881e-07, "loss": 0.4844, "mean_token_accuracy": 0.8448431491851807, "num_tokens": 86952425.0, "step": 2275 }, { "epoch": 0.289530594072001, "ewc_loss": 0.024772508069872856, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.147508535534143e-05, "grad_norm": 4.047529697418213, "learning_rate": 9.643916913946588e-07, "loss": 0.4445, "mean_token_accuracy": 0.8595618009567261, "num_tokens": 86987842.0, "step": 2276 }, { "epoch": 0.2896578043505915, "ewc_loss": 0.024763198569417, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.138198220171034e-05, "grad_norm": 4.004054069519043, "learning_rate": 9.648155998304366e-07, "loss": 0.4179, "mean_token_accuracy": 0.8676272630691528, "num_tokens": 87026750.0, "step": 2277 }, { "epoch": 0.28978501462918205, "ewc_loss": 0.024762270972132683, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.137270535575226e-05, "grad_norm": 4.000792503356934, "learning_rate": 9.652395082662145e-07, "loss": 0.5213, "mean_token_accuracy": 0.8338651657104492, "num_tokens": 87069685.0, "step": 2278 }, { "epoch": 0.2899122249077726, "ewc_loss": 0.024783413857221603, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.158415195997804e-05, "grad_norm": 4.101005554199219, "learning_rate": 9.656634167019923e-07, "loss": 0.4761, "mean_token_accuracy": 0.8459458947181702, "num_tokens": 87107054.0, "step": 2279 }, { "epoch": 0.29003943518636305, "ewc_loss": 0.024821192026138306, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.196191967930645e-05, "grad_norm": 4.03912353515625, "learning_rate": 9.660873251377701e-07, "loss": 0.483, "mean_token_accuracy": 0.8465366363525391, "num_tokens": 87146254.0, "step": 2280 }, { "epoch": 0.2901666454649536, "ewc_loss": 0.024757225066423416, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.132223931374028e-05, "grad_norm": 4.1130595207214355, "learning_rate": 9.66511233573548e-07, "loss": 0.5077, "mean_token_accuracy": 0.8431459665298462, "num_tokens": 87182404.0, "step": 2281 }, { "epoch": 0.2902938557435441, "ewc_loss": 0.02482590079307556, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.200900240102783e-05, "grad_norm": 4.075185298919678, "learning_rate": 9.669351420093258e-07, "loss": 0.4397, "mean_token_accuracy": 0.8611068725585938, "num_tokens": 87216551.0, "step": 2282 }, { "epoch": 0.2904210660221346, "ewc_loss": 0.02477153018116951, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.146530646830797e-05, "grad_norm": 4.116696834564209, "learning_rate": 9.67359050445104e-07, "loss": 0.4627, "mean_token_accuracy": 0.8512571454048157, "num_tokens": 87249402.0, "step": 2283 }, { "epoch": 0.2905482763007251, "ewc_loss": 0.024840563535690308, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.215564205078408e-05, "grad_norm": 4.073296546936035, "learning_rate": 9.677829588808817e-07, "loss": 0.45, "mean_token_accuracy": 0.8551491498947144, "num_tokens": 87281846.0, "step": 2284 }, { "epoch": 0.29067548657931563, "ewc_loss": 0.0247793048620224, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.154305007541552e-05, "grad_norm": 4.1427178382873535, "learning_rate": 9.682068673166596e-07, "loss": 0.5196, "mean_token_accuracy": 0.8412654399871826, "num_tokens": 87315765.0, "step": 2285 }, { "epoch": 0.2908026968579061, "ewc_loss": 0.02485238015651703, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.227381087839603e-05, "grad_norm": 4.119114398956299, "learning_rate": 9.686307757524374e-07, "loss": 0.4634, "mean_token_accuracy": 0.8512978553771973, "num_tokens": 87353668.0, "step": 2286 }, { "epoch": 0.29092990713649663, "ewc_loss": 0.02482428029179573, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.199279884342104e-05, "grad_norm": 4.0487823486328125, "learning_rate": 9.690546841882153e-07, "loss": 0.3956, "mean_token_accuracy": 0.8747530579566956, "num_tokens": 87390747.0, "step": 2287 }, { "epoch": 0.29105711741508716, "ewc_loss": 0.024783052504062653, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.158051398117095e-05, "grad_norm": 4.08005952835083, "learning_rate": 9.694785926239931e-07, "loss": 0.4759, "mean_token_accuracy": 0.8516179323196411, "num_tokens": 87430686.0, "step": 2288 }, { "epoch": 0.29118432769367764, "ewc_loss": 0.02484160289168358, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.216603211825714e-05, "grad_norm": 4.106024742126465, "learning_rate": 9.69902501059771e-07, "loss": 0.5154, "mean_token_accuracy": 0.8375457525253296, "num_tokens": 87465290.0, "step": 2289 }, { "epoch": 0.29131153797226816, "ewc_loss": 0.02481713332235813, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.192133438773453e-05, "grad_norm": 4.049407482147217, "learning_rate": 9.703264094955488e-07, "loss": 0.4843, "mean_token_accuracy": 0.847004234790802, "num_tokens": 87504553.0, "step": 2290 }, { "epoch": 0.2914387482508587, "ewc_loss": 0.02481825277209282, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.193252481054515e-05, "grad_norm": 4.028171062469482, "learning_rate": 9.707503179313269e-07, "loss": 0.4554, "mean_token_accuracy": 0.8576200008392334, "num_tokens": 87544896.0, "step": 2291 }, { "epoch": 0.29156595852944917, "ewc_loss": 0.024844147264957428, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.219146886607632e-05, "grad_norm": 4.070387840270996, "learning_rate": 9.711742263671047e-07, "loss": 0.4553, "mean_token_accuracy": 0.8554109334945679, "num_tokens": 87582075.0, "step": 2292 }, { "epoch": 0.2916931688080397, "ewc_loss": 0.02484625205397606, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.221251821145415e-05, "grad_norm": 4.0569233894348145, "learning_rate": 9.715981348028826e-07, "loss": 0.4482, "mean_token_accuracy": 0.8574779629707336, "num_tokens": 87623117.0, "step": 2293 }, { "epoch": 0.2918203790866302, "ewc_loss": 0.0248505137860775, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.225513349520043e-05, "grad_norm": 4.181166172027588, "learning_rate": 9.720220432386604e-07, "loss": 0.4619, "mean_token_accuracy": 0.8545046448707581, "num_tokens": 87656263.0, "step": 2294 }, { "epoch": 0.2919475893652207, "ewc_loss": 0.024903148412704468, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.278148354496807e-05, "grad_norm": 4.037055492401123, "learning_rate": 9.724459516744383e-07, "loss": 0.4371, "mean_token_accuracy": 0.8624204397201538, "num_tokens": 87695852.0, "step": 2295 }, { "epoch": 0.2920747996438112, "ewc_loss": 0.02480730414390564, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.182303620036691e-05, "grad_norm": 4.205319404602051, "learning_rate": 9.728698601102161e-07, "loss": 0.4349, "mean_token_accuracy": 0.8626075387001038, "num_tokens": 87734331.0, "step": 2296 }, { "epoch": 0.29220200992240175, "ewc_loss": 0.024933617562055588, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.308617882197723e-05, "grad_norm": 4.138247966766357, "learning_rate": 9.73293768545994e-07, "loss": 0.468, "mean_token_accuracy": 0.8480420112609863, "num_tokens": 87768447.0, "step": 2297 }, { "epoch": 0.2923292202009922, "ewc_loss": 0.024796778336167336, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.171778219752014e-05, "grad_norm": 4.070497512817383, "learning_rate": 9.737176769817718e-07, "loss": 0.4954, "mean_token_accuracy": 0.847436785697937, "num_tokens": 87813213.0, "step": 2298 }, { "epoch": 0.29245643047958275, "ewc_loss": 0.024838164448738098, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.213163139065728e-05, "grad_norm": 4.130634784698486, "learning_rate": 9.741415854175499e-07, "loss": 0.5303, "mean_token_accuracy": 0.8373695015907288, "num_tokens": 87851336.0, "step": 2299 }, { "epoch": 0.2925836407581733, "ewc_loss": 0.024875186383724213, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.250186121789739e-05, "grad_norm": 4.131685256958008, "learning_rate": 9.745654938533277e-07, "loss": 0.4842, "mean_token_accuracy": 0.8467634916305542, "num_tokens": 87888474.0, "step": 2300 }, { "epoch": 0.29271085103676375, "ewc_loss": 0.024846330285072327, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.22133112908341e-05, "grad_norm": 4.10223913192749, "learning_rate": 9.749894022891056e-07, "loss": 0.5095, "mean_token_accuracy": 0.8412572741508484, "num_tokens": 87928232.0, "step": 2301 }, { "epoch": 0.2928380613153543, "ewc_loss": 0.024823300540447235, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.198299812851474e-05, "grad_norm": 4.020438194274902, "learning_rate": 9.754133107248834e-07, "loss": 0.427, "mean_token_accuracy": 0.8649998903274536, "num_tokens": 87966982.0, "step": 2302 }, { "epoch": 0.2929652715939448, "ewc_loss": 0.0248112715780735, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.186270472127944e-05, "grad_norm": 4.095055103302002, "learning_rate": 9.758372191606612e-07, "loss": 0.5134, "mean_token_accuracy": 0.8359590172767639, "num_tokens": 88003550.0, "step": 2303 }, { "epoch": 0.2930924818725353, "ewc_loss": 0.02492201328277588, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.297014184994623e-05, "grad_norm": 4.108351707458496, "learning_rate": 9.76261127596439e-07, "loss": 0.4252, "mean_token_accuracy": 0.8653702735900879, "num_tokens": 88038038.0, "step": 2304 }, { "epoch": 0.2932196921511258, "ewc_loss": 0.024864353239536285, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.23935222090222e-05, "grad_norm": 4.122253894805908, "learning_rate": 9.76685036032217e-07, "loss": 0.4953, "mean_token_accuracy": 0.8405712246894836, "num_tokens": 88074627.0, "step": 2305 }, { "epoch": 0.29334690242971634, "ewc_loss": 0.02488815039396286, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.26315042306669e-05, "grad_norm": 4.057503700256348, "learning_rate": 9.771089444679948e-07, "loss": 0.4402, "mean_token_accuracy": 0.8602638244628906, "num_tokens": 88116772.0, "step": 2306 }, { "epoch": 0.2934741127083068, "ewc_loss": 0.024831339716911316, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.206339018419385e-05, "grad_norm": 4.119828224182129, "learning_rate": 9.775328529037728e-07, "loss": 0.4848, "mean_token_accuracy": 0.8465943336486816, "num_tokens": 88152888.0, "step": 2307 }, { "epoch": 0.29360132298689734, "ewc_loss": 0.024907248094677925, "ewc_loss_diag": 1.5616416931152344e-05, "ewc_loss_parallel": 9.282247629016638e-05, "grad_norm": 4.194037914276123, "learning_rate": 9.779567613395507e-07, "loss": 0.513, "mean_token_accuracy": 0.8411906957626343, "num_tokens": 88184544.0, "step": 2308 }, { "epoch": 0.29372853326548787, "ewc_loss": 0.025025371462106705, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.278301149606705e-05, "grad_norm": 4.086257457733154, "learning_rate": 9.783806697753285e-07, "loss": 0.458, "mean_token_accuracy": 0.8565793633460999, "num_tokens": 88221603.0, "step": 2309 }, { "epoch": 0.29385574354407834, "ewc_loss": 0.02498408406972885, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.23701300052926e-05, "grad_norm": 4.053706169128418, "learning_rate": 9.788045782111064e-07, "loss": 0.4317, "mean_token_accuracy": 0.8627517223358154, "num_tokens": 88265109.0, "step": 2310 }, { "epoch": 0.29398295382266887, "ewc_loss": 0.025018513202667236, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.271442831959575e-05, "grad_norm": 4.064105987548828, "learning_rate": 9.792284866468842e-07, "loss": 0.4224, "mean_token_accuracy": 0.863097608089447, "num_tokens": 88302756.0, "step": 2311 }, { "epoch": 0.2941101641012594, "ewc_loss": 0.025038007646799088, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.290938032791018e-05, "grad_norm": 4.120390892028809, "learning_rate": 9.79652395082662e-07, "loss": 0.4221, "mean_token_accuracy": 0.861284613609314, "num_tokens": 88335605.0, "step": 2312 }, { "epoch": 0.29423737437984987, "ewc_loss": 0.025061819702386856, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.314750059274957e-05, "grad_norm": 4.150395393371582, "learning_rate": 9.8007630351844e-07, "loss": 0.4928, "mean_token_accuracy": 0.8455524444580078, "num_tokens": 88369447.0, "step": 2313 }, { "epoch": 0.2943645846584404, "ewc_loss": 0.02506030723452568, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.313235932495445e-05, "grad_norm": 4.2069926261901855, "learning_rate": 9.805002119542178e-07, "loss": 0.5221, "mean_token_accuracy": 0.8363431096076965, "num_tokens": 88404442.0, "step": 2314 }, { "epoch": 0.2944917949370309, "ewc_loss": 0.025082744657993317, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.335675713373348e-05, "grad_norm": 4.056786060333252, "learning_rate": 9.809241203899958e-07, "loss": 0.4682, "mean_token_accuracy": 0.8548945188522339, "num_tokens": 88445870.0, "step": 2315 }, { "epoch": 0.2946190052156214, "ewc_loss": 0.024998094886541367, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.251025039702654e-05, "grad_norm": 4.126248836517334, "learning_rate": 9.813480288257737e-07, "loss": 0.481, "mean_token_accuracy": 0.8435964584350586, "num_tokens": 88480994.0, "step": 2316 }, { "epoch": 0.2947462154942119, "ewc_loss": 0.02509797178208828, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.350901382276788e-05, "grad_norm": 4.072986602783203, "learning_rate": 9.817719372615515e-07, "loss": 0.4386, "mean_token_accuracy": 0.8610479831695557, "num_tokens": 88519666.0, "step": 2317 }, { "epoch": 0.29487342577280246, "ewc_loss": 0.025022994726896286, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.275924094254151e-05, "grad_norm": 4.128632068634033, "learning_rate": 9.821958456973294e-07, "loss": 0.4122, "mean_token_accuracy": 0.8678861856460571, "num_tokens": 88553855.0, "step": 2318 }, { "epoch": 0.29500063605139293, "ewc_loss": 0.0250936858355999, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.346616570837796e-05, "grad_norm": 4.1143317222595215, "learning_rate": 9.826197541331072e-07, "loss": 0.4278, "mean_token_accuracy": 0.8657228946685791, "num_tokens": 88589053.0, "step": 2319 }, { "epoch": 0.29512784632998346, "ewc_loss": 0.025052841752767563, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.305770799983293e-05, "grad_norm": 4.11169958114624, "learning_rate": 9.83043662568885e-07, "loss": 0.4866, "mean_token_accuracy": 0.8476998805999756, "num_tokens": 88627067.0, "step": 2320 }, { "epoch": 0.295255056608574, "ewc_loss": 0.02506830543279648, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.321235120296478e-05, "grad_norm": 4.030381679534912, "learning_rate": 9.83467571004663e-07, "loss": 0.4306, "mean_token_accuracy": 0.8629463911056519, "num_tokens": 88668030.0, "step": 2321 }, { "epoch": 0.29538226688716446, "ewc_loss": 0.02502932772040367, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.282258542953059e-05, "grad_norm": 4.08668327331543, "learning_rate": 9.838914794404407e-07, "loss": 0.4291, "mean_token_accuracy": 0.8651562929153442, "num_tokens": 88708889.0, "step": 2322 }, { "epoch": 0.295509477165755, "ewc_loss": 0.025100603699684143, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.3535338237416e-05, "grad_norm": 4.193717956542969, "learning_rate": 9.843153878762188e-07, "loss": 0.4968, "mean_token_accuracy": 0.845744252204895, "num_tokens": 88745689.0, "step": 2323 }, { "epoch": 0.2956366874443455, "ewc_loss": 0.025095008313655853, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.347938612336293e-05, "grad_norm": 4.0776567459106445, "learning_rate": 9.847392963119966e-07, "loss": 0.448, "mean_token_accuracy": 0.8586021065711975, "num_tokens": 88784667.0, "step": 2324 }, { "epoch": 0.295763897722936, "ewc_loss": 0.0250261053442955, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.279033838538453e-05, "grad_norm": 4.066418170928955, "learning_rate": 9.851632047477745e-07, "loss": 0.4609, "mean_token_accuracy": 0.8525645732879639, "num_tokens": 88827334.0, "step": 2325 }, { "epoch": 0.2958911080015265, "ewc_loss": 0.025078654289245605, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.331583714811131e-05, "grad_norm": 4.0974016189575195, "learning_rate": 9.855871131835523e-07, "loss": 0.3944, "mean_token_accuracy": 0.8736828565597534, "num_tokens": 88863744.0, "step": 2326 }, { "epoch": 0.29601831828011704, "ewc_loss": 0.025088973343372345, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.341901750303805e-05, "grad_norm": 4.0826616287231445, "learning_rate": 9.860110216193302e-07, "loss": 0.426, "mean_token_accuracy": 0.8660633563995361, "num_tokens": 88903693.0, "step": 2327 }, { "epoch": 0.2961455285587075, "ewc_loss": 0.025074616074562073, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.32754555833526e-05, "grad_norm": 4.137848377227783, "learning_rate": 9.86434930055108e-07, "loss": 0.4218, "mean_token_accuracy": 0.8658237457275391, "num_tokens": 88936668.0, "step": 2328 }, { "epoch": 0.29627273883729804, "ewc_loss": 0.0251159630715847, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.36889264266938e-05, "grad_norm": 4.123019695281982, "learning_rate": 9.868588384908859e-07, "loss": 0.4311, "mean_token_accuracy": 0.860259473323822, "num_tokens": 88970920.0, "step": 2329 }, { "epoch": 0.2963999491158886, "ewc_loss": 0.02507214993238449, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.325079736299813e-05, "grad_norm": 4.078372955322266, "learning_rate": 9.872827469266637e-07, "loss": 0.4344, "mean_token_accuracy": 0.8586316704750061, "num_tokens": 89010856.0, "step": 2330 }, { "epoch": 0.2965271593944791, "ewc_loss": 0.025081902742385864, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.334832429885864e-05, "grad_norm": 4.118564128875732, "learning_rate": 9.877066553624418e-07, "loss": 0.4425, "mean_token_accuracy": 0.8587301969528198, "num_tokens": 89047109.0, "step": 2331 }, { "epoch": 0.2966543696730696, "ewc_loss": 0.025109216570854187, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.362145647173747e-05, "grad_norm": 4.037589073181152, "learning_rate": 9.881305637982196e-07, "loss": 0.4516, "mean_token_accuracy": 0.8591119647026062, "num_tokens": 89087859.0, "step": 2332 }, { "epoch": 0.2967815799516601, "ewc_loss": 0.02506178617477417, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.314716589869931e-05, "grad_norm": 4.156486988067627, "learning_rate": 9.885544722339975e-07, "loss": 0.433, "mean_token_accuracy": 0.8612080812454224, "num_tokens": 89123601.0, "step": 2333 }, { "epoch": 0.29690879023025063, "ewc_loss": 0.025130975991487503, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.383905853610486e-05, "grad_norm": 4.124579906463623, "learning_rate": 9.889783806697753e-07, "loss": 0.4162, "mean_token_accuracy": 0.8687442541122437, "num_tokens": 89160133.0, "step": 2334 }, { "epoch": 0.2970360005088411, "ewc_loss": 0.025071054697036743, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.323983977083117e-05, "grad_norm": 4.114026069641113, "learning_rate": 9.894022891055532e-07, "loss": 0.4111, "mean_token_accuracy": 0.8698400259017944, "num_tokens": 89196731.0, "step": 2335 }, { "epoch": 0.29716321078743163, "ewc_loss": 0.025093255564570427, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.346185106551275e-05, "grad_norm": 4.09000825881958, "learning_rate": 9.89826197541331e-07, "loss": 0.3997, "mean_token_accuracy": 0.8721911311149597, "num_tokens": 89230159.0, "step": 2336 }, { "epoch": 0.29729042106602216, "ewc_loss": 0.025087326765060425, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.340255201095715e-05, "grad_norm": 4.147958755493164, "learning_rate": 9.902501059771089e-07, "loss": 0.4633, "mean_token_accuracy": 0.8519967794418335, "num_tokens": 89264648.0, "step": 2337 }, { "epoch": 0.29741763134461263, "ewc_loss": 0.02514929324388504, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.40222162171267e-05, "grad_norm": 4.089192867279053, "learning_rate": 9.906740144128867e-07, "loss": 0.4625, "mean_token_accuracy": 0.8515468835830688, "num_tokens": 89302505.0, "step": 2338 }, { "epoch": 0.29754484162320316, "ewc_loss": 0.025109075009822845, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.362003766000271e-05, "grad_norm": 4.175958156585693, "learning_rate": 9.910979228486648e-07, "loss": 0.4211, "mean_token_accuracy": 0.8655378222465515, "num_tokens": 89338079.0, "step": 2339 }, { "epoch": 0.2976720519017937, "ewc_loss": 0.02518444135785103, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.437371045351028e-05, "grad_norm": 4.118703365325928, "learning_rate": 9.915218312844426e-07, "loss": 0.4859, "mean_token_accuracy": 0.8510646820068359, "num_tokens": 89376717.0, "step": 2340 }, { "epoch": 0.29779926218038416, "ewc_loss": 0.025117583572864532, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.370514453621581e-05, "grad_norm": 4.070319175720215, "learning_rate": 9.919457397202205e-07, "loss": 0.4764, "mean_token_accuracy": 0.8488771319389343, "num_tokens": 89416164.0, "step": 2341 }, { "epoch": 0.2979264724589747, "ewc_loss": 0.025153879076242447, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.40680765779689e-05, "grad_norm": 4.23361873626709, "learning_rate": 9.923696481559983e-07, "loss": 0.432, "mean_token_accuracy": 0.8638627529144287, "num_tokens": 89446491.0, "step": 2342 }, { "epoch": 0.2980536827375652, "ewc_loss": 0.025243308395147324, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.496237908024341e-05, "grad_norm": 4.120471477508545, "learning_rate": 9.927935565917761e-07, "loss": 0.4787, "mean_token_accuracy": 0.8456070423126221, "num_tokens": 89483637.0, "step": 2343 }, { "epoch": 0.2981808930161557, "ewc_loss": 0.02515256777405739, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.405497985426337e-05, "grad_norm": 4.0542378425598145, "learning_rate": 9.93217465027554e-07, "loss": 0.4712, "mean_token_accuracy": 0.8549526929855347, "num_tokens": 89526532.0, "step": 2344 }, { "epoch": 0.2983081032947462, "ewc_loss": 0.025183072313666344, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.4360024377238e-05, "grad_norm": 4.2375335693359375, "learning_rate": 9.936413734633318e-07, "loss": 0.5331, "mean_token_accuracy": 0.8371931314468384, "num_tokens": 89558353.0, "step": 2345 }, { "epoch": 0.29843531357333675, "ewc_loss": 0.02530798502266407, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.560914622852579e-05, "grad_norm": 4.062592506408691, "learning_rate": 9.940652818991097e-07, "loss": 0.4249, "mean_token_accuracy": 0.8648513555526733, "num_tokens": 89598727.0, "step": 2346 }, { "epoch": 0.2985625238519272, "ewc_loss": 0.02513088285923004, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.383812721353024e-05, "grad_norm": 4.100413799285889, "learning_rate": 9.944891903348877e-07, "loss": 0.4917, "mean_token_accuracy": 0.8445725440979004, "num_tokens": 89640093.0, "step": 2347 }, { "epoch": 0.29868973413051775, "ewc_loss": 0.02526703104376793, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.519960440229625e-05, "grad_norm": 4.043870449066162, "learning_rate": 9.949130987706656e-07, "loss": 0.4377, "mean_token_accuracy": 0.8598123788833618, "num_tokens": 89681401.0, "step": 2348 }, { "epoch": 0.2988169444091083, "ewc_loss": 0.025200383737683296, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.453313396079466e-05, "grad_norm": 4.095435619354248, "learning_rate": 9.953370072064432e-07, "loss": 0.4313, "mean_token_accuracy": 0.8620473742485046, "num_tokens": 89720366.0, "step": 2349 }, { "epoch": 0.29894415468769875, "ewc_loss": 0.0252652820199728, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.518211300019175e-05, "grad_norm": 4.030455112457275, "learning_rate": 9.957609156422213e-07, "loss": 0.508, "mean_token_accuracy": 0.8397880792617798, "num_tokens": 89762234.0, "step": 2350 }, { "epoch": 0.2990713649662893, "ewc_loss": 0.025227654725313187, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.480585140408948e-05, "grad_norm": 4.236144065856934, "learning_rate": 9.961848240779991e-07, "loss": 0.4532, "mean_token_accuracy": 0.8567315340042114, "num_tokens": 89792208.0, "step": 2351 }, { "epoch": 0.2991985752448798, "ewc_loss": 0.025406384840607643, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.659314673626795e-05, "grad_norm": 4.049780368804932, "learning_rate": 9.96608732513777e-07, "loss": 0.4324, "mean_token_accuracy": 0.8666352033615112, "num_tokens": 89833246.0, "step": 2352 }, { "epoch": 0.2993257855234703, "ewc_loss": 0.025205031037330627, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.457960550207645e-05, "grad_norm": 4.066842555999756, "learning_rate": 9.970326409495548e-07, "loss": 0.4889, "mean_token_accuracy": 0.844119668006897, "num_tokens": 89873271.0, "step": 2353 }, { "epoch": 0.2994529958020608, "ewc_loss": 0.025353748351335526, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.606677485862747e-05, "grad_norm": 4.09638786315918, "learning_rate": 9.974565493853327e-07, "loss": 0.5044, "mean_token_accuracy": 0.8407372236251831, "num_tokens": 89913069.0, "step": 2354 }, { "epoch": 0.29958020608065133, "ewc_loss": 0.02529917284846306, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.552102710586041e-05, "grad_norm": 4.163989067077637, "learning_rate": 9.978804578211107e-07, "loss": 0.487, "mean_token_accuracy": 0.8456655740737915, "num_tokens": 89948915.0, "step": 2355 }, { "epoch": 0.2997074163592418, "ewc_loss": 0.02559521608054638, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.604005026631057e-05, "grad_norm": 4.221380233764648, "learning_rate": 9.983043662568886e-07, "loss": 0.4843, "mean_token_accuracy": 0.849219560623169, "num_tokens": 89982123.0, "step": 2356 }, { "epoch": 0.29983462663783234, "ewc_loss": 0.025608859956264496, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.617649629944935e-05, "grad_norm": 4.103397369384766, "learning_rate": 9.987282746926662e-07, "loss": 0.4359, "mean_token_accuracy": 0.8617812395095825, "num_tokens": 90018259.0, "step": 2357 }, { "epoch": 0.29996183691642286, "ewc_loss": 0.025512153282761574, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.520941966911778e-05, "grad_norm": 4.070160388946533, "learning_rate": 9.991521831284443e-07, "loss": 0.4554, "mean_token_accuracy": 0.8565411567687988, "num_tokens": 90059041.0, "step": 2358 }, { "epoch": 0.30008904719501334, "ewc_loss": 0.0255759097635746, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.584699000697583e-05, "grad_norm": 4.137429714202881, "learning_rate": 9.995760915642221e-07, "loss": 0.4051, "mean_token_accuracy": 0.8720479011535645, "num_tokens": 90089463.0, "step": 2359 }, { "epoch": 0.30021625747360386, "ewc_loss": 0.02561839297413826, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.627181134419516e-05, "grad_norm": 4.1236701011657715, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8627259731292725, "num_tokens": 90125691.0, "step": 2360 }, { "epoch": 0.3003434677521944, "ewc_loss": 0.02556486614048481, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.573655552230775e-05, "grad_norm": 4.0942888259887695, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8529907464981079, "num_tokens": 90161778.0, "step": 2361 }, { "epoch": 0.30047067803078487, "ewc_loss": 0.025595981627702713, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.60477045737207e-05, "grad_norm": 4.124085903167725, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8402907848358154, "num_tokens": 90202243.0, "step": 2362 }, { "epoch": 0.3005978883093754, "ewc_loss": 0.025596681982278824, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.605471859686077e-05, "grad_norm": 4.08497953414917, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8459972143173218, "num_tokens": 90248924.0, "step": 2363 }, { "epoch": 0.3007250985879659, "ewc_loss": 0.0253300704061985, "ewc_loss_diag": 1.5735626220703125e-05, "ewc_loss_parallel": 9.583000064594671e-05, "grad_norm": 4.227000713348389, "learning_rate": 1e-06, "loss": 0.5711, "mean_token_accuracy": 0.82508385181427, "num_tokens": 90281988.0, "step": 2364 }, { "epoch": 0.3008523088665564, "ewc_loss": 0.02566688135266304, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.675671753939241e-05, "grad_norm": 4.1451005935668945, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8494484424591064, "num_tokens": 90317050.0, "step": 2365 }, { "epoch": 0.3009795191451469, "ewc_loss": 0.025586705654859543, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.595494339009747e-05, "grad_norm": 4.087472915649414, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8620538115501404, "num_tokens": 90358064.0, "step": 2366 }, { "epoch": 0.30110672942373745, "ewc_loss": 0.02561257965862751, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.621369099477306e-05, "grad_norm": 4.106276988983154, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8428560495376587, "num_tokens": 90400692.0, "step": 2367 }, { "epoch": 0.3012339397023279, "ewc_loss": 0.02560923621058464, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.61802652454935e-05, "grad_norm": 4.173049449920654, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8460912704467773, "num_tokens": 90432207.0, "step": 2368 }, { "epoch": 0.30136114998091845, "ewc_loss": 0.025668609887361526, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.677399066276848e-05, "grad_norm": 4.204705238342285, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8512800931930542, "num_tokens": 90466081.0, "step": 2369 }, { "epoch": 0.301488360259509, "ewc_loss": 0.025651179254055023, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.65996878221631e-05, "grad_norm": 4.108244895935059, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8610305786132812, "num_tokens": 90501963.0, "step": 2370 }, { "epoch": 0.30161557053809945, "ewc_loss": 0.0255971010774374, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.605890227248892e-05, "grad_norm": 4.187772274017334, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8649648427963257, "num_tokens": 90540651.0, "step": 2371 }, { "epoch": 0.30174278081669, "ewc_loss": 0.025690171867609024, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.698962094262242e-05, "grad_norm": 4.1528754234313965, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8459237217903137, "num_tokens": 90585105.0, "step": 2372 }, { "epoch": 0.3018699910952805, "ewc_loss": 0.025611665099859238, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.620453784009442e-05, "grad_norm": 4.0902628898620605, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8504039645195007, "num_tokens": 90626773.0, "step": 2373 }, { "epoch": 0.301997201373871, "ewc_loss": 0.025615185499191284, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.623975347494707e-05, "grad_norm": 4.145263671875, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8692338466644287, "num_tokens": 90665974.0, "step": 2374 }, { "epoch": 0.3021244116524615, "ewc_loss": 0.025637276470661163, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.646064427215606e-05, "grad_norm": 4.150193691253662, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8486805558204651, "num_tokens": 90706801.0, "step": 2375 }, { "epoch": 0.30225162193105204, "ewc_loss": 0.025613484904170036, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.62227422860451e-05, "grad_norm": 4.153785228729248, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8501378297805786, "num_tokens": 90743635.0, "step": 2376 }, { "epoch": 0.3023788322096425, "ewc_loss": 0.025616507977247238, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.625298116588965e-05, "grad_norm": 4.21724271774292, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8539657592773438, "num_tokens": 90782797.0, "step": 2377 }, { "epoch": 0.30250604248823304, "ewc_loss": 0.025644509121775627, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.653298184275627e-05, "grad_norm": 8.313677787780762, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8468767404556274, "num_tokens": 90822661.0, "step": 2378 }, { "epoch": 0.30263325276682357, "ewc_loss": 0.02869250625371933, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 0.00012701295781880617, "grad_norm": 4.880659580230713, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8355873823165894, "num_tokens": 90857804.0, "step": 2379 }, { "epoch": 0.3027604630454141, "ewc_loss": 0.025114767253398895, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.123556083068252e-05, "grad_norm": 3.8336551189422607, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8512639403343201, "num_tokens": 90895778.0, "step": 2380 }, { "epoch": 0.30288767332400457, "ewc_loss": 0.025896646082401276, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.905436309054494e-05, "grad_norm": 4.392498970031738, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8597347140312195, "num_tokens": 90934463.0, "step": 2381 }, { "epoch": 0.3030148836025951, "ewc_loss": 0.026162225753068924, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 0.00010171013855142519, "grad_norm": 4.156024932861328, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8575291037559509, "num_tokens": 90967888.0, "step": 2382 }, { "epoch": 0.3031420938811856, "ewc_loss": 0.025584347546100616, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.593136928742751e-05, "grad_norm": 4.187617778778076, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8609597682952881, "num_tokens": 91005200.0, "step": 2383 }, { "epoch": 0.3032693041597761, "ewc_loss": 0.02585788443684578, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.866673644864932e-05, "grad_norm": 4.233353614807129, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8370751738548279, "num_tokens": 91037840.0, "step": 2384 }, { "epoch": 0.3033965144383666, "ewc_loss": 0.02571573108434677, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.724518895382062e-05, "grad_norm": 4.216352939605713, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8405246138572693, "num_tokens": 91073336.0, "step": 2385 }, { "epoch": 0.30352372471695716, "ewc_loss": 0.025725044310092926, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.733832848723978e-05, "grad_norm": 4.219781875610352, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.855231761932373, "num_tokens": 91107001.0, "step": 2386 }, { "epoch": 0.30365093499554763, "ewc_loss": 0.025703884661197662, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.712673636386171e-05, "grad_norm": 4.178910732269287, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8606762290000916, "num_tokens": 91143937.0, "step": 2387 }, { "epoch": 0.30377814527413816, "ewc_loss": 0.025659950450062752, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.668739949120209e-05, "grad_norm": 4.144084930419922, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8444550037384033, "num_tokens": 91185391.0, "step": 2388 }, { "epoch": 0.3039053555527287, "ewc_loss": 0.025644179433584213, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.652968583395705e-05, "grad_norm": 4.133864402770996, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8526419997215271, "num_tokens": 91227653.0, "step": 2389 }, { "epoch": 0.30403256583131916, "ewc_loss": 0.025656411424279213, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.665200195740908e-05, "grad_norm": 4.139951229095459, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8733060359954834, "num_tokens": 91263610.0, "step": 2390 }, { "epoch": 0.3041597761099097, "ewc_loss": 0.025623928755521774, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.632717410568148e-05, "grad_norm": 4.2212934494018555, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8479732871055603, "num_tokens": 91296880.0, "step": 2391 }, { "epoch": 0.3042869863885002, "ewc_loss": 0.025683099403977394, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.691888408269733e-05, "grad_norm": 4.150539875030518, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8422932624816895, "num_tokens": 91336242.0, "step": 2392 }, { "epoch": 0.3044141966670907, "ewc_loss": 0.025607813149690628, "ewc_loss_diag": 1.5974044799804688e-05, "ewc_loss_parallel": 9.616601164452732e-05, "grad_norm": 4.118220806121826, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8610794544219971, "num_tokens": 91376527.0, "step": 2393 }, { "epoch": 0.3045414069456812, "ewc_loss": 0.02575099468231201, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.637712355470285e-05, "grad_norm": 4.276878833770752, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8445507287979126, "num_tokens": 91408415.0, "step": 2394 }, { "epoch": 0.30466861722427174, "ewc_loss": 0.025832120329141617, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.718840010464191e-05, "grad_norm": 8.370782852172852, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8552621603012085, "num_tokens": 91449189.0, "step": 2395 }, { "epoch": 0.3047958275028622, "ewc_loss": 0.028809618204832077, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 0.00012696337944362313, "grad_norm": 4.804562091827393, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8499942421913147, "num_tokens": 91488599.0, "step": 2396 }, { "epoch": 0.30492303778145274, "ewc_loss": 0.025331085547804832, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 9.095733548747376e-05, "grad_norm": 3.795652389526367, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8582010865211487, "num_tokens": 91525138.0, "step": 2397 }, { "epoch": 0.30505024806004327, "ewc_loss": 0.0260643120855093, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.951030369848013e-05, "grad_norm": 4.410558223724365, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8557436466217041, "num_tokens": 91567444.0, "step": 2398 }, { "epoch": 0.30517745833863374, "ewc_loss": 0.026310840621590614, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 0.00010197559458902106, "grad_norm": 4.197188377380371, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8508044481277466, "num_tokens": 91602926.0, "step": 2399 }, { "epoch": 0.3053046686172243, "ewc_loss": 0.025864223018288612, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 9.62887133937329e-05, "grad_norm": 4.237827301025391, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.866219162940979, "num_tokens": 91635901.0, "step": 2400 }, { "epoch": 0.3054318788958148, "ewc_loss": 0.02600393258035183, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.890651563182473e-05, "grad_norm": 4.141830921173096, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8566744327545166, "num_tokens": 91679542.0, "step": 2401 }, { "epoch": 0.3055590891744053, "ewc_loss": 0.025783199816942215, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.669918654253706e-05, "grad_norm": 4.140209197998047, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8478742837905884, "num_tokens": 91723022.0, "step": 2402 }, { "epoch": 0.3056862994529958, "ewc_loss": 0.02591269090771675, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.79940959950909e-05, "grad_norm": 4.210606575012207, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8502818942070007, "num_tokens": 91760664.0, "step": 2403 }, { "epoch": 0.30581350973158633, "ewc_loss": 0.026147441938519478, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.790019976207986e-05, "grad_norm": 4.136958599090576, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8496520519256592, "num_tokens": 91805570.0, "step": 2404 }, { "epoch": 0.3059407200101768, "ewc_loss": 0.02583005093038082, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.716769272927195e-05, "grad_norm": 4.197501182556152, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8463317155838013, "num_tokens": 91843342.0, "step": 2405 }, { "epoch": 0.30606793028876733, "ewc_loss": 0.025863423943519592, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.75014190771617e-05, "grad_norm": 4.134023666381836, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8451522588729858, "num_tokens": 91883872.0, "step": 2406 }, { "epoch": 0.30619514056735786, "ewc_loss": 0.02579202875494957, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.678746573626995e-05, "grad_norm": 4.131733417510986, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8522825241088867, "num_tokens": 91923174.0, "step": 2407 }, { "epoch": 0.30632235084594833, "ewc_loss": 0.0258294977247715, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.71621775534004e-05, "grad_norm": 4.2010345458984375, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8526229858398438, "num_tokens": 91963462.0, "step": 2408 }, { "epoch": 0.30644956112453886, "ewc_loss": 0.02582230418920517, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.709021833259612e-05, "grad_norm": 4.150003910064697, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8559638857841492, "num_tokens": 92000907.0, "step": 2409 }, { "epoch": 0.3065767714031294, "ewc_loss": 0.025776244699954987, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.662963566370308e-05, "grad_norm": 4.181177616119385, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8561623096466064, "num_tokens": 92036110.0, "step": 2410 }, { "epoch": 0.30670398168171986, "ewc_loss": 0.026054291054606438, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.696869528852403e-05, "grad_norm": 4.122422218322754, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.850166916847229, "num_tokens": 92080647.0, "step": 2411 }, { "epoch": 0.3068311919603104, "ewc_loss": 0.026010725647211075, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.653302549850196e-05, "grad_norm": 4.188446998596191, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8578440546989441, "num_tokens": 92115407.0, "step": 2412 }, { "epoch": 0.3069584022389009, "ewc_loss": 0.0260865930467844, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.72917114268057e-05, "grad_norm": 4.230255126953125, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8537550568580627, "num_tokens": 92149660.0, "step": 2413 }, { "epoch": 0.3070856125174914, "ewc_loss": 0.02593356743454933, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 9.698216308606789e-05, "grad_norm": 4.142776966094971, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8565903902053833, "num_tokens": 92191581.0, "step": 2414 }, { "epoch": 0.3072128227960819, "ewc_loss": 0.025764644145965576, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.651363507146016e-05, "grad_norm": 4.163639545440674, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8484653234481812, "num_tokens": 92230435.0, "step": 2415 }, { "epoch": 0.30734003307467245, "ewc_loss": 0.02580944076180458, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.696159395389259e-05, "grad_norm": 4.156060695648193, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8414514064788818, "num_tokens": 92269545.0, "step": 2416 }, { "epoch": 0.3074672433532629, "ewc_loss": 0.02578960731625557, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.676325134932995e-05, "grad_norm": 4.118509292602539, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8608734011650085, "num_tokens": 92307447.0, "step": 2417 }, { "epoch": 0.30759445363185345, "ewc_loss": 0.02592112496495247, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 9.685773693490773e-05, "grad_norm": 4.106673717498779, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8527215123176575, "num_tokens": 92349441.0, "step": 2418 }, { "epoch": 0.307721663910444, "ewc_loss": 0.025817133486270905, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.703852992970496e-05, "grad_norm": 4.210585117340088, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8581430912017822, "num_tokens": 92380405.0, "step": 2419 }, { "epoch": 0.30784887418903445, "ewc_loss": 0.025862116366624832, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.748834418132901e-05, "grad_norm": 4.223201751708984, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8520795106887817, "num_tokens": 92419596.0, "step": 2420 }, { "epoch": 0.307976084467625, "ewc_loss": 0.025820579379796982, "ewc_loss_diag": 1.609325408935547e-05, "ewc_loss_parallel": 9.707298886496574e-05, "grad_norm": 4.144436359405518, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8468119502067566, "num_tokens": 92454590.0, "step": 2421 }, { "epoch": 0.3081032947462155, "ewc_loss": 0.025940746068954468, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 9.70539404079318e-05, "grad_norm": 4.169195652008057, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8507005572319031, "num_tokens": 92491640.0, "step": 2422 }, { "epoch": 0.308230505024806, "ewc_loss": 0.025970477610826492, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 9.735125786392018e-05, "grad_norm": 4.162181377410889, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8455212712287903, "num_tokens": 92528682.0, "step": 2423 }, { "epoch": 0.3083577153033965, "ewc_loss": 0.02592797577381134, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 9.69262546277605e-05, "grad_norm": 4.124845504760742, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.860181450843811, "num_tokens": 92566368.0, "step": 2424 }, { "epoch": 0.30848492558198704, "ewc_loss": 0.025964990258216858, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 9.729639714350924e-05, "grad_norm": 4.284472942352295, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8316830396652222, "num_tokens": 92601380.0, "step": 2425 }, { "epoch": 0.3086121358605775, "ewc_loss": 0.026173872873187065, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.816450619837269e-05, "grad_norm": 4.129263877868652, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8691802620887756, "num_tokens": 92639724.0, "step": 2426 }, { "epoch": 0.30873934613916804, "ewc_loss": 0.025914553552865982, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 9.679201320977882e-05, "grad_norm": 4.148994445800781, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.858844518661499, "num_tokens": 92677450.0, "step": 2427 }, { "epoch": 0.30886655641775856, "ewc_loss": 0.02613472379744053, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.777301602298394e-05, "grad_norm": 4.102413177490234, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.862993597984314, "num_tokens": 92714681.0, "step": 2428 }, { "epoch": 0.3089937666963491, "ewc_loss": 0.02596909925341606, "ewc_loss_diag": 1.621246337890625e-05, "ewc_loss_parallel": 9.73374699242413e-05, "grad_norm": 4.153834342956543, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8492132425308228, "num_tokens": 92749925.0, "step": 2429 }, { "epoch": 0.30912097697493957, "ewc_loss": 0.026130665093660355, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.773242345545441e-05, "grad_norm": 4.107837200164795, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8576371669769287, "num_tokens": 92791720.0, "step": 2430 }, { "epoch": 0.3092481872535301, "ewc_loss": 0.026110101491212845, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.752680489327759e-05, "grad_norm": 4.214538097381592, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8463466167449951, "num_tokens": 92826179.0, "step": 2431 }, { "epoch": 0.3093753975321206, "ewc_loss": 0.026182275265455246, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.82485362328589e-05, "grad_norm": 4.047589302062988, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8528424501419067, "num_tokens": 92872751.0, "step": 2432 }, { "epoch": 0.3095026078107111, "ewc_loss": 0.026098323985934258, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.740902169141918e-05, "grad_norm": 4.247148036956787, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8485697507858276, "num_tokens": 92907470.0, "step": 2433 }, { "epoch": 0.3096298180893016, "ewc_loss": 0.02624833583831787, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.890912770060822e-05, "grad_norm": 4.157575607299805, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8581782579421997, "num_tokens": 92943321.0, "step": 2434 }, { "epoch": 0.30975702836789215, "ewc_loss": 0.026105716824531555, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.74829526967369e-05, "grad_norm": 4.1814985275268555, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8555837869644165, "num_tokens": 92982136.0, "step": 2435 }, { "epoch": 0.3098842386464826, "ewc_loss": 0.026173114776611328, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.81569173745811e-05, "grad_norm": 4.294610023498535, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8437720537185669, "num_tokens": 93015974.0, "step": 2436 }, { "epoch": 0.31001144892507315, "ewc_loss": 0.026320528239011765, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 9.841036808211356e-05, "grad_norm": 4.131105422973633, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.856946587562561, "num_tokens": 93054010.0, "step": 2437 }, { "epoch": 0.3101386592036637, "ewc_loss": 0.02621551975607872, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 9.736028005136177e-05, "grad_norm": 4.139883041381836, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8615112900733948, "num_tokens": 93090226.0, "step": 2438 }, { "epoch": 0.31026586948225415, "ewc_loss": 0.026296552270650864, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 9.817059617489576e-05, "grad_norm": 4.144538402557373, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.85121089220047, "num_tokens": 93130242.0, "step": 2439 }, { "epoch": 0.3103930797608447, "ewc_loss": 0.026265457272529602, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 9.785965085029602e-05, "grad_norm": 4.1425676345825195, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8487541675567627, "num_tokens": 93170990.0, "step": 2440 }, { "epoch": 0.3105202900394352, "ewc_loss": 0.02628443017601967, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 9.804937872104347e-05, "grad_norm": 4.210679054260254, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8460254073143005, "num_tokens": 93205478.0, "step": 2441 }, { "epoch": 0.3106475003180257, "ewc_loss": 0.026333365589380264, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 9.85387378023006e-05, "grad_norm": 4.190593719482422, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8453761339187622, "num_tokens": 93240952.0, "step": 2442 }, { "epoch": 0.3107747105966162, "ewc_loss": 0.026439081877470016, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.837518155109137e-05, "grad_norm": 8.344544410705566, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8434686660766602, "num_tokens": 93278882.0, "step": 2443 }, { "epoch": 0.31090192087520674, "ewc_loss": 0.02959437295794487, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 0.000131148801301606, "grad_norm": 4.836230278015137, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8454958200454712, "num_tokens": 93324573.0, "step": 2444 }, { "epoch": 0.3110291311537972, "ewc_loss": 0.02594039775431156, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.338834934169427e-05, "grad_norm": 3.821136236190796, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8669863939285278, "num_tokens": 93361049.0, "step": 2445 }, { "epoch": 0.31115634143238774, "ewc_loss": 0.02671150490641594, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 0.00010232012573396787, "grad_norm": 4.43397855758667, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8498426675796509, "num_tokens": 93401748.0, "step": 2446 }, { "epoch": 0.31128355171097827, "ewc_loss": 0.026936836540699005, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 0.00010457343887537718, "grad_norm": 4.202372074127197, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8642198443412781, "num_tokens": 93434857.0, "step": 2447 }, { "epoch": 0.31141076198956874, "ewc_loss": 0.026337169110774994, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 9.857676195679232e-05, "grad_norm": 4.2042741775512695, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8520123362541199, "num_tokens": 93474681.0, "step": 2448 }, { "epoch": 0.31153797226815927, "ewc_loss": 0.026608126237988472, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 0.00010128634312422946, "grad_norm": 4.194501876831055, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8519704937934875, "num_tokens": 93510627.0, "step": 2449 }, { "epoch": 0.3116651825467498, "ewc_loss": 0.026294145733118057, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.936723654391244e-05, "grad_norm": 4.173849582672119, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.84378981590271, "num_tokens": 93551598.0, "step": 2450 }, { "epoch": 0.31179239282534027, "ewc_loss": 0.026474379003047943, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 9.994886931963265e-05, "grad_norm": 4.231144428253174, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8459998965263367, "num_tokens": 93586622.0, "step": 2451 }, { "epoch": 0.3119196031039308, "ewc_loss": 0.026468627154827118, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 9.989135287469253e-05, "grad_norm": 4.212247371673584, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8429580926895142, "num_tokens": 93626496.0, "step": 2452 }, { "epoch": 0.3120468133825213, "ewc_loss": 0.026273861527442932, "ewc_loss_diag": 1.633167266845703e-05, "ewc_loss_parallel": 9.9164382845629e-05, "grad_norm": 4.178114891052246, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8544328212738037, "num_tokens": 93665641.0, "step": 2453 }, { "epoch": 0.3121740236611118, "ewc_loss": 0.026392027735710144, "ewc_loss_diag": 1.6450881958007812e-05, "ewc_loss_parallel": 9.912536916090176e-05, "grad_norm": 4.209886074066162, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8588975667953491, "num_tokens": 93702135.0, "step": 2454 }, { "epoch": 0.3123012339397023, "ewc_loss": 0.02652520127594471, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.923638572217897e-05, "grad_norm": 4.158362865447998, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8593727946281433, "num_tokens": 93738226.0, "step": 2455 }, { "epoch": 0.31242844421829286, "ewc_loss": 0.026486963033676147, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.885400504572317e-05, "grad_norm": 4.137189865112305, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8549920320510864, "num_tokens": 93780273.0, "step": 2456 }, { "epoch": 0.31255565449688333, "ewc_loss": 0.026500606909394264, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.899044380290434e-05, "grad_norm": 4.154375076293945, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8403152823448181, "num_tokens": 93821977.0, "step": 2457 }, { "epoch": 0.31268286477547386, "ewc_loss": 0.02650308609008789, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.901524026645347e-05, "grad_norm": 4.200047969818115, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8622933626174927, "num_tokens": 93856084.0, "step": 2458 }, { "epoch": 0.3128100750540644, "ewc_loss": 0.026520462706685066, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.91890046861954e-05, "grad_norm": 4.224598407745361, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8560523390769958, "num_tokens": 93893491.0, "step": 2459 }, { "epoch": 0.31293728533265486, "ewc_loss": 0.026503030210733414, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.90146872936748e-05, "grad_norm": 4.1122355461120605, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8416391611099243, "num_tokens": 93940240.0, "step": 2460 }, { "epoch": 0.3130644956112454, "ewc_loss": 0.026448845863342285, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.847282490227371e-05, "grad_norm": 4.243937969207764, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8462193608283997, "num_tokens": 93974504.0, "step": 2461 }, { "epoch": 0.3131917058898359, "ewc_loss": 0.026682641357183456, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.959008457371965e-05, "grad_norm": 4.169020175933838, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8553146123886108, "num_tokens": 94009208.0, "step": 2462 }, { "epoch": 0.3133189161684264, "ewc_loss": 0.026580994948744774, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.857362601906061e-05, "grad_norm": 4.197202682495117, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8534096479415894, "num_tokens": 94049249.0, "step": 2463 }, { "epoch": 0.3134461264470169, "ewc_loss": 0.026646139100193977, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.92250643321313e-05, "grad_norm": 4.215585231781006, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.85500568151474, "num_tokens": 94085553.0, "step": 2464 }, { "epoch": 0.31357333672560744, "ewc_loss": 0.026626326143741608, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.902694000629708e-05, "grad_norm": 4.161114692687988, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.832453727722168, "num_tokens": 94127531.0, "step": 2465 }, { "epoch": 0.3137005470041979, "ewc_loss": 0.02647516131401062, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.87359817372635e-05, "grad_norm": 4.142772197723389, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8649011850357056, "num_tokens": 94171191.0, "step": 2466 }, { "epoch": 0.31382775728278844, "ewc_loss": 0.02651652693748474, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.914964903146029e-05, "grad_norm": 4.152400970458984, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.85807204246521, "num_tokens": 94212090.0, "step": 2467 }, { "epoch": 0.313954967561379, "ewc_loss": 0.026494458317756653, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.89289692370221e-05, "grad_norm": 4.160200595855713, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.871531069278717, "num_tokens": 94246835.0, "step": 2468 }, { "epoch": 0.31408217783996945, "ewc_loss": 0.026497280225157738, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.895717812469229e-05, "grad_norm": 4.167149066925049, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8517837524414062, "num_tokens": 94284581.0, "step": 2469 }, { "epoch": 0.31420938811856, "ewc_loss": 0.026511412113904953, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.909849177347496e-05, "grad_norm": 4.164881706237793, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8632267117500305, "num_tokens": 94323050.0, "step": 2470 }, { "epoch": 0.3143365983971505, "ewc_loss": 0.026509340852499008, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.9077784398105e-05, "grad_norm": 4.160397529602051, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8482868075370789, "num_tokens": 94364702.0, "step": 2471 }, { "epoch": 0.314463808675741, "ewc_loss": 0.02651563286781311, "ewc_loss_diag": 1.6570091247558594e-05, "ewc_loss_parallel": 9.914069960359484e-05, "grad_norm": 4.166187763214111, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8601422905921936, "num_tokens": 94404735.0, "step": 2472 }, { "epoch": 0.3145910189543315, "ewc_loss": 0.026630770415067673, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.90713742794469e-05, "grad_norm": 4.147290229797363, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8393093347549438, "num_tokens": 94444683.0, "step": 2473 }, { "epoch": 0.31471822923292203, "ewc_loss": 0.026632102206349373, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.908468928188086e-05, "grad_norm": 4.209405899047852, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8550723791122437, "num_tokens": 94484392.0, "step": 2474 }, { "epoch": 0.3148454395115125, "ewc_loss": 0.02667616307735443, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.952530672308058e-05, "grad_norm": 4.197335720062256, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8604710102081299, "num_tokens": 94524860.0, "step": 2475 }, { "epoch": 0.31497264979010303, "ewc_loss": 0.02661031112074852, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.88667961792089e-05, "grad_norm": 4.1706719398498535, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8495175838470459, "num_tokens": 94565076.0, "step": 2476 }, { "epoch": 0.31509986006869356, "ewc_loss": 0.026642989367246628, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.91935667116195e-05, "grad_norm": 4.208554267883301, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.830228328704834, "num_tokens": 94606191.0, "step": 2477 }, { "epoch": 0.31522707034728403, "ewc_loss": 0.026654403656721115, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.930769738275558e-05, "grad_norm": 4.129532814025879, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8469279408454895, "num_tokens": 94648021.0, "step": 2478 }, { "epoch": 0.31535428062587456, "ewc_loss": 0.026615455746650696, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.891823719954118e-05, "grad_norm": 4.185470104217529, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8643323183059692, "num_tokens": 94686305.0, "step": 2479 }, { "epoch": 0.3154814909044651, "ewc_loss": 0.0266842283308506, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.960596798919141e-05, "grad_norm": 4.158693790435791, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8512190580368042, "num_tokens": 94728606.0, "step": 2480 }, { "epoch": 0.3156087011830556, "ewc_loss": 0.02665811963379383, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.934487025020644e-05, "grad_norm": 4.150594234466553, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8558921217918396, "num_tokens": 94770162.0, "step": 2481 }, { "epoch": 0.3157359114616461, "ewc_loss": 0.026660755276679993, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.937123104464263e-05, "grad_norm": 4.165480613708496, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.85940021276474, "num_tokens": 94807735.0, "step": 2482 }, { "epoch": 0.3158631217402366, "ewc_loss": 0.026692938059568405, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.969306120183319e-05, "grad_norm": 4.250875473022461, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8475596904754639, "num_tokens": 94840366.0, "step": 2483 }, { "epoch": 0.31599033201882715, "ewc_loss": 0.02672658860683441, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 0.00010002954513765872, "grad_norm": 4.2416205406188965, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8498440980911255, "num_tokens": 94874293.0, "step": 2484 }, { "epoch": 0.3161175422974176, "ewc_loss": 0.026712648570537567, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.989014506572857e-05, "grad_norm": 4.2219462394714355, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8525132536888123, "num_tokens": 94909347.0, "step": 2485 }, { "epoch": 0.31624475257600815, "ewc_loss": 0.02671128511428833, "ewc_loss_diag": 1.6689300537109375e-05, "ewc_loss_parallel": 9.987653174903244e-05, "grad_norm": 4.096705913543701, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8631972670555115, "num_tokens": 94955120.0, "step": 2486 }, { "epoch": 0.3163719628545987, "ewc_loss": 0.027755703777074814, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 9.933437831932679e-05, "grad_norm": 35.49360656738281, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8677805662155151, "num_tokens": 94992319.0, "step": 2487 }, { "epoch": 0.31649917313318915, "ewc_loss": 0.03594043850898743, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00019094737945124507, "grad_norm": 6.432849407196045, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8636829853057861, "num_tokens": 95023464.0, "step": 2488 }, { "epoch": 0.3166263834117797, "ewc_loss": 0.029403358697891235, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00012557656737044454, "grad_norm": 4.065103054046631, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8665884137153625, "num_tokens": 95055963.0, "step": 2489 }, { "epoch": 0.3167535936903702, "ewc_loss": 0.028726015239953995, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00011880312376888469, "grad_norm": 5.413209915161133, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8346090316772461, "num_tokens": 95089628.0, "step": 2490 }, { "epoch": 0.3168808039689607, "ewc_loss": 0.03201041370630264, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00015164710930548608, "grad_norm": 5.42216682434082, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8659540414810181, "num_tokens": 95128557.0, "step": 2491 }, { "epoch": 0.3170080142475512, "ewc_loss": 0.028700370341539383, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00011854666809085757, "grad_norm": 4.4320454597473145, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8417637944221497, "num_tokens": 95168324.0, "step": 2492 }, { "epoch": 0.31713522452614173, "ewc_loss": 0.028319835662841797, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00011474132043076679, "grad_norm": 4.884496688842773, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8335892558097839, "num_tokens": 95208139.0, "step": 2493 }, { "epoch": 0.3172624348047322, "ewc_loss": 0.0290516410022974, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00012205937673570588, "grad_norm": 4.612515926361084, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8481022119522095, "num_tokens": 95247337.0, "step": 2494 }, { "epoch": 0.31738964508332274, "ewc_loss": 0.02781715616583824, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010971452138619497, "grad_norm": 4.518820762634277, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8464388847351074, "num_tokens": 95289208.0, "step": 2495 }, { "epoch": 0.31751685536191326, "ewc_loss": 0.02788037806749344, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00011034673661924899, "grad_norm": 4.488284111022949, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8517478704452515, "num_tokens": 95329717.0, "step": 2496 }, { "epoch": 0.31764406564050374, "ewc_loss": 0.02768106572329998, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.0001083536262740381, "grad_norm": 4.4158430099487305, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8343268036842346, "num_tokens": 95371052.0, "step": 2497 }, { "epoch": 0.31777127591909426, "ewc_loss": 0.027405448257923126, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010559745714999735, "grad_norm": 4.363281726837158, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8727699518203735, "num_tokens": 95412486.0, "step": 2498 }, { "epoch": 0.3178984861976848, "ewc_loss": 0.027325350791215897, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010479647608008236, "grad_norm": 4.367466449737549, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8691823482513428, "num_tokens": 95446167.0, "step": 2499 }, { "epoch": 0.31802569647627527, "ewc_loss": 0.027223562821745872, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010377859871368855, "grad_norm": 4.3431220054626465, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8478915691375732, "num_tokens": 95485763.0, "step": 2500 }, { "epoch": 0.3181529067548658, "ewc_loss": 0.027094803750514984, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010249100159853697, "grad_norm": 4.305995941162109, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.843518853187561, "num_tokens": 95526407.0, "step": 2501 }, { "epoch": 0.3182801170334563, "ewc_loss": 0.02701460011303425, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010168897279072553, "grad_norm": 4.341972351074219, "learning_rate": 1e-06, "loss": 0.57, "mean_token_accuracy": 0.8212651610374451, "num_tokens": 95567479.0, "step": 2502 }, { "epoch": 0.3184073273120468, "ewc_loss": 0.027001896873116493, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010156193457078189, "grad_norm": 4.2696404457092285, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8599271178245544, "num_tokens": 95606588.0, "step": 2503 }, { "epoch": 0.3185345375906373, "ewc_loss": 0.02688252180814743, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010036817548098043, "grad_norm": 4.2482428550720215, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8574264049530029, "num_tokens": 95653293.0, "step": 2504 }, { "epoch": 0.31866174786922785, "ewc_loss": 0.026883039623498917, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010037337051471695, "grad_norm": 4.293246746063232, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8454294204711914, "num_tokens": 95692424.0, "step": 2505 }, { "epoch": 0.3187889581478183, "ewc_loss": 0.026862964034080505, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.0001001726122922264, "grad_norm": 4.198627948760986, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8616294860839844, "num_tokens": 95733679.0, "step": 2506 }, { "epoch": 0.31891616842640885, "ewc_loss": 0.026786815375089645, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 9.94111251202412e-05, "grad_norm": 4.265666484832764, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8488037586212158, "num_tokens": 95768211.0, "step": 2507 }, { "epoch": 0.3190433787049994, "ewc_loss": 0.026868697255849838, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010022995411418378, "grad_norm": 4.261810779571533, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8522340655326843, "num_tokens": 95805974.0, "step": 2508 }, { "epoch": 0.31917058898358985, "ewc_loss": 0.026793327182531357, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 9.947623038897291e-05, "grad_norm": 4.225698471069336, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8517248630523682, "num_tokens": 95845463.0, "step": 2509 }, { "epoch": 0.3192977992621804, "ewc_loss": 0.026791542768478394, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 9.945838974090293e-05, "grad_norm": 4.213467597961426, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8483085632324219, "num_tokens": 95888148.0, "step": 2510 }, { "epoch": 0.3194250095407709, "ewc_loss": 0.02682301588356495, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 9.977312583941966e-05, "grad_norm": 4.208086967468262, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8750714659690857, "num_tokens": 95925432.0, "step": 2511 }, { "epoch": 0.3195522198193614, "ewc_loss": 0.02680605836212635, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 9.96035523712635e-05, "grad_norm": 4.172995567321777, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.858489453792572, "num_tokens": 95970695.0, "step": 2512 }, { "epoch": 0.3196794300979519, "ewc_loss": 0.026806512847542763, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 9.960809256881475e-05, "grad_norm": 4.238844871520996, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8544164299964905, "num_tokens": 96006093.0, "step": 2513 }, { "epoch": 0.31980664037654244, "ewc_loss": 0.026848655194044113, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010002953058574349, "grad_norm": 4.179655075073242, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8530591726303101, "num_tokens": 96045945.0, "step": 2514 }, { "epoch": 0.3199338506551329, "ewc_loss": 0.026828264817595482, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 9.982561459764838e-05, "grad_norm": 4.234001636505127, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8707138895988464, "num_tokens": 96076748.0, "step": 2515 }, { "epoch": 0.32006106093372344, "ewc_loss": 0.026886243373155594, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010040541383204982, "grad_norm": 4.214648246765137, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8554093837738037, "num_tokens": 96116964.0, "step": 2516 }, { "epoch": 0.32018827121231397, "ewc_loss": 0.026881428435444832, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010035725426860154, "grad_norm": 4.258313179016113, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8560734391212463, "num_tokens": 96152073.0, "step": 2517 }, { "epoch": 0.32031548149090444, "ewc_loss": 0.026928424835205078, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010082721564685926, "grad_norm": 4.225000858306885, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8517146110534668, "num_tokens": 96191608.0, "step": 2518 }, { "epoch": 0.32044269176949497, "ewc_loss": 0.026874549686908722, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010028845281340182, "grad_norm": 4.189737796783447, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8706996440887451, "num_tokens": 96228124.0, "step": 2519 }, { "epoch": 0.3205699020480855, "ewc_loss": 0.026882659643888474, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010036955791292712, "grad_norm": 4.1780524253845215, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8652840852737427, "num_tokens": 96265870.0, "step": 2520 }, { "epoch": 0.32069711232667597, "ewc_loss": 0.026915498077869415, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010069793643197045, "grad_norm": 4.15868616104126, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8643646240234375, "num_tokens": 96309623.0, "step": 2521 }, { "epoch": 0.3208243226052665, "ewc_loss": 0.026895299553871155, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010049595584860072, "grad_norm": 4.209233283996582, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8477225303649902, "num_tokens": 96344036.0, "step": 2522 }, { "epoch": 0.320951532883857, "ewc_loss": 0.027047790586948395, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.00010080016363644972, "grad_norm": 4.206710338592529, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8651489019393921, "num_tokens": 96377627.0, "step": 2523 }, { "epoch": 0.3210787431624475, "ewc_loss": 0.02690395526587963, "ewc_loss_diag": 1.6808509826660156e-05, "ewc_loss_parallel": 0.00010058251791633666, "grad_norm": 4.212076663970947, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8547049760818481, "num_tokens": 96413568.0, "step": 2524 }, { "epoch": 0.32120595344103803, "ewc_loss": 0.02706831321120262, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.00010100541112478822, "grad_norm": 4.166780948638916, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8518263697624207, "num_tokens": 96454682.0, "step": 2525 }, { "epoch": 0.32133316371962856, "ewc_loss": 0.02706236019730568, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.00010094587196363136, "grad_norm": 4.260273456573486, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.843486487865448, "num_tokens": 96490456.0, "step": 2526 }, { "epoch": 0.32146037399821903, "ewc_loss": 0.02712235599756241, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.0001015458328765817, "grad_norm": 4.256442070007324, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8464694619178772, "num_tokens": 96526891.0, "step": 2527 }, { "epoch": 0.32158758427680956, "ewc_loss": 0.027091853320598602, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.00010124080290552229, "grad_norm": 4.123970031738281, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8490671515464783, "num_tokens": 96575226.0, "step": 2528 }, { "epoch": 0.3217147945554001, "ewc_loss": 0.027041777968406677, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.00010074004967464134, "grad_norm": 4.261054039001465, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8439397215843201, "num_tokens": 96606383.0, "step": 2529 }, { "epoch": 0.3218420048339906, "ewc_loss": 0.02718457765877247, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.00010216804366791621, "grad_norm": 4.256344795227051, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8568071722984314, "num_tokens": 96639826.0, "step": 2530 }, { "epoch": 0.3219692151125811, "ewc_loss": 0.027099018916487694, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.00010131245653610677, "grad_norm": 4.150833606719971, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8584983348846436, "num_tokens": 96679151.0, "step": 2531 }, { "epoch": 0.3220964253911716, "ewc_loss": 0.027104225009679794, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.00010136452328879386, "grad_norm": 4.193084716796875, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8603506684303284, "num_tokens": 96719365.0, "step": 2532 }, { "epoch": 0.32222363566976214, "ewc_loss": 0.027151253074407578, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.00010183480480918661, "grad_norm": 4.221134662628174, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.851064920425415, "num_tokens": 96758318.0, "step": 2533 }, { "epoch": 0.3223508459483526, "ewc_loss": 0.02715088427066803, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.0001018311086227186, "grad_norm": 4.259872913360596, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8486723899841309, "num_tokens": 96791159.0, "step": 2534 }, { "epoch": 0.32247805622694314, "ewc_loss": 0.027148108929395676, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.00010180335084442049, "grad_norm": 4.216245174407959, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8573448657989502, "num_tokens": 96825352.0, "step": 2535 }, { "epoch": 0.32260526650553367, "ewc_loss": 0.02715003862977028, "ewc_loss_diag": 1.6927719116210938e-05, "ewc_loss_parallel": 0.00010182266123592854, "grad_norm": 4.244357109069824, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8512849807739258, "num_tokens": 96859700.0, "step": 2536 }, { "epoch": 0.32273247678412414, "ewc_loss": 0.027294661849737167, "ewc_loss_diag": 1.704692840576172e-05, "ewc_loss_parallel": 0.00010204817954218015, "grad_norm": 4.189751625061035, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8520495295524597, "num_tokens": 96897077.0, "step": 2537 }, { "epoch": 0.3228596870627147, "ewc_loss": 0.02739449217915535, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 0.00010182576806982979, "grad_norm": 4.1781415939331055, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8496805429458618, "num_tokens": 96936936.0, "step": 2538 }, { "epoch": 0.3229868973413052, "ewc_loss": 0.027408599853515625, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 0.00010196685616392642, "grad_norm": 4.164628505706787, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8585034608840942, "num_tokens": 96975702.0, "step": 2539 }, { "epoch": 0.3231141076198957, "ewc_loss": 0.027418505400419235, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 0.0001020659037749283, "grad_norm": 4.2036356925964355, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8529121279716492, "num_tokens": 97010585.0, "step": 2540 }, { "epoch": 0.3232413178984862, "ewc_loss": 0.02744978666305542, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 0.0001023787262965925, "grad_norm": 4.208254814147949, "learning_rate": 1e-06, "loss": 0.3835, "mean_token_accuracy": 0.8771064877510071, "num_tokens": 97047210.0, "step": 2541 }, { "epoch": 0.32336852817707673, "ewc_loss": 0.027566228061914444, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.0001023224467644468, "grad_norm": 4.219735145568848, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8492130637168884, "num_tokens": 97084835.0, "step": 2542 }, { "epoch": 0.3234957384556672, "ewc_loss": 0.027433397248387337, "ewc_loss_diag": 1.71661376953125e-05, "ewc_loss_parallel": 0.00010221483535133302, "grad_norm": 4.17101526260376, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8511980772018433, "num_tokens": 97125829.0, "step": 2543 }, { "epoch": 0.32362294873425773, "ewc_loss": 0.02754727378487587, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010213289351668209, "grad_norm": 4.196869373321533, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8465235233306885, "num_tokens": 97162874.0, "step": 2544 }, { "epoch": 0.32375015901284826, "ewc_loss": 0.027555715292692184, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010221730917692184, "grad_norm": 4.224462985992432, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8565291166305542, "num_tokens": 97197559.0, "step": 2545 }, { "epoch": 0.32387736929143873, "ewc_loss": 0.027544351294636726, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010210367327090353, "grad_norm": 4.309848785400391, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8418101072311401, "num_tokens": 97227028.0, "step": 2546 }, { "epoch": 0.32400457957002926, "ewc_loss": 0.027604807168245316, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010270823258906603, "grad_norm": 4.143792629241943, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8498725295066833, "num_tokens": 97270209.0, "step": 2547 }, { "epoch": 0.3241317898486198, "ewc_loss": 0.02749740332365036, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010163418482989073, "grad_norm": 4.242676258087158, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8573317527770996, "num_tokens": 97309176.0, "step": 2548 }, { "epoch": 0.32425900012721026, "ewc_loss": 0.027600795030593872, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010266809840686619, "grad_norm": 4.309624195098877, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8352068066596985, "num_tokens": 97340464.0, "step": 2549 }, { "epoch": 0.3243862104058008, "ewc_loss": 0.027559218928217888, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010225234291283414, "grad_norm": 4.172671794891357, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8571814298629761, "num_tokens": 97378435.0, "step": 2550 }, { "epoch": 0.3245134206843913, "ewc_loss": 0.02751235105097294, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010178366210311651, "grad_norm": 4.199496746063232, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8495833277702332, "num_tokens": 97417886.0, "step": 2551 }, { "epoch": 0.3246406309629818, "ewc_loss": 0.02756020426750183, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010226219455944374, "grad_norm": 4.2315545082092285, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8573064804077148, "num_tokens": 97450552.0, "step": 2552 }, { "epoch": 0.3247678412415723, "ewc_loss": 0.027537405490875244, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010203420970356092, "grad_norm": 4.192802906036377, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8598756790161133, "num_tokens": 97488586.0, "step": 2553 }, { "epoch": 0.32489505152016285, "ewc_loss": 0.027530770748853683, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010196787479799241, "grad_norm": 4.209803104400635, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8488737344741821, "num_tokens": 97528893.0, "step": 2554 }, { "epoch": 0.3250222617987533, "ewc_loss": 0.027556639164686203, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010222653509117663, "grad_norm": 4.236339092254639, "learning_rate": 1e-06, "loss": 0.5369, "mean_token_accuracy": 0.8306723237037659, "num_tokens": 97568654.0, "step": 2555 }, { "epoch": 0.32514947207734385, "ewc_loss": 0.027559075504541397, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010225090227322653, "grad_norm": 4.222623348236084, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.858311653137207, "num_tokens": 97607068.0, "step": 2556 }, { "epoch": 0.3252766823559344, "ewc_loss": 0.027539949864149094, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010205966827925295, "grad_norm": 4.166692733764648, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8753328323364258, "num_tokens": 97647505.0, "step": 2557 }, { "epoch": 0.32540389263452485, "ewc_loss": 0.02754373475909233, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010209749598288909, "grad_norm": 4.184897422790527, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8476135730743408, "num_tokens": 97690505.0, "step": 2558 }, { "epoch": 0.3255311029131154, "ewc_loss": 0.027549324557185173, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010215340444119647, "grad_norm": 4.327690124511719, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8409607410430908, "num_tokens": 97723955.0, "step": 2559 }, { "epoch": 0.3256583131917059, "ewc_loss": 0.027599191293120384, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010265206947224215, "grad_norm": 4.3140482902526855, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8479852676391602, "num_tokens": 97763822.0, "step": 2560 }, { "epoch": 0.3257855234702964, "ewc_loss": 0.027534861117601395, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010200876567978412, "grad_norm": 4.159060478210449, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8496469259262085, "num_tokens": 97803827.0, "step": 2561 }, { "epoch": 0.3259127337488869, "ewc_loss": 0.027482062578201294, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010148077853955328, "grad_norm": 4.193052291870117, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.853164792060852, "num_tokens": 97841056.0, "step": 2562 }, { "epoch": 0.32603994402747744, "ewc_loss": 0.027542995288968086, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010209011088591069, "grad_norm": 4.14911413192749, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8701398372650146, "num_tokens": 97881714.0, "step": 2563 }, { "epoch": 0.3261671543060679, "ewc_loss": 0.027510665357112885, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010176681826123968, "grad_norm": 4.214808464050293, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8505364060401917, "num_tokens": 97925797.0, "step": 2564 }, { "epoch": 0.32629436458465844, "ewc_loss": 0.027557924389839172, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.0001022393989842385, "grad_norm": 4.175548553466797, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.858623743057251, "num_tokens": 97963638.0, "step": 2565 }, { "epoch": 0.32642157486324896, "ewc_loss": 0.02750205621123314, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010168071457883343, "grad_norm": 4.2404937744140625, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8475621938705444, "num_tokens": 98002043.0, "step": 2566 }, { "epoch": 0.32654878514183944, "ewc_loss": 0.027554839849472046, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010220854164799675, "grad_norm": 4.1865692138671875, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8552510142326355, "num_tokens": 98041031.0, "step": 2567 }, { "epoch": 0.32667599542042997, "ewc_loss": 0.02749796211719513, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010163978731725365, "grad_norm": 4.234071731567383, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8616243600845337, "num_tokens": 98075510.0, "step": 2568 }, { "epoch": 0.3268032056990205, "ewc_loss": 0.027564048767089844, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010230063344351947, "grad_norm": 4.1857500076293945, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8593959808349609, "num_tokens": 98114162.0, "step": 2569 }, { "epoch": 0.32693041597761097, "ewc_loss": 0.02750483900308609, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010170853784075007, "grad_norm": 4.26047945022583, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8492430448532104, "num_tokens": 98148722.0, "step": 2570 }, { "epoch": 0.3270576262562015, "ewc_loss": 0.027584506198763847, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010250521881971508, "grad_norm": 4.218652248382568, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8331865072250366, "num_tokens": 98188255.0, "step": 2571 }, { "epoch": 0.327184836534792, "ewc_loss": 0.0275326669216156, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010198682139161974, "grad_norm": 4.264369964599609, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8638726472854614, "num_tokens": 98228597.0, "step": 2572 }, { "epoch": 0.3273120468133825, "ewc_loss": 0.027577504515647888, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010243519500363618, "grad_norm": 4.196412563323975, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8445504903793335, "num_tokens": 98273405.0, "step": 2573 }, { "epoch": 0.327439257091973, "ewc_loss": 0.02751653641462326, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010182552068727091, "grad_norm": 4.20871639251709, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8391271233558655, "num_tokens": 98315091.0, "step": 2574 }, { "epoch": 0.32756646737056355, "ewc_loss": 0.02756686881184578, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010232884233118966, "grad_norm": 4.199901580810547, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8548417687416077, "num_tokens": 98356967.0, "step": 2575 }, { "epoch": 0.327693677649154, "ewc_loss": 0.027520213276147842, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010186228610109538, "grad_norm": 4.251482009887695, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.8438957929611206, "num_tokens": 98394455.0, "step": 2576 }, { "epoch": 0.32782088792774455, "ewc_loss": 0.02758033387362957, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010246349120279774, "grad_norm": 4.261206150054932, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8618260622024536, "num_tokens": 98430455.0, "step": 2577 }, { "epoch": 0.3279480982063351, "ewc_loss": 0.027558881789445877, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010224896686850116, "grad_norm": 4.178951740264893, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8388286828994751, "num_tokens": 98472218.0, "step": 2578 }, { "epoch": 0.3280753084849256, "ewc_loss": 0.027528923004865646, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010194938658969477, "grad_norm": 4.222705364227295, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.860883355140686, "num_tokens": 98509540.0, "step": 2579 }, { "epoch": 0.3282025187635161, "ewc_loss": 0.027597961947321892, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010263977310387418, "grad_norm": 4.225600719451904, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8525545597076416, "num_tokens": 98544761.0, "step": 2580 }, { "epoch": 0.3283297290421066, "ewc_loss": 0.02756529301404953, "ewc_loss_diag": 1.728534698486328e-05, "ewc_loss_parallel": 0.00010231309715891257, "grad_norm": 4.206389904022217, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8666028380393982, "num_tokens": 98580522.0, "step": 2581 }, { "epoch": 0.32845693932069714, "ewc_loss": 0.027713164687156677, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.0001025710953399539, "grad_norm": 4.244900226593018, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8622980713844299, "num_tokens": 98612988.0, "step": 2582 }, { "epoch": 0.3285841495992876, "ewc_loss": 0.027728451415896416, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.00010272397048538551, "grad_norm": 4.187960624694824, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8425204753875732, "num_tokens": 98654213.0, "step": 2583 }, { "epoch": 0.32871135987787814, "ewc_loss": 0.027698062360286713, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.0001024200682877563, "grad_norm": 4.2011284828186035, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.856574296951294, "num_tokens": 98697168.0, "step": 2584 }, { "epoch": 0.32883857015646867, "ewc_loss": 0.027713526040315628, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.00010257472604280338, "grad_norm": 4.243964672088623, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8587042689323425, "num_tokens": 98735664.0, "step": 2585 }, { "epoch": 0.32896578043505914, "ewc_loss": 0.02772253006696701, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.0001026647660182789, "grad_norm": 4.1766357421875, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8487831354141235, "num_tokens": 98777257.0, "step": 2586 }, { "epoch": 0.32909299071364967, "ewc_loss": 0.02769090048968792, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.0001023484583129175, "grad_norm": 4.263183116912842, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8578987121582031, "num_tokens": 98812443.0, "step": 2587 }, { "epoch": 0.3292202009922402, "ewc_loss": 0.027762843295931816, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.00010306788317393512, "grad_norm": 4.246748924255371, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8611003756523132, "num_tokens": 98848231.0, "step": 2588 }, { "epoch": 0.32934741127083067, "ewc_loss": 0.027715034782886505, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.00010258978727506474, "grad_norm": 4.213782787322998, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8567891716957092, "num_tokens": 98886487.0, "step": 2589 }, { "epoch": 0.3294746215494212, "ewc_loss": 0.027726709842681885, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.00010270655911881477, "grad_norm": 4.239868640899658, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8435559272766113, "num_tokens": 98924750.0, "step": 2590 }, { "epoch": 0.3296018318280117, "ewc_loss": 0.027714598923921585, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.00010258542897645384, "grad_norm": 4.263960838317871, "learning_rate": 1e-06, "loss": 0.5475, "mean_token_accuracy": 0.8326507210731506, "num_tokens": 98964151.0, "step": 2591 }, { "epoch": 0.3297290421066022, "ewc_loss": 0.02774360403418541, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.00010287549957865849, "grad_norm": 4.247073173522949, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8650455474853516, "num_tokens": 99000277.0, "step": 2592 }, { "epoch": 0.3298562523851927, "ewc_loss": 0.027706202119588852, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.000102501486253459, "grad_norm": 4.1515984535217285, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8441033959388733, "num_tokens": 99042335.0, "step": 2593 }, { "epoch": 0.32998346266378326, "ewc_loss": 0.027827786281704903, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.0001024966113618575, "grad_norm": 4.216874122619629, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8520299196243286, "num_tokens": 99082964.0, "step": 2594 }, { "epoch": 0.33011067294237373, "ewc_loss": 0.027748987078666687, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.0001029293198371306, "grad_norm": 4.119061470031738, "learning_rate": 1e-06, "loss": 0.395, "mean_token_accuracy": 0.8766098022460938, "num_tokens": 99123440.0, "step": 2595 }, { "epoch": 0.33023788322096426, "ewc_loss": 0.027703404426574707, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.00010247351019643247, "grad_norm": 4.252041339874268, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8519161939620972, "num_tokens": 99167047.0, "step": 2596 }, { "epoch": 0.3303650934995548, "ewc_loss": 0.027800986543297768, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.00010344931797590107, "grad_norm": 4.261096954345703, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8502219915390015, "num_tokens": 99202594.0, "step": 2597 }, { "epoch": 0.33049230377814526, "ewc_loss": 0.02771715447306633, "ewc_loss_diag": 1.7404556274414062e-05, "ewc_loss_parallel": 0.00010261101124342531, "grad_norm": 4.174811840057373, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8599121570587158, "num_tokens": 99244732.0, "step": 2598 }, { "epoch": 0.3306195140567358, "ewc_loss": 0.027835749089717865, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010257623216602951, "grad_norm": 4.2389750480651855, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8642921447753906, "num_tokens": 99287161.0, "step": 2599 }, { "epoch": 0.3307467243353263, "ewc_loss": 0.027876077219843864, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010297952394466847, "grad_norm": 4.246222972869873, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.846705436706543, "num_tokens": 99322650.0, "step": 2600 }, { "epoch": 0.3308739346139168, "ewc_loss": 0.027857013046741486, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010278887202730402, "grad_norm": 4.296689033508301, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8546172380447388, "num_tokens": 99358529.0, "step": 2601 }, { "epoch": 0.3310011448925073, "ewc_loss": 0.027863040566444397, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010284915333613753, "grad_norm": 4.2643141746521, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8632811307907104, "num_tokens": 99394010.0, "step": 2602 }, { "epoch": 0.33112835517109784, "ewc_loss": 0.027848295867443085, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010270171333104372, "grad_norm": 4.199221611022949, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8573882579803467, "num_tokens": 99434764.0, "step": 2603 }, { "epoch": 0.3312555654496883, "ewc_loss": 0.02784893848001957, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010270813800161704, "grad_norm": 4.195629596710205, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8562254905700684, "num_tokens": 99481322.0, "step": 2604 }, { "epoch": 0.33138277572827884, "ewc_loss": 0.027867743745446205, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010289618512615561, "grad_norm": 4.27872371673584, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8442870378494263, "num_tokens": 99517719.0, "step": 2605 }, { "epoch": 0.3315099860068694, "ewc_loss": 0.027895009145140648, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010316884436178952, "grad_norm": 4.243874549865723, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.857053279876709, "num_tokens": 99553662.0, "step": 2606 }, { "epoch": 0.33163719628545985, "ewc_loss": 0.027862899005413055, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010284773452440277, "grad_norm": 4.247341632843018, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8533915281295776, "num_tokens": 99589195.0, "step": 2607 }, { "epoch": 0.3317644065640504, "ewc_loss": 0.027898471802473068, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010320346336811781, "grad_norm": 4.245550632476807, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8557590246200562, "num_tokens": 99626092.0, "step": 2608 }, { "epoch": 0.3318916168426409, "ewc_loss": 0.027897829189896584, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010319703869754449, "grad_norm": 4.259314060211182, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8297198414802551, "num_tokens": 99667087.0, "step": 2609 }, { "epoch": 0.3320188271212314, "ewc_loss": 0.027998192235827446, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010297996777808294, "grad_norm": 4.228575229644775, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8498563170433044, "num_tokens": 99705929.0, "step": 2610 }, { "epoch": 0.3321460373998219, "ewc_loss": 0.028024669736623764, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001032447544275783, "grad_norm": 4.2560648918151855, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8505549430847168, "num_tokens": 99742044.0, "step": 2611 }, { "epoch": 0.33227324767841243, "ewc_loss": 0.028029967099428177, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010329773067496717, "grad_norm": 4.298055171966553, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8576833009719849, "num_tokens": 99772621.0, "step": 2612 }, { "epoch": 0.3324004579570029, "ewc_loss": 0.028053175657987595, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010352979734307155, "grad_norm": 4.205638408660889, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8597844243049622, "num_tokens": 99814405.0, "step": 2613 }, { "epoch": 0.33252766823559343, "ewc_loss": 0.02799435704946518, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010294160892954096, "grad_norm": 4.193451881408691, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8649601340293884, "num_tokens": 99852197.0, "step": 2614 }, { "epoch": 0.33265487851418396, "ewc_loss": 0.028046540915966034, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010346344788558781, "grad_norm": 4.277013301849365, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8478392958641052, "num_tokens": 99894254.0, "step": 2615 }, { "epoch": 0.33278208879277443, "ewc_loss": 0.027956675738096237, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010378551814937964, "grad_norm": 4.203909873962402, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.849597156047821, "num_tokens": 99936230.0, "step": 2616 }, { "epoch": 0.33290929907136496, "ewc_loss": 0.027901235967874527, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010323111928300932, "grad_norm": 4.240680694580078, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8564384579658508, "num_tokens": 99974964.0, "step": 2617 }, { "epoch": 0.3330365093499555, "ewc_loss": 0.027959760278463364, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010381634638179094, "grad_norm": 4.192946434020996, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8450264930725098, "num_tokens": 100018750.0, "step": 2618 }, { "epoch": 0.33316371962854596, "ewc_loss": 0.028025025501847267, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010324829781893641, "grad_norm": 4.25999116897583, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8485890626907349, "num_tokens": 100054064.0, "step": 2619 }, { "epoch": 0.3332909299071365, "ewc_loss": 0.02797655016183853, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010398426093161106, "grad_norm": 4.210692405700684, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8596594333648682, "num_tokens": 100092549.0, "step": 2620 }, { "epoch": 0.333418140185727, "ewc_loss": 0.027953393757343292, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010375268175266683, "grad_norm": 4.3706583976745605, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8286073207855225, "num_tokens": 100123973.0, "step": 2621 }, { "epoch": 0.3335453504643175, "ewc_loss": 0.02805149555206299, "ewc_loss_diag": 1.7523765563964844e-05, "ewc_loss_parallel": 0.00010473369184182957, "grad_norm": 4.223440647125244, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8527311086654663, "num_tokens": 100162142.0, "step": 2622 }, { "epoch": 0.333672560742908, "ewc_loss": 0.02804754674434662, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010347351053496823, "grad_norm": 8.462857246398926, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8495680093765259, "num_tokens": 100199352.0, "step": 2623 }, { "epoch": 0.33379977102149855, "ewc_loss": 0.03168489784002304, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00013984701945446432, "grad_norm": 5.073036193847656, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8387871384620667, "num_tokens": 100237446.0, "step": 2624 }, { "epoch": 0.333926981300089, "ewc_loss": 0.02761775627732277, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 9.917560964822769e-05, "grad_norm": 3.857229471206665, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8587114810943604, "num_tokens": 100272986.0, "step": 2625 }, { "epoch": 0.33405419157867955, "ewc_loss": 0.02857508882880211, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010874892177525908, "grad_norm": 4.674674034118652, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8453407287597656, "num_tokens": 100304998.0, "step": 2626 }, { "epoch": 0.3341814018572701, "ewc_loss": 0.029032394289970398, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001133219848270528, "grad_norm": 4.321410655975342, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8499171733856201, "num_tokens": 100338473.0, "step": 2627 }, { "epoch": 0.33430861213586055, "ewc_loss": 0.028156444430351257, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010456250311108306, "grad_norm": 4.501504421234131, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8377648591995239, "num_tokens": 100372644.0, "step": 2628 }, { "epoch": 0.3344358224144511, "ewc_loss": 0.02863985113799572, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010939655476249754, "grad_norm": 4.297284126281738, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8339323997497559, "num_tokens": 100413248.0, "step": 2629 }, { "epoch": 0.3345630326930416, "ewc_loss": 0.028203416615724564, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001050322171067819, "grad_norm": 4.335003852844238, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8482154607772827, "num_tokens": 100448022.0, "step": 2630 }, { "epoch": 0.33469024297163213, "ewc_loss": 0.02840369939804077, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001070350335794501, "grad_norm": 4.2280097007751465, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8413783311843872, "num_tokens": 100493222.0, "step": 2631 }, { "epoch": 0.3348174532502226, "ewc_loss": 0.02824423648416996, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010544041288085282, "grad_norm": 4.255206108093262, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8594650030136108, "num_tokens": 100534736.0, "step": 2632 }, { "epoch": 0.33494466352881314, "ewc_loss": 0.02828005701303482, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001057986228261143, "grad_norm": 4.298027038574219, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.861080527305603, "num_tokens": 100576754.0, "step": 2633 }, { "epoch": 0.33507187380740366, "ewc_loss": 0.028240621089935303, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010540425137151033, "grad_norm": 4.329705238342285, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8460259437561035, "num_tokens": 100611407.0, "step": 2634 }, { "epoch": 0.33519908408599414, "ewc_loss": 0.028265340253710747, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001056514447554946, "grad_norm": 4.296570301055908, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8602166771888733, "num_tokens": 100643531.0, "step": 2635 }, { "epoch": 0.33532629436458466, "ewc_loss": 0.02818782813847065, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010487632971489802, "grad_norm": 4.291110992431641, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8541956543922424, "num_tokens": 100678612.0, "step": 2636 }, { "epoch": 0.3354535046431752, "ewc_loss": 0.02822529897093773, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010525104880798608, "grad_norm": 4.307192325592041, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8582012057304382, "num_tokens": 100718873.0, "step": 2637 }, { "epoch": 0.33558071492176567, "ewc_loss": 0.028178054839372635, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010477859905222431, "grad_norm": 4.24812650680542, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.852310061454773, "num_tokens": 100759883.0, "step": 2638 }, { "epoch": 0.3357079252003562, "ewc_loss": 0.02817671000957489, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001047651530825533, "grad_norm": 4.273405075073242, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8336318731307983, "num_tokens": 100800022.0, "step": 2639 }, { "epoch": 0.3358351354789467, "ewc_loss": 0.02815929241478443, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010459097393322736, "grad_norm": 4.260582447052002, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8382179737091064, "num_tokens": 100837299.0, "step": 2640 }, { "epoch": 0.3359623457575372, "ewc_loss": 0.028158575296401978, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010458379983901978, "grad_norm": 4.297990798950195, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8496206402778625, "num_tokens": 100868433.0, "step": 2641 }, { "epoch": 0.3360895560361277, "ewc_loss": 0.028186559677124023, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001048636477207765, "grad_norm": 4.222691535949707, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8383188843727112, "num_tokens": 100907768.0, "step": 2642 }, { "epoch": 0.33621676631471825, "ewc_loss": 0.028150994330644608, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010450797708472237, "grad_norm": 4.302095890045166, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8301680088043213, "num_tokens": 100942723.0, "step": 2643 }, { "epoch": 0.3363439765933087, "ewc_loss": 0.028191886842250824, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010491690773051232, "grad_norm": 4.185137748718262, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8583511114120483, "num_tokens": 100984997.0, "step": 2644 }, { "epoch": 0.33647118687189925, "ewc_loss": 0.028135284781455994, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010435088188387454, "grad_norm": 4.207055568695068, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8685641288757324, "num_tokens": 101026290.0, "step": 2645 }, { "epoch": 0.3365983971504898, "ewc_loss": 0.028180085122585297, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010479888442205265, "grad_norm": 4.251479148864746, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.846560537815094, "num_tokens": 101066766.0, "step": 2646 }, { "epoch": 0.33672560742908025, "ewc_loss": 0.02819127030670643, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010491075227037072, "grad_norm": 4.3718342781066895, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8568829894065857, "num_tokens": 101100833.0, "step": 2647 }, { "epoch": 0.3368528177076708, "ewc_loss": 0.02821764349937439, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001051744693540968, "grad_norm": 4.169119358062744, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.863500714302063, "num_tokens": 101145552.0, "step": 2648 }, { "epoch": 0.3369800279862613, "ewc_loss": 0.028089426457881927, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010389231465524063, "grad_norm": 4.227852821350098, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8545060753822327, "num_tokens": 101186644.0, "step": 2649 }, { "epoch": 0.3371072382648518, "ewc_loss": 0.028342805802822113, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.0001052053994499147, "grad_norm": 4.259669780731201, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8514546155929565, "num_tokens": 101223356.0, "step": 2650 }, { "epoch": 0.3372344485434423, "ewc_loss": 0.02826726995408535, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010445004591019824, "grad_norm": 4.261250019073486, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8656653761863708, "num_tokens": 101260527.0, "step": 2651 }, { "epoch": 0.33736165882203284, "ewc_loss": 0.028297174721956253, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.0001047490950440988, "grad_norm": 4.210811614990234, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8524550795555115, "num_tokens": 101305403.0, "step": 2652 }, { "epoch": 0.3374888691006233, "ewc_loss": 0.028130875900387764, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010430680413264781, "grad_norm": 4.23448371887207, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8392099142074585, "num_tokens": 101345858.0, "step": 2653 }, { "epoch": 0.33761607937921384, "ewc_loss": 0.028163639828562737, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001046344477799721, "grad_norm": 4.332074165344238, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8304498195648193, "num_tokens": 101380321.0, "step": 2654 }, { "epoch": 0.33774328965780437, "ewc_loss": 0.028194716200232506, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001049452112056315, "grad_norm": 4.202544689178467, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8614380955696106, "num_tokens": 101419120.0, "step": 2655 }, { "epoch": 0.33787049993639484, "ewc_loss": 0.028097720816731453, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001039752532960847, "grad_norm": 4.3102240562438965, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8545762300491333, "num_tokens": 101453272.0, "step": 2656 }, { "epoch": 0.33799771021498537, "ewc_loss": 0.028238706290721893, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.0001053851010510698, "grad_norm": 4.341386318206787, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8419552445411682, "num_tokens": 101486879.0, "step": 2657 }, { "epoch": 0.3381249204935759, "ewc_loss": 0.028144344687461853, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010444148938404396, "grad_norm": 4.290485858917236, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.845382571220398, "num_tokens": 101523691.0, "step": 2658 }, { "epoch": 0.33825213077216637, "ewc_loss": 0.028154365718364716, "ewc_loss_diag": 1.7642974853515625e-05, "ewc_loss_parallel": 0.00010454171570017934, "grad_norm": 4.320243835449219, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.854597270488739, "num_tokens": 101555279.0, "step": 2659 }, { "epoch": 0.3383793410507569, "ewc_loss": 0.02830297127366066, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.000104807062598411, "grad_norm": 4.205489635467529, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8690994381904602, "num_tokens": 101591780.0, "step": 2660 }, { "epoch": 0.3385065513293474, "ewc_loss": 0.028259366750717163, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010437099990667775, "grad_norm": 4.2028727531433105, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8699276447296143, "num_tokens": 101631041.0, "step": 2661 }, { "epoch": 0.3386337616079379, "ewc_loss": 0.02831234782934189, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010490083514014259, "grad_norm": 4.2633209228515625, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8429449796676636, "num_tokens": 101666315.0, "step": 2662 }, { "epoch": 0.33876097188652843, "ewc_loss": 0.02831486240029335, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.0001049259735736996, "grad_norm": 4.290164470672607, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.865567147731781, "num_tokens": 101704046.0, "step": 2663 }, { "epoch": 0.33888818216511896, "ewc_loss": 0.028336387127637863, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.0001051412255037576, "grad_norm": 4.208713531494141, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8649656772613525, "num_tokens": 101743076.0, "step": 2664 }, { "epoch": 0.33901539244370943, "ewc_loss": 0.028281107544898987, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010458842007210478, "grad_norm": 4.200824737548828, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.865792989730835, "num_tokens": 101784006.0, "step": 2665 }, { "epoch": 0.33914260272229996, "ewc_loss": 0.028326012194156647, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010503745579626411, "grad_norm": 4.271295547485352, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8600870966911316, "num_tokens": 101821845.0, "step": 2666 }, { "epoch": 0.3392698130008905, "ewc_loss": 0.028351087123155594, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010528821439947933, "grad_norm": 4.2439284324646, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8590935468673706, "num_tokens": 101861121.0, "step": 2667 }, { "epoch": 0.33939702327948096, "ewc_loss": 0.028325224295258522, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010502958321012557, "grad_norm": 4.372959613800049, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8502786159515381, "num_tokens": 101894330.0, "step": 2668 }, { "epoch": 0.3395242335580715, "ewc_loss": 0.028383329510688782, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010561064846115187, "grad_norm": 4.247805595397949, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8397572040557861, "num_tokens": 101934835.0, "step": 2669 }, { "epoch": 0.339651443836662, "ewc_loss": 0.028276555240154266, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010454290168127045, "grad_norm": 4.307461738586426, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.837131917476654, "num_tokens": 101970943.0, "step": 2670 }, { "epoch": 0.3397786541152525, "ewc_loss": 0.02835976332426071, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010537498019402847, "grad_norm": 4.344440937042236, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8418784141540527, "num_tokens": 102001211.0, "step": 2671 }, { "epoch": 0.339905864393843, "ewc_loss": 0.02833520993590355, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010512944572838023, "grad_norm": 4.207813739776611, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8609412908554077, "num_tokens": 102039399.0, "step": 2672 }, { "epoch": 0.34003307467243354, "ewc_loss": 0.028300777077674866, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010478512558620423, "grad_norm": 4.364864826202393, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.843525230884552, "num_tokens": 102070635.0, "step": 2673 }, { "epoch": 0.340160284951024, "ewc_loss": 0.028442904353141785, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010620637476677075, "grad_norm": 4.249125957489014, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8580121994018555, "num_tokens": 102104590.0, "step": 2674 }, { "epoch": 0.34028749522961454, "ewc_loss": 0.028294669464230537, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010472403664607555, "grad_norm": 4.194298267364502, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8591974973678589, "num_tokens": 102149267.0, "step": 2675 }, { "epoch": 0.3404147055082051, "ewc_loss": 0.028342455625534058, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010520189971430227, "grad_norm": 4.24199914932251, "learning_rate": 1e-06, "loss": 0.5345, "mean_token_accuracy": 0.8374338150024414, "num_tokens": 102188089.0, "step": 2676 }, { "epoch": 0.34054191578679555, "ewc_loss": 0.02839556336402893, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010573297186056152, "grad_norm": 4.269753456115723, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8602345585823059, "num_tokens": 102224335.0, "step": 2677 }, { "epoch": 0.3406691260653861, "ewc_loss": 0.028384216129779816, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010561951785348356, "grad_norm": 4.229149341583252, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8573616147041321, "num_tokens": 102263421.0, "step": 2678 }, { "epoch": 0.3407963363439766, "ewc_loss": 0.028386497870087624, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010564232070464641, "grad_norm": 4.310522079467773, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8362747430801392, "num_tokens": 102301133.0, "step": 2679 }, { "epoch": 0.34092354662256713, "ewc_loss": 0.028399506583809853, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.0001057724075508304, "grad_norm": 4.232537746429443, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.857659101486206, "num_tokens": 102338516.0, "step": 2680 }, { "epoch": 0.3410507569011576, "ewc_loss": 0.028336621820926666, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010514357563806698, "grad_norm": 4.274959087371826, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8430452346801758, "num_tokens": 102376010.0, "step": 2681 }, { "epoch": 0.34117796717974813, "ewc_loss": 0.02841695211827755, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010594686318654567, "grad_norm": 4.189029216766357, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8748883605003357, "num_tokens": 102417080.0, "step": 2682 }, { "epoch": 0.34130517745833866, "ewc_loss": 0.028345821425318718, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010523555829422548, "grad_norm": 4.291685581207275, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8558350205421448, "num_tokens": 102458042.0, "step": 2683 }, { "epoch": 0.34143238773692913, "ewc_loss": 0.028434913605451584, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010612649202812463, "grad_norm": 4.202816009521484, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8621041178703308, "num_tokens": 102497152.0, "step": 2684 }, { "epoch": 0.34155959801551966, "ewc_loss": 0.028324183076620102, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010501917131477967, "grad_norm": 4.308933258056641, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8520146608352661, "num_tokens": 102531486.0, "step": 2685 }, { "epoch": 0.3416868082941102, "ewc_loss": 0.028446901589632034, "ewc_loss_diag": 1.7762184143066406e-05, "ewc_loss_parallel": 0.00010624637070577592, "grad_norm": 4.2220282554626465, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8530901670455933, "num_tokens": 102571552.0, "step": 2686 }, { "epoch": 0.34181401857270066, "ewc_loss": 0.028357114642858505, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010534848115639761, "grad_norm": 4.2848711013793945, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8595198392868042, "num_tokens": 102607338.0, "step": 2687 }, { "epoch": 0.3419412288512912, "ewc_loss": 0.028443489223718643, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010621223918860778, "grad_norm": 4.188107967376709, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8539384603500366, "num_tokens": 102654445.0, "step": 2688 }, { "epoch": 0.3420684391298817, "ewc_loss": 0.02836327999830246, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010541014489717782, "grad_norm": 4.209950923919678, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8624390363693237, "num_tokens": 102695265.0, "step": 2689 }, { "epoch": 0.3421956494084722, "ewc_loss": 0.028449106961488724, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010626840230543166, "grad_norm": 4.313560962677002, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8593916893005371, "num_tokens": 102728621.0, "step": 2690 }, { "epoch": 0.3423228596870627, "ewc_loss": 0.02848120965063572, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010658943938324228, "grad_norm": 4.282869338989258, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8482394218444824, "num_tokens": 102766378.0, "step": 2691 }, { "epoch": 0.34245006996565325, "ewc_loss": 0.028411593288183212, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.0001058932684827596, "grad_norm": 4.2148566246032715, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8526628017425537, "num_tokens": 102806331.0, "step": 2692 }, { "epoch": 0.3425772802442437, "ewc_loss": 0.02844208851456642, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010619821841828525, "grad_norm": 4.3155951499938965, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8568984270095825, "num_tokens": 102845314.0, "step": 2693 }, { "epoch": 0.34270449052283425, "ewc_loss": 0.028459200635552406, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010636934894137084, "grad_norm": 4.223321914672852, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8531519174575806, "num_tokens": 102888614.0, "step": 2694 }, { "epoch": 0.3428317008014248, "ewc_loss": 0.02841101959347725, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.000105887527752202, "grad_norm": 4.325690746307373, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8487811088562012, "num_tokens": 102925988.0, "step": 2695 }, { "epoch": 0.34295891108001525, "ewc_loss": 0.02845655381679535, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010634287900757045, "grad_norm": 4.242000579833984, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8623770475387573, "num_tokens": 102963150.0, "step": 2696 }, { "epoch": 0.3430861213586058, "ewc_loss": 0.028382012620568275, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010559747170191258, "grad_norm": 4.293108940124512, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8632078170776367, "num_tokens": 102999164.0, "step": 2697 }, { "epoch": 0.3432133316371963, "ewc_loss": 0.02845080941915512, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010628544259816408, "grad_norm": 4.311405658721924, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8622605800628662, "num_tokens": 103034337.0, "step": 2698 }, { "epoch": 0.3433405419157868, "ewc_loss": 0.0284283347427845, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010606068826746196, "grad_norm": 4.236024856567383, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8606336712837219, "num_tokens": 103075062.0, "step": 2699 }, { "epoch": 0.3434677521943773, "ewc_loss": 0.028404301032423973, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010582035611150786, "grad_norm": 4.324521064758301, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8496996760368347, "num_tokens": 103113209.0, "step": 2700 }, { "epoch": 0.34359496247296784, "ewc_loss": 0.028461217880249023, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010638951789587736, "grad_norm": 8.328142166137695, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8586705923080444, "num_tokens": 103153117.0, "step": 2701 }, { "epoch": 0.3437221727515583, "ewc_loss": 0.03168473392724991, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00013862467312719673, "grad_norm": 4.95281982421875, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8526067733764648, "num_tokens": 103192234.0, "step": 2702 }, { "epoch": 0.34384938303014884, "ewc_loss": 0.02777911350131035, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 9.956846770364791e-05, "grad_norm": 3.9574215412139893, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8510856628417969, "num_tokens": 103224958.0, "step": 2703 }, { "epoch": 0.34397659330873936, "ewc_loss": 0.028766091912984848, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010943827510345727, "grad_norm": 4.622368812561035, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8557784557342529, "num_tokens": 103259679.0, "step": 2704 }, { "epoch": 0.34410380358732984, "ewc_loss": 0.029017144814133644, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00011194879334652796, "grad_norm": 4.238940715789795, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8465982675552368, "num_tokens": 103299987.0, "step": 2705 }, { "epoch": 0.34423101386592037, "ewc_loss": 0.028371702879667282, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010549437865847722, "grad_norm": 5.4460039138793945, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.840779721736908, "num_tokens": 103344457.0, "step": 2706 }, { "epoch": 0.3443582241445109, "ewc_loss": 0.029718076810240746, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00011895811621798202, "grad_norm": 4.4171576499938965, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8591567277908325, "num_tokens": 103385173.0, "step": 2707 }, { "epoch": 0.34448543442310137, "ewc_loss": 0.028104500845074654, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010282234870828688, "grad_norm": 4.32127571105957, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8393048048019409, "num_tokens": 103425972.0, "step": 2708 }, { "epoch": 0.3446126447016919, "ewc_loss": 0.028725219890475273, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.0001090295409085229, "grad_norm": 4.360353469848633, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8508626818656921, "num_tokens": 103466819.0, "step": 2709 }, { "epoch": 0.3447398549802824, "ewc_loss": 0.028509467840194702, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010687203030101955, "grad_norm": 4.302699565887451, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.842811107635498, "num_tokens": 103509660.0, "step": 2710 }, { "epoch": 0.3448670652588729, "ewc_loss": 0.028458192944526672, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.0001063592717400752, "grad_norm": 4.352700233459473, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8480377793312073, "num_tokens": 103544557.0, "step": 2711 }, { "epoch": 0.3449942755374634, "ewc_loss": 0.028518419712781906, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010696155368350446, "grad_norm": 4.286166667938232, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8638200759887695, "num_tokens": 103585132.0, "step": 2712 }, { "epoch": 0.34512148581605395, "ewc_loss": 0.028423573821783066, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010601308167679235, "grad_norm": 4.2876667976379395, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8626052141189575, "num_tokens": 103626353.0, "step": 2713 }, { "epoch": 0.3452486960946444, "ewc_loss": 0.02846638672053814, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010644121357472613, "grad_norm": 4.355987071990967, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8567910194396973, "num_tokens": 103659035.0, "step": 2714 }, { "epoch": 0.34537590637323495, "ewc_loss": 0.028463348746299744, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010641084372764453, "grad_norm": 5.375486850738525, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8620655536651611, "num_tokens": 103699497.0, "step": 2715 }, { "epoch": 0.3455031166518255, "ewc_loss": 0.029063217341899872, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00011240951425861567, "grad_norm": 4.359116554260254, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8434650897979736, "num_tokens": 103733685.0, "step": 2716 }, { "epoch": 0.34563032693041595, "ewc_loss": 0.028004901483654976, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010182635742239654, "grad_norm": 4.239439964294434, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8379502296447754, "num_tokens": 103769399.0, "step": 2717 }, { "epoch": 0.3457575372090065, "ewc_loss": 0.02842693030834198, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010604664566926658, "grad_norm": 4.266658782958984, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8623716831207275, "num_tokens": 103809078.0, "step": 2718 }, { "epoch": 0.345884747487597, "ewc_loss": 0.028305616229772568, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010483349615242332, "grad_norm": 4.251863956451416, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8716700077056885, "num_tokens": 103846764.0, "step": 2719 }, { "epoch": 0.3460119577661875, "ewc_loss": 0.028346411883831024, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.0001052414663718082, "grad_norm": 4.233242988586426, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8635311722755432, "num_tokens": 103883089.0, "step": 2720 }, { "epoch": 0.346139168044778, "ewc_loss": 0.02834984101355076, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010527575068408623, "grad_norm": 4.258988857269287, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.872596800327301, "num_tokens": 103915775.0, "step": 2721 }, { "epoch": 0.34626637832336854, "ewc_loss": 0.028384264558553696, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010561997623881325, "grad_norm": 4.293138027191162, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8492792844772339, "num_tokens": 103953703.0, "step": 2722 }, { "epoch": 0.346393588601959, "ewc_loss": 0.0284256674349308, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010603401460684836, "grad_norm": 4.326492786407471, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8436559438705444, "num_tokens": 103990775.0, "step": 2723 }, { "epoch": 0.34652079888054954, "ewc_loss": 0.028440389782190323, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010618125088512897, "grad_norm": 4.275274753570557, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8581709861755371, "num_tokens": 104025927.0, "step": 2724 }, { "epoch": 0.34664800915914007, "ewc_loss": 0.028407521545886993, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010585256677586585, "grad_norm": 4.266389846801758, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.845543622970581, "num_tokens": 104065860.0, "step": 2725 }, { "epoch": 0.34677521943773054, "ewc_loss": 0.02843267284333706, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010610406752675772, "grad_norm": 4.278639793395996, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8568001985549927, "num_tokens": 104102117.0, "step": 2726 }, { "epoch": 0.34690242971632107, "ewc_loss": 0.028449635952711105, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010627369920257479, "grad_norm": 4.276857376098633, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8514884114265442, "num_tokens": 104140896.0, "step": 2727 }, { "epoch": 0.3470296399949116, "ewc_loss": 0.02845868095755577, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010636416118359193, "grad_norm": 4.3193793296813965, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8531164526939392, "num_tokens": 104181355.0, "step": 2728 }, { "epoch": 0.3471568502735021, "ewc_loss": 0.028494857251644135, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010672592179616913, "grad_norm": 4.312554359436035, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8556035757064819, "num_tokens": 104218564.0, "step": 2729 }, { "epoch": 0.3472840605520926, "ewc_loss": 0.02846551686525345, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010643251880537719, "grad_norm": 4.252110004425049, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8592398762702942, "num_tokens": 104256553.0, "step": 2730 }, { "epoch": 0.3474112708306831, "ewc_loss": 0.028443407267332077, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010621141700539738, "grad_norm": 4.293049335479736, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.868140459060669, "num_tokens": 104291150.0, "step": 2731 }, { "epoch": 0.34753848110927366, "ewc_loss": 0.02851993218064308, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010697667312342674, "grad_norm": 4.293722152709961, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.846850574016571, "num_tokens": 104327977.0, "step": 2732 }, { "epoch": 0.34766569138786413, "ewc_loss": 0.028454184532165527, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010631918848957866, "grad_norm": 4.281528949737549, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.852104902267456, "num_tokens": 104365167.0, "step": 2733 }, { "epoch": 0.34779290166645466, "ewc_loss": 0.0285131074488163, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.0001069084246410057, "grad_norm": 4.3300886154174805, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8607562184333801, "num_tokens": 104400304.0, "step": 2734 }, { "epoch": 0.3479201119450452, "ewc_loss": 0.02850392833352089, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010681661660782993, "grad_norm": 4.283554553985596, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8405313491821289, "num_tokens": 104437974.0, "step": 2735 }, { "epoch": 0.34804732222363566, "ewc_loss": 0.02850686013698578, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010684593144105747, "grad_norm": 4.2396063804626465, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8585545420646667, "num_tokens": 104474474.0, "step": 2736 }, { "epoch": 0.3481745325022262, "ewc_loss": 0.028531860560178757, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010709594789659604, "grad_norm": 4.306279182434082, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8406732082366943, "num_tokens": 104511004.0, "step": 2737 }, { "epoch": 0.3483017427808167, "ewc_loss": 0.02858855202794075, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010766286868602037, "grad_norm": 4.266883373260498, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8486796617507935, "num_tokens": 104551889.0, "step": 2738 }, { "epoch": 0.3484289530594072, "ewc_loss": 0.02854251302778721, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.0001072024751920253, "grad_norm": 4.3141889572143555, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8392086029052734, "num_tokens": 104591962.0, "step": 2739 }, { "epoch": 0.3485561633379977, "ewc_loss": 0.028591681271791458, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010769415530376136, "grad_norm": 4.2547712326049805, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.852250337600708, "num_tokens": 104636201.0, "step": 2740 }, { "epoch": 0.34868337361658824, "ewc_loss": 0.0285361148416996, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010713849042076617, "grad_norm": 4.254464149475098, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8540256023406982, "num_tokens": 104673448.0, "step": 2741 }, { "epoch": 0.3488105838951787, "ewc_loss": 0.02860998921096325, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010787724022520706, "grad_norm": 4.305902481079102, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8620018362998962, "num_tokens": 104708879.0, "step": 2742 }, { "epoch": 0.34893779417376924, "ewc_loss": 0.028722666203975677, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010778329306049272, "grad_norm": 4.22205114364624, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8651059865951538, "num_tokens": 104747073.0, "step": 2743 }, { "epoch": 0.3490650044523598, "ewc_loss": 0.028584379702806473, "ewc_loss_diag": 1.7881393432617188e-05, "ewc_loss_parallel": 0.00010762114106910303, "grad_norm": 4.3281168937683105, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8535590767860413, "num_tokens": 104786950.0, "step": 2744 }, { "epoch": 0.34919221473095025, "ewc_loss": 0.028760649263858795, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010816311987582594, "grad_norm": 4.211262226104736, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8594276905059814, "num_tokens": 104828306.0, "step": 2745 }, { "epoch": 0.3493194250095408, "ewc_loss": 0.02869156375527382, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010747226770035923, "grad_norm": 4.32388973236084, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.847209632396698, "num_tokens": 104867030.0, "step": 2746 }, { "epoch": 0.3494466352881313, "ewc_loss": 0.028784003108739853, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010839667083928362, "grad_norm": 4.268470764160156, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.859356164932251, "num_tokens": 104909182.0, "step": 2747 }, { "epoch": 0.3495738455667218, "ewc_loss": 0.028696998953819275, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010752661910373718, "grad_norm": 4.254566669464111, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8514836430549622, "num_tokens": 104950648.0, "step": 2748 }, { "epoch": 0.3497010558453123, "ewc_loss": 0.02872675657272339, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010782421304611489, "grad_norm": 4.248638153076172, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8597021102905273, "num_tokens": 104994780.0, "step": 2749 }, { "epoch": 0.34982826612390283, "ewc_loss": 0.028692254796624184, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010747918713605031, "grad_norm": 4.276027202606201, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8617787957191467, "num_tokens": 105029469.0, "step": 2750 }, { "epoch": 0.3499554764024933, "ewc_loss": 0.02872665971517563, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010782323806779459, "grad_norm": 4.309656620025635, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8481560349464417, "num_tokens": 105066797.0, "step": 2751 }, { "epoch": 0.35008268668108383, "ewc_loss": 0.028719868510961533, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010775533883133903, "grad_norm": 4.2844953536987305, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8698364496231079, "num_tokens": 105100869.0, "step": 2752 }, { "epoch": 0.35020989695967436, "ewc_loss": 0.028706587851047516, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010762252350104973, "grad_norm": 4.287652492523193, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8678309917449951, "num_tokens": 105135409.0, "step": 2753 }, { "epoch": 0.35033710723826483, "ewc_loss": 0.02869999036192894, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010755653056548908, "grad_norm": 4.30084753036499, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8619731664657593, "num_tokens": 105172839.0, "step": 2754 }, { "epoch": 0.35046431751685536, "ewc_loss": 0.028715496882796288, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010771161032607779, "grad_norm": 4.278625011444092, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8612254858016968, "num_tokens": 105216656.0, "step": 2755 }, { "epoch": 0.3505915277954459, "ewc_loss": 0.028695974498987198, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010751638183137402, "grad_norm": 4.312350749969482, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.848706841468811, "num_tokens": 105253740.0, "step": 2756 }, { "epoch": 0.35071873807403636, "ewc_loss": 0.028745125979185104, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010800790914800018, "grad_norm": 4.357206344604492, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8584014773368835, "num_tokens": 105290073.0, "step": 2757 }, { "epoch": 0.3508459483526269, "ewc_loss": 0.02871594950556755, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010771614324767143, "grad_norm": 4.254727363586426, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8624182343482971, "num_tokens": 105325933.0, "step": 2758 }, { "epoch": 0.3509731586312174, "ewc_loss": 0.02869625948369503, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010751923400675878, "grad_norm": 4.282281398773193, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8585797548294067, "num_tokens": 105362962.0, "step": 2759 }, { "epoch": 0.3511003689098079, "ewc_loss": 0.028741007670760155, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010796671267598867, "grad_norm": 4.209628105163574, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8555565476417542, "num_tokens": 105410074.0, "step": 2760 }, { "epoch": 0.3512275791883984, "ewc_loss": 0.02867669239640236, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010732356895459816, "grad_norm": 4.30348014831543, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.8403118848800659, "num_tokens": 105450264.0, "step": 2761 }, { "epoch": 0.35135478946698895, "ewc_loss": 0.028782455250620842, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010838119487743825, "grad_norm": 4.2945170402526855, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8462239503860474, "num_tokens": 105486566.0, "step": 2762 }, { "epoch": 0.3514819997455794, "ewc_loss": 0.02874491922557354, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010800582822412252, "grad_norm": 4.293297290802002, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8590179085731506, "num_tokens": 105527185.0, "step": 2763 }, { "epoch": 0.35160921002416995, "ewc_loss": 0.028734663501381874, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010790327360155061, "grad_norm": 4.310920715332031, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8544558882713318, "num_tokens": 105557401.0, "step": 2764 }, { "epoch": 0.3517364203027605, "ewc_loss": 0.02874772995710373, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.0001080339279724285, "grad_norm": 4.303292751312256, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8632588386535645, "num_tokens": 105586877.0, "step": 2765 }, { "epoch": 0.35186363058135095, "ewc_loss": 0.02876228280365467, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010817946895258501, "grad_norm": 4.282395839691162, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8605834245681763, "num_tokens": 105626988.0, "step": 2766 }, { "epoch": 0.3519908408599415, "ewc_loss": 0.028750520199537277, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010806185309775174, "grad_norm": 4.255743503570557, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8472248315811157, "num_tokens": 105665796.0, "step": 2767 }, { "epoch": 0.352118051138532, "ewc_loss": 0.028766920790076256, "ewc_loss_diag": 1.800060272216797e-05, "ewc_loss_parallel": 0.00010822584590641782, "grad_norm": 4.211066722869873, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8484041690826416, "num_tokens": 105708892.0, "step": 2768 }, { "epoch": 0.3522452614171225, "ewc_loss": 0.028881944715976715, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010815538553288206, "grad_norm": 4.287049770355225, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8496681451797485, "num_tokens": 105748221.0, "step": 2769 }, { "epoch": 0.352372471695713, "ewc_loss": 0.028952769935131073, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010886364179896191, "grad_norm": 4.301273822784424, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8530184626579285, "num_tokens": 105784568.0, "step": 2770 }, { "epoch": 0.35249968197430354, "ewc_loss": 0.028909623622894287, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010843217023648322, "grad_norm": 4.2523112297058105, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.8433398008346558, "num_tokens": 105827223.0, "step": 2771 }, { "epoch": 0.352626892252894, "ewc_loss": 0.028883611783385277, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010817205475177616, "grad_norm": 4.225738525390625, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8739224076271057, "num_tokens": 105871014.0, "step": 2772 }, { "epoch": 0.35275410253148454, "ewc_loss": 0.028914855793118477, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010848449892364442, "grad_norm": 4.317263603210449, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8522874116897583, "num_tokens": 105908995.0, "step": 2773 }, { "epoch": 0.35288131281007507, "ewc_loss": 0.028968337923288345, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010901931091211736, "grad_norm": 4.300657749176025, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8491438031196594, "num_tokens": 105948243.0, "step": 2774 }, { "epoch": 0.35300852308866554, "ewc_loss": 0.02889702282845974, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010830616520252079, "grad_norm": 4.280192852020264, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8578880429267883, "num_tokens": 105983997.0, "step": 2775 }, { "epoch": 0.35313573336725607, "ewc_loss": 0.028914911672472954, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.0001084850518964231, "grad_norm": 4.332196235656738, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8421962857246399, "num_tokens": 106020336.0, "step": 2776 }, { "epoch": 0.3532629436458466, "ewc_loss": 0.028932321816682816, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010865914373425767, "grad_norm": 4.3134918212890625, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8567990064620972, "num_tokens": 106053300.0, "step": 2777 }, { "epoch": 0.35339015392443707, "ewc_loss": 0.028908150270581245, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010841744369827211, "grad_norm": 4.279665946960449, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8705402612686157, "num_tokens": 106091790.0, "step": 2778 }, { "epoch": 0.3535173642030276, "ewc_loss": 0.02891262248158455, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010846215445781127, "grad_norm": 4.315798759460449, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8578815460205078, "num_tokens": 106123768.0, "step": 2779 }, { "epoch": 0.3536445744816181, "ewc_loss": 0.028912164270877838, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010845757060451433, "grad_norm": 4.313514709472656, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8626307845115662, "num_tokens": 106158219.0, "step": 2780 }, { "epoch": 0.35377178476020865, "ewc_loss": 0.02891269326210022, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010846288205357268, "grad_norm": 4.287960052490234, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8508637547492981, "num_tokens": 106195795.0, "step": 2781 }, { "epoch": 0.3538989950387991, "ewc_loss": 0.02893100306391716, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010864596697501838, "grad_norm": 4.3511962890625, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8556702136993408, "num_tokens": 106234973.0, "step": 2782 }, { "epoch": 0.35402620531738965, "ewc_loss": 0.028955798596143723, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010889392433455214, "grad_norm": 4.385138988494873, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8412889242172241, "num_tokens": 106267297.0, "step": 2783 }, { "epoch": 0.3541534155959802, "ewc_loss": 0.028984710574150085, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010918304906226695, "grad_norm": 4.242351531982422, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8631536960601807, "num_tokens": 106309070.0, "step": 2784 }, { "epoch": 0.35428062587457065, "ewc_loss": 0.028894582763314247, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010828176164068282, "grad_norm": 4.333091735839844, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8595585823059082, "num_tokens": 106345784.0, "step": 2785 }, { "epoch": 0.3544078361531612, "ewc_loss": 0.029110712930560112, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010922236106125638, "grad_norm": 4.281132221221924, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8637269139289856, "num_tokens": 106383172.0, "step": 2786 }, { "epoch": 0.3545350464317517, "ewc_loss": 0.02904258854687214, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.000108541120425798, "grad_norm": 4.373202323913574, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8536913990974426, "num_tokens": 106416916.0, "step": 2787 }, { "epoch": 0.3546622567103422, "ewc_loss": 0.02902156114578247, "ewc_loss_diag": 1.811981201171875e-05, "ewc_loss_parallel": 0.00010955153993563727, "grad_norm": 4.297714710235596, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.839928150177002, "num_tokens": 106456969.0, "step": 2788 }, { "epoch": 0.3547894669889327, "ewc_loss": 0.02903658337891102, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010848107194760814, "grad_norm": 4.346612453460693, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8534067273139954, "num_tokens": 106490476.0, "step": 2789 }, { "epoch": 0.35491667726752324, "ewc_loss": 0.029129233211278915, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.0001094075632863678, "grad_norm": 4.31148624420166, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8553900122642517, "num_tokens": 106525990.0, "step": 2790 }, { "epoch": 0.3550438875461137, "ewc_loss": 0.029091335833072662, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010902859503403306, "grad_norm": 4.4162516593933105, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8303576707839966, "num_tokens": 106562953.0, "step": 2791 }, { "epoch": 0.35517109782470424, "ewc_loss": 0.02914397604763508, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.000109554996015504, "grad_norm": 4.292469501495361, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8570427894592285, "num_tokens": 106599142.0, "step": 2792 }, { "epoch": 0.35529830810329477, "ewc_loss": 0.0290908794850111, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010902402573265135, "grad_norm": 4.269584655761719, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8671079874038696, "num_tokens": 106636743.0, "step": 2793 }, { "epoch": 0.35542551838188524, "ewc_loss": 0.029120344668626785, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010931869474006817, "grad_norm": 4.270749568939209, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8602851033210754, "num_tokens": 106678992.0, "step": 2794 }, { "epoch": 0.35555272866047577, "ewc_loss": 0.02911696583032608, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.0001092849051929079, "grad_norm": 4.292541980743408, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8453383445739746, "num_tokens": 106719884.0, "step": 2795 }, { "epoch": 0.3556799389390663, "ewc_loss": 0.029131347313523293, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010942870721919462, "grad_norm": 4.29872989654541, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8515537977218628, "num_tokens": 106755224.0, "step": 2796 }, { "epoch": 0.35580714921765677, "ewc_loss": 0.02911657653748989, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010928099800366908, "grad_norm": 4.279660224914551, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8670655488967896, "num_tokens": 106789366.0, "step": 2797 }, { "epoch": 0.3559343594962473, "ewc_loss": 0.02913181111216545, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010943333472823724, "grad_norm": 4.35636043548584, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8464940786361694, "num_tokens": 106826382.0, "step": 2798 }, { "epoch": 0.3560615697748378, "ewc_loss": 0.029158754274249077, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010970277799060568, "grad_norm": 4.299973964691162, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8467761278152466, "num_tokens": 106860027.0, "step": 2799 }, { "epoch": 0.3561887800534283, "ewc_loss": 0.029101191088557243, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010912714787991717, "grad_norm": 4.330838203430176, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8633424639701843, "num_tokens": 106898684.0, "step": 2800 }, { "epoch": 0.35631599033201883, "ewc_loss": 0.02914038486778736, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010951908188872039, "grad_norm": 4.341470241546631, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8557488322257996, "num_tokens": 106934496.0, "step": 2801 }, { "epoch": 0.35644320061060936, "ewc_loss": 0.02911624126136303, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010927765106316656, "grad_norm": 4.256012916564941, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8689376711845398, "num_tokens": 106974335.0, "step": 2802 }, { "epoch": 0.35657041088919983, "ewc_loss": 0.02909204363822937, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010903568181674927, "grad_norm": 4.330710411071777, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8308542966842651, "num_tokens": 107013914.0, "step": 2803 }, { "epoch": 0.35669762116779036, "ewc_loss": 0.029141200706362724, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010952724551316351, "grad_norm": 4.264036655426025, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8597086071968079, "num_tokens": 107053747.0, "step": 2804 }, { "epoch": 0.3568248314463809, "ewc_loss": 0.029077591374516487, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010889114491874352, "grad_norm": 4.3134894371032715, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8457289338111877, "num_tokens": 107095245.0, "step": 2805 }, { "epoch": 0.35695204172497136, "ewc_loss": 0.02914690598845482, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010958429629681632, "grad_norm": 4.3017802238464355, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8557878732681274, "num_tokens": 107133627.0, "step": 2806 }, { "epoch": 0.3570792520035619, "ewc_loss": 0.02908782660961151, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010899349581450224, "grad_norm": 4.299860000610352, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8307871222496033, "num_tokens": 107170128.0, "step": 2807 }, { "epoch": 0.3572064622821524, "ewc_loss": 0.029141731560230255, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010953255696222186, "grad_norm": 4.261627674102783, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8655076026916504, "num_tokens": 107209137.0, "step": 2808 }, { "epoch": 0.3573336725607429, "ewc_loss": 0.029093727469444275, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.0001090525183826685, "grad_norm": 4.316291809082031, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8505619764328003, "num_tokens": 107244866.0, "step": 2809 }, { "epoch": 0.3574608828393334, "ewc_loss": 0.02915593981742859, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.0001096746273105964, "grad_norm": 4.2684736251831055, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8555129170417786, "num_tokens": 107287399.0, "step": 2810 }, { "epoch": 0.35758809311792394, "ewc_loss": 0.029087122529745102, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010898645268753171, "grad_norm": 4.317938804626465, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8451626300811768, "num_tokens": 107325627.0, "step": 2811 }, { "epoch": 0.3577153033965144, "ewc_loss": 0.029277902096509933, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010967355774482712, "grad_norm": 4.30565881729126, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8701680898666382, "num_tokens": 107359375.0, "step": 2812 }, { "epoch": 0.35784251367510495, "ewc_loss": 0.029255788773298264, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010945242684101686, "grad_norm": 4.28920841217041, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8432236313819885, "num_tokens": 107400286.0, "step": 2813 }, { "epoch": 0.3579697239536955, "ewc_loss": 0.029261702671647072, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010951155854854733, "grad_norm": 4.360612869262695, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8464838266372681, "num_tokens": 107431768.0, "step": 2814 }, { "epoch": 0.35809693423228595, "ewc_loss": 0.029303742572665215, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010993195610353723, "grad_norm": 4.278409004211426, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8647680282592773, "num_tokens": 107470875.0, "step": 2815 }, { "epoch": 0.3582241445108765, "ewc_loss": 0.029216276481747627, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.0001090572914108634, "grad_norm": 4.326407432556152, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8454523682594299, "num_tokens": 107510988.0, "step": 2816 }, { "epoch": 0.358351354789467, "ewc_loss": 0.029267914593219757, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.000109573666122742, "grad_norm": 4.242922782897949, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8612551689147949, "num_tokens": 107552423.0, "step": 2817 }, { "epoch": 0.3584785650680575, "ewc_loss": 0.029193539172410965, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010882992501137778, "grad_norm": 4.308531761169434, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8473038077354431, "num_tokens": 107591693.0, "step": 2818 }, { "epoch": 0.358605775346648, "ewc_loss": 0.029291240498423576, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010980694059981033, "grad_norm": 4.314235210418701, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.860112190246582, "num_tokens": 107626017.0, "step": 2819 }, { "epoch": 0.35873298562523853, "ewc_loss": 0.02926388755440712, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010953339369734749, "grad_norm": 4.301580429077148, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.870466947555542, "num_tokens": 107664927.0, "step": 2820 }, { "epoch": 0.358860195903829, "ewc_loss": 0.029234975576400757, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010924429079750553, "grad_norm": 4.357728481292725, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8473097085952759, "num_tokens": 107702487.0, "step": 2821 }, { "epoch": 0.35898740618241953, "ewc_loss": 0.029289964586496353, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010979418584611267, "grad_norm": 4.280192852020264, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8676692843437195, "num_tokens": 107741263.0, "step": 2822 }, { "epoch": 0.35911461646101006, "ewc_loss": 0.029233019798994064, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010922472574748099, "grad_norm": 4.301541805267334, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8575114607810974, "num_tokens": 107781079.0, "step": 2823 }, { "epoch": 0.35924182673960053, "ewc_loss": 0.02929648756980896, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010985940753016621, "grad_norm": 4.331852436065674, "learning_rate": 1e-06, "loss": 0.3983, "mean_token_accuracy": 0.8736048340797424, "num_tokens": 107819309.0, "step": 2824 }, { "epoch": 0.35936903701819106, "ewc_loss": 0.029361866414546967, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 0.00010929249401669949, "grad_norm": 9.94567584991455, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8566383719444275, "num_tokens": 107853251.0, "step": 2825 }, { "epoch": 0.3594962472967816, "ewc_loss": 0.03392469137907028, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 0.00015492072270717472, "grad_norm": 5.256350517272949, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8758901357650757, "num_tokens": 107893476.0, "step": 2826 }, { "epoch": 0.35962345757537206, "ewc_loss": 0.028717882931232452, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00010407335503259674, "grad_norm": 3.8227181434631348, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8652427196502686, "num_tokens": 107931204.0, "step": 2827 }, { "epoch": 0.3597506678539626, "ewc_loss": 0.029989590868353844, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011679044109769166, "grad_norm": 4.889578342437744, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8379669189453125, "num_tokens": 107966471.0, "step": 2828 }, { "epoch": 0.3598778781325531, "ewc_loss": 0.03073454648256302, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00012423998850863427, "grad_norm": 4.403083324432373, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8567653298377991, "num_tokens": 108005328.0, "step": 2829 }, { "epoch": 0.36000508841114365, "ewc_loss": 0.029406525194644928, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011095979425590485, "grad_norm": 4.584751129150391, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8439987301826477, "num_tokens": 108046295.0, "step": 2830 }, { "epoch": 0.3601322986897341, "ewc_loss": 0.030076172202825546, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011765626550186425, "grad_norm": 4.531632423400879, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8576399087905884, "num_tokens": 108078164.0, "step": 2831 }, { "epoch": 0.36025950896832465, "ewc_loss": 0.02953246235847473, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011221915337955579, "grad_norm": 4.375125885009766, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8503051400184631, "num_tokens": 108117105.0, "step": 2832 }, { "epoch": 0.3603867192469152, "ewc_loss": 0.029414905235171318, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00011226428614463657, "grad_norm": 4.418945789337158, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8524385094642639, "num_tokens": 108155301.0, "step": 2833 }, { "epoch": 0.36051392952550565, "ewc_loss": 0.029406480491161346, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00011218003055546433, "grad_norm": 4.366584777832031, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8659553527832031, "num_tokens": 108193091.0, "step": 2834 }, { "epoch": 0.3606411398040962, "ewc_loss": 0.029261421412229538, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00011072945198975503, "grad_norm": 4.31835412979126, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8531844615936279, "num_tokens": 108233719.0, "step": 2835 }, { "epoch": 0.3607683500826867, "ewc_loss": 0.029260456562042236, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00011071980406995863, "grad_norm": 4.4197540283203125, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.861443817615509, "num_tokens": 108265775.0, "step": 2836 }, { "epoch": 0.3608955603612772, "ewc_loss": 0.029286790639162064, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00011098314280388877, "grad_norm": 4.314381122589111, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8654868602752686, "num_tokens": 108307953.0, "step": 2837 }, { "epoch": 0.3610227706398677, "ewc_loss": 0.029188262298703194, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.0001099978617276065, "grad_norm": 4.334812641143799, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.851578414440155, "num_tokens": 108351998.0, "step": 2838 }, { "epoch": 0.36114998091845824, "ewc_loss": 0.0292449239641428, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00011056446965085343, "grad_norm": 4.346022605895996, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8430613279342651, "num_tokens": 108395968.0, "step": 2839 }, { "epoch": 0.3612771911970487, "ewc_loss": 0.029170189052820206, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010981712694047019, "grad_norm": 4.385927677154541, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8359692096710205, "num_tokens": 108433045.0, "step": 2840 }, { "epoch": 0.36140440147563924, "ewc_loss": 0.0292415339499712, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00011053057824028656, "grad_norm": 4.3522162437438965, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8506612777709961, "num_tokens": 108470964.0, "step": 2841 }, { "epoch": 0.36153161175422976, "ewc_loss": 0.02915073372423649, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010962256783386692, "grad_norm": 4.284963607788086, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8556677103042603, "num_tokens": 108512572.0, "step": 2842 }, { "epoch": 0.36165882203282024, "ewc_loss": 0.029173165559768677, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00010984689288306981, "grad_norm": 4.367816925048828, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8403455018997192, "num_tokens": 108548526.0, "step": 2843 }, { "epoch": 0.36178603231141077, "ewc_loss": 0.029235003516077995, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00011046526924474165, "grad_norm": 4.337179660797119, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8519893288612366, "num_tokens": 108588447.0, "step": 2844 }, { "epoch": 0.3619132425900013, "ewc_loss": 0.029159631580114365, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.000109711560071446, "grad_norm": 4.348684787750244, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.845989465713501, "num_tokens": 108628426.0, "step": 2845 }, { "epoch": 0.36204045286859177, "ewc_loss": 0.029203910380601883, "ewc_loss_diag": 1.823902130126953e-05, "ewc_loss_parallel": 0.00011015433119609952, "grad_norm": 4.372440814971924, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8501049280166626, "num_tokens": 108661355.0, "step": 2846 }, { "epoch": 0.3621676631471823, "ewc_loss": 0.029330234974622726, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011019688099622726, "grad_norm": 4.3201904296875, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8724386096000671, "num_tokens": 108698505.0, "step": 2847 }, { "epoch": 0.3622948734257728, "ewc_loss": 0.029529022052884102, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00010974334873026237, "grad_norm": 8.050488471984863, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8596282005310059, "num_tokens": 108737409.0, "step": 2848 }, { "epoch": 0.3624220837043633, "ewc_loss": 0.032729532569646835, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 0.00014296914741862565, "grad_norm": 5.0113654136657715, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8556160926818848, "num_tokens": 108772969.0, "step": 2849 }, { "epoch": 0.3625492939829538, "ewc_loss": 0.02879948541522026, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 0.0001036686880979687, "grad_norm": 4.119823932647705, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8618297576904297, "num_tokens": 108805129.0, "step": 2850 }, { "epoch": 0.36267650426154435, "ewc_loss": 0.029863692820072174, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 0.00011431075836298987, "grad_norm": 4.653265476226807, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8406574726104736, "num_tokens": 108841197.0, "step": 2851 }, { "epoch": 0.3628037145401348, "ewc_loss": 0.030061490833759308, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011506804003147408, "grad_norm": 4.331706523895264, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8684864044189453, "num_tokens": 108876442.0, "step": 2852 }, { "epoch": 0.36293092481872535, "ewc_loss": 0.029576577246189117, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011021889076801017, "grad_norm": 4.495124340057373, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.8422961235046387, "num_tokens": 108911432.0, "step": 2853 }, { "epoch": 0.3630581350973159, "ewc_loss": 0.029696153476834297, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011385606194380671, "grad_norm": 4.408748149871826, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8412407636642456, "num_tokens": 108952977.0, "step": 2854 }, { "epoch": 0.36318534537590635, "ewc_loss": 0.029398217797279358, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011087671009590849, "grad_norm": 4.383420944213867, "learning_rate": 1e-06, "loss": 0.5461, "mean_token_accuracy": 0.8321694731712341, "num_tokens": 108995602.0, "step": 2855 }, { "epoch": 0.3633125556544969, "ewc_loss": 0.029747329652309418, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011192642705282196, "grad_norm": 4.3277587890625, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8564975261688232, "num_tokens": 109040226.0, "step": 2856 }, { "epoch": 0.3634397659330874, "ewc_loss": 0.029681185260415077, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011126497702207416, "grad_norm": 4.413766860961914, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8567876219749451, "num_tokens": 109082898.0, "step": 2857 }, { "epoch": 0.3635669762116779, "ewc_loss": 0.029735583811998367, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011180895671714097, "grad_norm": 4.406907081604004, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8466126918792725, "num_tokens": 109122460.0, "step": 2858 }, { "epoch": 0.3636941864902684, "ewc_loss": 0.029412543401122093, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011101996642537415, "grad_norm": 4.3736467361450195, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8540539741516113, "num_tokens": 109161680.0, "step": 2859 }, { "epoch": 0.36382139676885894, "ewc_loss": 0.0294177308678627, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011107185127912089, "grad_norm": 4.315029144287109, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8737260103225708, "num_tokens": 109198943.0, "step": 2860 }, { "epoch": 0.3639486070474494, "ewc_loss": 0.02937363274395466, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011063086276408285, "grad_norm": 4.358292579650879, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8626385927200317, "num_tokens": 109235719.0, "step": 2861 }, { "epoch": 0.36407581732603994, "ewc_loss": 0.029435433447360992, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011124885350000113, "grad_norm": 4.392786979675293, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8487927913665771, "num_tokens": 109272681.0, "step": 2862 }, { "epoch": 0.36420302760463047, "ewc_loss": 0.029407402500510216, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011096855450887233, "grad_norm": 4.4371538162231445, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8431493043899536, "num_tokens": 109306114.0, "step": 2863 }, { "epoch": 0.36433023788322094, "ewc_loss": 0.029675455763936043, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011120767885586247, "grad_norm": 4.408749580383301, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8462202548980713, "num_tokens": 109337728.0, "step": 2864 }, { "epoch": 0.36445744816181147, "ewc_loss": 0.029393594712018967, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011083047138527036, "grad_norm": 4.352854251861572, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8462982177734375, "num_tokens": 109377809.0, "step": 2865 }, { "epoch": 0.364584658440402, "ewc_loss": 0.029405560344457626, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.0001109501245082356, "grad_norm": 4.340303421020508, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8690190315246582, "num_tokens": 109413499.0, "step": 2866 }, { "epoch": 0.36471186871899247, "ewc_loss": 0.029376892372965813, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011066345905419439, "grad_norm": 4.280087471008301, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8448548913002014, "num_tokens": 109460961.0, "step": 2867 }, { "epoch": 0.364839078997583, "ewc_loss": 0.02939612790942192, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011085579899372533, "grad_norm": 4.3248701095581055, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8655924201011658, "num_tokens": 109500615.0, "step": 2868 }, { "epoch": 0.36496628927617353, "ewc_loss": 0.029418405145406723, "ewc_loss_diag": 1.8358230590820312e-05, "ewc_loss_parallel": 0.00011107859609182924, "grad_norm": 4.370061874389648, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8579593300819397, "num_tokens": 109533921.0, "step": 2869 }, { "epoch": 0.365093499554764, "ewc_loss": 0.029702603816986084, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011147915211040527, "grad_norm": 4.4530487060546875, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8451178073883057, "num_tokens": 109567069.0, "step": 2870 }, { "epoch": 0.36522070983335453, "ewc_loss": 0.029696999117732048, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011142311996081844, "grad_norm": 4.332851886749268, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8677880167961121, "num_tokens": 109603696.0, "step": 2871 }, { "epoch": 0.36534792011194506, "ewc_loss": 0.029489487409591675, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 0.00011056870425818488, "grad_norm": 4.308366775512695, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8719812631607056, "num_tokens": 109644590.0, "step": 2872 }, { "epoch": 0.36547513039053553, "ewc_loss": 0.029538094997406006, "ewc_loss_diag": 1.8477439880371094e-05, "ewc_loss_parallel": 0.00011105478188255802, "grad_norm": 4.3872389793396, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8504104614257812, "num_tokens": 109678869.0, "step": 2873 }, { "epoch": 0.36560234066912606, "ewc_loss": 0.029683204367756844, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011128516780445352, "grad_norm": 4.286322593688965, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8500056266784668, "num_tokens": 109722517.0, "step": 2874 }, { "epoch": 0.3657295509477166, "ewc_loss": 0.029626324772834778, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011071636981796473, "grad_norm": 4.392485618591309, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8515310287475586, "num_tokens": 109757869.0, "step": 2875 }, { "epoch": 0.36585676122630706, "ewc_loss": 0.029745716601610184, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011191029625479132, "grad_norm": 4.376511573791504, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8511428833007812, "num_tokens": 109793162.0, "step": 2876 }, { "epoch": 0.3659839715048976, "ewc_loss": 0.02965652197599411, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.0001110183511627838, "grad_norm": 4.314600467681885, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8614556193351746, "num_tokens": 109832942.0, "step": 2877 }, { "epoch": 0.3661111817834881, "ewc_loss": 0.029671166092157364, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011116477253381163, "grad_norm": 4.34377908706665, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8531770706176758, "num_tokens": 109876230.0, "step": 2878 }, { "epoch": 0.3662383920620786, "ewc_loss": 0.029700351879000664, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011145664029754698, "grad_norm": 4.386880397796631, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8452805280685425, "num_tokens": 109911844.0, "step": 2879 }, { "epoch": 0.3663656023406691, "ewc_loss": 0.029703445732593536, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.0001114875849452801, "grad_norm": 4.335949420928955, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8422902822494507, "num_tokens": 109957619.0, "step": 2880 }, { "epoch": 0.36649281261925964, "ewc_loss": 0.02978525683283806, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011108497710665688, "grad_norm": 4.34756326675415, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8442821502685547, "num_tokens": 109999117.0, "step": 2881 }, { "epoch": 0.3666200228978502, "ewc_loss": 0.02971203625202179, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011157347762491554, "grad_norm": 4.28216028213501, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8603917956352234, "num_tokens": 110039557.0, "step": 2882 }, { "epoch": 0.36674723317644065, "ewc_loss": 0.029794026166200638, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011117268877569586, "grad_norm": 4.423614025115967, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8462534546852112, "num_tokens": 110074418.0, "step": 2883 }, { "epoch": 0.3668744434550312, "ewc_loss": 0.02988710254430771, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011210343654965982, "grad_norm": 4.334357738494873, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8597985506057739, "num_tokens": 110109255.0, "step": 2884 }, { "epoch": 0.3670016537336217, "ewc_loss": 0.029659327119588852, "ewc_loss_diag": 1.8596649169921875e-05, "ewc_loss_parallel": 0.00011104640725534409, "grad_norm": 4.337281227111816, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8609132170677185, "num_tokens": 110150180.0, "step": 2885 }, { "epoch": 0.3671288640122122, "ewc_loss": 0.029864929616451263, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011188170901732519, "grad_norm": 4.345327377319336, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8385102152824402, "num_tokens": 110193653.0, "step": 2886 }, { "epoch": 0.3672560742908027, "ewc_loss": 0.029845096170902252, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011168339551659301, "grad_norm": 4.38690710067749, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8619003891944885, "num_tokens": 110231020.0, "step": 2887 }, { "epoch": 0.36738328456939323, "ewc_loss": 0.029851533472537994, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011174775136169046, "grad_norm": 4.30985689163208, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8701725006103516, "num_tokens": 110269397.0, "step": 2888 }, { "epoch": 0.3675104948479837, "ewc_loss": 0.029805149883031845, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011128390906378627, "grad_norm": 4.294961929321289, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8695223331451416, "num_tokens": 110311188.0, "step": 2889 }, { "epoch": 0.36763770512657423, "ewc_loss": 0.02983395755290985, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011157198605360463, "grad_norm": 4.413932800292969, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8596175312995911, "num_tokens": 110343801.0, "step": 2890 }, { "epoch": 0.36776491540516476, "ewc_loss": 0.029884152114391327, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011207395436940715, "grad_norm": 4.317198276519775, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8383175134658813, "num_tokens": 110385830.0, "step": 2891 }, { "epoch": 0.36789212568375523, "ewc_loss": 0.02979131042957306, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.0001111455203499645, "grad_norm": 4.416916370391846, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8540005683898926, "num_tokens": 110421494.0, "step": 2892 }, { "epoch": 0.36801933596234576, "ewc_loss": 0.02991594187915325, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011239184095757082, "grad_norm": 4.311907768249512, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8539180755615234, "num_tokens": 110463132.0, "step": 2893 }, { "epoch": 0.3681465462409363, "ewc_loss": 0.029809333384037018, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011132575309602544, "grad_norm": 4.3608527183532715, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8582079410552979, "num_tokens": 110498330.0, "step": 2894 }, { "epoch": 0.36827375651952676, "ewc_loss": 0.02989059127867222, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011213833204237744, "grad_norm": 4.316086769104004, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.86325603723526, "num_tokens": 110534438.0, "step": 2895 }, { "epoch": 0.3684009667981173, "ewc_loss": 0.0298476405441761, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011170883954036981, "grad_norm": 4.3728766441345215, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8499322533607483, "num_tokens": 110572059.0, "step": 2896 }, { "epoch": 0.3685281770767078, "ewc_loss": 0.029895547777414322, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011218788858968765, "grad_norm": 4.3100481033325195, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8575177788734436, "num_tokens": 110611012.0, "step": 2897 }, { "epoch": 0.3686553873552983, "ewc_loss": 0.02985108084976673, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011174323299201205, "grad_norm": 4.330924987792969, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8622138500213623, "num_tokens": 110647429.0, "step": 2898 }, { "epoch": 0.3687825976338888, "ewc_loss": 0.029903020709753036, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011226261995034292, "grad_norm": 4.393798351287842, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8491822481155396, "num_tokens": 110685052.0, "step": 2899 }, { "epoch": 0.36890980791247935, "ewc_loss": 0.029908575117588043, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.0001123181646107696, "grad_norm": 4.346140384674072, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.8346217274665833, "num_tokens": 110725222.0, "step": 2900 }, { "epoch": 0.3690370181910698, "ewc_loss": 0.02985646203160286, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011179703869856894, "grad_norm": 4.356894493103027, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8430872559547424, "num_tokens": 110764314.0, "step": 2901 }, { "epoch": 0.36916422846966035, "ewc_loss": 0.029915597289800644, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011238838487770408, "grad_norm": 4.311049461364746, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.866779625415802, "num_tokens": 110803585.0, "step": 2902 }, { "epoch": 0.3692914387482509, "ewc_loss": 0.029876120388507843, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011199361324543133, "grad_norm": 4.3434906005859375, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8498384356498718, "num_tokens": 110846532.0, "step": 2903 }, { "epoch": 0.36941864902684135, "ewc_loss": 0.029902104288339615, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011225345224374905, "grad_norm": 4.325228691101074, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8526938557624817, "num_tokens": 110889807.0, "step": 2904 }, { "epoch": 0.3695458593054319, "ewc_loss": 0.02989937551319599, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011222617467865348, "grad_norm": 4.361893653869629, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8355937004089355, "num_tokens": 110930567.0, "step": 2905 }, { "epoch": 0.3696730695840224, "ewc_loss": 0.029896941035985947, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.0001122018220485188, "grad_norm": 4.326112747192383, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8622500896453857, "num_tokens": 110968725.0, "step": 2906 }, { "epoch": 0.3698002798626129, "ewc_loss": 0.029871374368667603, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011194615217391402, "grad_norm": 4.353957653045654, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8788050413131714, "num_tokens": 111002724.0, "step": 2907 }, { "epoch": 0.3699274901412034, "ewc_loss": 0.029935643076896667, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011258884478593245, "grad_norm": 4.387699127197266, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8542084693908691, "num_tokens": 111036668.0, "step": 2908 }, { "epoch": 0.37005470041979394, "ewc_loss": 0.029894735664129257, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011217976862099022, "grad_norm": 4.3567986488342285, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.8445874452590942, "num_tokens": 111077712.0, "step": 2909 }, { "epoch": 0.3701819106983844, "ewc_loss": 0.02989734709262848, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011220588930882514, "grad_norm": 4.378873348236084, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8457791209220886, "num_tokens": 111118856.0, "step": 2910 }, { "epoch": 0.37030912097697494, "ewc_loss": 0.02991468273103237, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011237924627494067, "grad_norm": 4.388914585113525, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8473917245864868, "num_tokens": 111156712.0, "step": 2911 }, { "epoch": 0.37043633125556547, "ewc_loss": 0.02992212027311325, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011245363566558808, "grad_norm": 4.369226455688477, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8561466932296753, "num_tokens": 111195542.0, "step": 2912 }, { "epoch": 0.37056354153415594, "ewc_loss": 0.02987455576658249, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011197796993656084, "grad_norm": 4.344820976257324, "learning_rate": 1e-06, "loss": 0.5373, "mean_token_accuracy": 0.8367810249328613, "num_tokens": 111237827.0, "step": 2913 }, { "epoch": 0.37069075181274647, "ewc_loss": 0.029886534437537193, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011209776857867837, "grad_norm": 4.305488109588623, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8624976873397827, "num_tokens": 111278802.0, "step": 2914 }, { "epoch": 0.370817962091337, "ewc_loss": 0.02987746149301529, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011200704466318712, "grad_norm": 4.377252578735352, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8515070676803589, "num_tokens": 111313795.0, "step": 2915 }, { "epoch": 0.37094517236992747, "ewc_loss": 0.029967425391077995, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011290668044239283, "grad_norm": 4.340111255645752, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8618980646133423, "num_tokens": 111353798.0, "step": 2916 }, { "epoch": 0.371072382648518, "ewc_loss": 0.02988392487168312, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011207166244275868, "grad_norm": 4.379868984222412, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8546268939971924, "num_tokens": 111390613.0, "step": 2917 }, { "epoch": 0.3711995929271085, "ewc_loss": 0.02995525673031807, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011278497549938038, "grad_norm": 4.297267436981201, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8554695248603821, "num_tokens": 111432043.0, "step": 2918 }, { "epoch": 0.371326803205699, "ewc_loss": 0.029890216886997223, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011213459947612137, "grad_norm": 4.3648505210876465, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8608976602554321, "num_tokens": 111476575.0, "step": 2919 }, { "epoch": 0.3714540134842895, "ewc_loss": 0.029995515942573547, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.0001131875833380036, "grad_norm": 4.435580253601074, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8553817868232727, "num_tokens": 111515304.0, "step": 2920 }, { "epoch": 0.37158122376288005, "ewc_loss": 0.029954563826322556, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.0001127780633396469, "grad_norm": 4.41775369644165, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8641039729118347, "num_tokens": 111550246.0, "step": 2921 }, { "epoch": 0.3717084340414705, "ewc_loss": 0.02992667630314827, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011249919043621048, "grad_norm": 4.369028568267822, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8556202054023743, "num_tokens": 111592189.0, "step": 2922 }, { "epoch": 0.37183564432006105, "ewc_loss": 0.029883967712521553, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011207209900021553, "grad_norm": 4.3354034423828125, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8580960035324097, "num_tokens": 111638177.0, "step": 2923 }, { "epoch": 0.3719628545986516, "ewc_loss": 0.0298771969974041, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011200438166270033, "grad_norm": 4.448712348937988, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8556722402572632, "num_tokens": 111674365.0, "step": 2924 }, { "epoch": 0.37209006487724205, "ewc_loss": 0.029964132234454155, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011287374218227342, "grad_norm": 4.415437698364258, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8500974178314209, "num_tokens": 111714571.0, "step": 2925 }, { "epoch": 0.3722172751558326, "ewc_loss": 0.029871292412281036, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011194535909453407, "grad_norm": 4.406956195831299, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8552912473678589, "num_tokens": 111755214.0, "step": 2926 }, { "epoch": 0.3723444854344231, "ewc_loss": 0.029903803020715714, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011227044888073578, "grad_norm": 4.35617208480835, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.828209638595581, "num_tokens": 111796516.0, "step": 2927 }, { "epoch": 0.3724716957130136, "ewc_loss": 0.02989739552140236, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011220637679798529, "grad_norm": 4.405245780944824, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8392646312713623, "num_tokens": 111832615.0, "step": 2928 }, { "epoch": 0.3725989059916041, "ewc_loss": 0.029929427430033684, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.0001125266935559921, "grad_norm": 4.368046283721924, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.844566285610199, "num_tokens": 111874682.0, "step": 2929 }, { "epoch": 0.37272611627019464, "ewc_loss": 0.029893867671489716, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011217109567951411, "grad_norm": 4.357912063598633, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8430199027061462, "num_tokens": 111916369.0, "step": 2930 }, { "epoch": 0.37285332654878517, "ewc_loss": 0.02994558773934841, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011268829985056072, "grad_norm": 4.369023323059082, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8482677936553955, "num_tokens": 111950671.0, "step": 2931 }, { "epoch": 0.37298053682737564, "ewc_loss": 0.029950391501188278, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011273635027464479, "grad_norm": 4.366030216217041, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8376461863517761, "num_tokens": 111988846.0, "step": 2932 }, { "epoch": 0.37310774710596617, "ewc_loss": 0.0299754049628973, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011298646859358996, "grad_norm": 4.407990455627441, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8646712303161621, "num_tokens": 112022853.0, "step": 2933 }, { "epoch": 0.3732349573845567, "ewc_loss": 0.02996188774704933, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011285131040494889, "grad_norm": 4.310577869415283, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8508641719818115, "num_tokens": 112065631.0, "step": 2934 }, { "epoch": 0.37336216766314717, "ewc_loss": 0.0299554243683815, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011278666352154687, "grad_norm": 4.391132831573486, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8441674709320068, "num_tokens": 112107236.0, "step": 2935 }, { "epoch": 0.3734893779417377, "ewc_loss": 0.030042890459299088, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011366131366230547, "grad_norm": 4.36033821105957, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8320231437683105, "num_tokens": 112153914.0, "step": 2936 }, { "epoch": 0.3736165882203282, "ewc_loss": 0.029973626136779785, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011296867887722328, "grad_norm": 4.436588287353516, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8573300242424011, "num_tokens": 112187596.0, "step": 2937 }, { "epoch": 0.3737437984989187, "ewc_loss": 0.03017423301935196, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011375406029401347, "grad_norm": 4.389880180358887, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8533584475517273, "num_tokens": 112222409.0, "step": 2938 }, { "epoch": 0.37387100877750923, "ewc_loss": 0.03010069765150547, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011301869380986318, "grad_norm": 4.401346683502197, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8419597148895264, "num_tokens": 112257699.0, "step": 2939 }, { "epoch": 0.37399821905609976, "ewc_loss": 0.03004072792828083, "ewc_loss_diag": 1.8715858459472656e-05, "ewc_loss_parallel": 0.00011363970406819135, "grad_norm": 4.350126266479492, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8628793954849243, "num_tokens": 112292431.0, "step": 2940 }, { "epoch": 0.37412542933469023, "ewc_loss": 0.030111871659755707, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.0001131304306909442, "grad_norm": 4.377572536468506, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8559234142303467, "num_tokens": 112331241.0, "step": 2941 }, { "epoch": 0.37425263961328076, "ewc_loss": 0.030170075595378876, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011371247819624841, "grad_norm": 4.368445873260498, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8506661653518677, "num_tokens": 112368486.0, "step": 2942 }, { "epoch": 0.3743798498918713, "ewc_loss": 0.030116353183984756, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011317525058984756, "grad_norm": 5.479369640350342, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8451387882232666, "num_tokens": 112409638.0, "step": 2943 }, { "epoch": 0.37450706017046176, "ewc_loss": 0.03094414994120598, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00012145320943091065, "grad_norm": 4.374360084533691, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8492893576622009, "num_tokens": 112448257.0, "step": 2944 }, { "epoch": 0.3746342704490523, "ewc_loss": 0.029709583148360252, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00010910754645010456, "grad_norm": 4.291856288909912, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.869152307510376, "num_tokens": 112488739.0, "step": 2945 }, { "epoch": 0.3747614807276428, "ewc_loss": 0.030242303386330605, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011443474795669317, "grad_norm": 4.40950345993042, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8390462398529053, "num_tokens": 112528621.0, "step": 2946 }, { "epoch": 0.3748886910062333, "ewc_loss": 0.03009587898850441, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011297051969449967, "grad_norm": 4.392910480499268, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8502641916275024, "num_tokens": 112564615.0, "step": 2947 }, { "epoch": 0.3750159012848238, "ewc_loss": 0.03007475659251213, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011275929136900231, "grad_norm": 4.509964466094971, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8425094485282898, "num_tokens": 112598190.0, "step": 2948 }, { "epoch": 0.37514311156341434, "ewc_loss": 0.03016226552426815, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011363437806721777, "grad_norm": 4.3875203132629395, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8534854650497437, "num_tokens": 112632078.0, "step": 2949 }, { "epoch": 0.3752703218420048, "ewc_loss": 0.030063040554523468, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.0001126421193475835, "grad_norm": 4.4542131423950195, "learning_rate": 1e-06, "loss": 0.539, "mean_token_accuracy": 0.831555187702179, "num_tokens": 112667848.0, "step": 2950 }, { "epoch": 0.37539753212059535, "ewc_loss": 0.0301726832985878, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011373856250429526, "grad_norm": 4.35699987411499, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8518701791763306, "num_tokens": 112707037.0, "step": 2951 }, { "epoch": 0.3755247423991859, "ewc_loss": 0.030084818601608276, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011285990331089124, "grad_norm": 4.325549602508545, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8629193305969238, "num_tokens": 112751921.0, "step": 2952 }, { "epoch": 0.37565195267777635, "ewc_loss": 0.030132682994008064, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011333854490658268, "grad_norm": 4.370423793792725, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8585792779922485, "num_tokens": 112793061.0, "step": 2953 }, { "epoch": 0.3757791629563669, "ewc_loss": 0.03025311976671219, "ewc_loss_diag": 1.895427703857422e-05, "ewc_loss_parallel": 0.00011332220310578123, "grad_norm": 4.328719139099121, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8638903498649597, "num_tokens": 112833852.0, "step": 2954 }, { "epoch": 0.3759063732349574, "ewc_loss": 0.030121900141239166, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.0001132307224906981, "grad_norm": 4.429008960723877, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8504328727722168, "num_tokens": 112866718.0, "step": 2955 }, { "epoch": 0.3760335835135479, "ewc_loss": 0.03019842691719532, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011399598588468507, "grad_norm": 4.360939025878906, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8554284572601318, "num_tokens": 112909420.0, "step": 2956 }, { "epoch": 0.3761607937921384, "ewc_loss": 0.030101407319307327, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.0001130257805925794, "grad_norm": 4.4360833168029785, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8556350469589233, "num_tokens": 112944570.0, "step": 2957 }, { "epoch": 0.37628800407072893, "ewc_loss": 0.0301908440887928, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011392017040634528, "grad_norm": 4.310489177703857, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8663281798362732, "num_tokens": 112984107.0, "step": 2958 }, { "epoch": 0.3764152143493194, "ewc_loss": 0.030082987621426582, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011284159700153396, "grad_norm": 4.333820343017578, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.87713623046875, "num_tokens": 113023066.0, "step": 2959 }, { "epoch": 0.37654242462790993, "ewc_loss": 0.03017139434814453, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011372566223144531, "grad_norm": 4.411744594573975, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8525975942611694, "num_tokens": 113060093.0, "step": 2960 }, { "epoch": 0.37666963490650046, "ewc_loss": 0.030171487480401993, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011372658627806231, "grad_norm": 4.738341331481934, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8456777334213257, "num_tokens": 113095940.0, "step": 2961 }, { "epoch": 0.37679684518509093, "ewc_loss": 0.03031284734606743, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011514018115121871, "grad_norm": 4.3454155921936035, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8654834628105164, "num_tokens": 113130699.0, "step": 2962 }, { "epoch": 0.37692405546368146, "ewc_loss": 0.03024272993206978, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.0001119976004702039, "grad_norm": 5.491722583770752, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.853797435760498, "num_tokens": 113169145.0, "step": 2963 }, { "epoch": 0.377051265742272, "ewc_loss": 0.03130926936864853, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00012266298290342093, "grad_norm": 4.488972187042236, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8678585290908813, "num_tokens": 113201987.0, "step": 2964 }, { "epoch": 0.37717847602086246, "ewc_loss": 0.029726874083280563, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00010928046685876325, "grad_norm": 4.346641540527344, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8606003522872925, "num_tokens": 113239588.0, "step": 2965 }, { "epoch": 0.377305686299453, "ewc_loss": 0.030228257179260254, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011429428559495136, "grad_norm": 4.417466640472412, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8564729690551758, "num_tokens": 113277075.0, "step": 2966 }, { "epoch": 0.3774328965780435, "ewc_loss": 0.030076827853918076, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011277999146841466, "grad_norm": 4.352283954620361, "learning_rate": 1e-06, "loss": 0.3756, "mean_token_accuracy": 0.8815292119979858, "num_tokens": 113313708.0, "step": 2967 }, { "epoch": 0.377560106856634, "ewc_loss": 0.030351944267749786, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011308975808788091, "grad_norm": 4.496199131011963, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.8371162414550781, "num_tokens": 113344027.0, "step": 2968 }, { "epoch": 0.3776873171352245, "ewc_loss": 0.030177829787135124, "ewc_loss_diag": 1.8835067749023438e-05, "ewc_loss_parallel": 0.00011379001807654276, "grad_norm": 4.2829179763793945, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8581442832946777, "num_tokens": 113388444.0, "step": 2969 }, { "epoch": 0.37781452741381505, "ewc_loss": 0.03030611388385296, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011263145279372111, "grad_norm": 4.378488063812256, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8503156900405884, "num_tokens": 113428660.0, "step": 2970 }, { "epoch": 0.3779417376924055, "ewc_loss": 0.030455198138952255, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011412228195695207, "grad_norm": 4.410897254943848, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8330845832824707, "num_tokens": 113469160.0, "step": 2971 }, { "epoch": 0.37806894797099605, "ewc_loss": 0.030439302325248718, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011396333866287023, "grad_norm": 4.425107955932617, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8311392664909363, "num_tokens": 113504291.0, "step": 2972 }, { "epoch": 0.3781961582495866, "ewc_loss": 0.030430868268013, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.000113878988486249, "grad_norm": 4.301358222961426, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8561737537384033, "num_tokens": 113548085.0, "step": 2973 }, { "epoch": 0.37832336852817705, "ewc_loss": 0.03042151778936386, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011378549970686436, "grad_norm": 4.465309143066406, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8599504232406616, "num_tokens": 113581664.0, "step": 2974 }, { "epoch": 0.3784505788067676, "ewc_loss": 0.030517326667904854, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011474357597762719, "grad_norm": 4.388847827911377, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8671713471412659, "num_tokens": 113618552.0, "step": 2975 }, { "epoch": 0.3785777890853581, "ewc_loss": 0.03040497750043869, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011362009536242113, "grad_norm": 4.461146354675293, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.833972692489624, "num_tokens": 113653406.0, "step": 2976 }, { "epoch": 0.3787049993639486, "ewc_loss": 0.03050335682928562, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011460387759143487, "grad_norm": 4.349713325500488, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8635655641555786, "num_tokens": 113693803.0, "step": 2977 }, { "epoch": 0.3788322096425391, "ewc_loss": 0.030392883345484734, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011349914711900055, "grad_norm": 4.346876621246338, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.868743360042572, "num_tokens": 113734637.0, "step": 2978 }, { "epoch": 0.37895941992112964, "ewc_loss": 0.030476318672299385, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.0001143334957305342, "grad_norm": 4.367379188537598, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8479673862457275, "num_tokens": 113776579.0, "step": 2979 }, { "epoch": 0.37908663019972016, "ewc_loss": 0.03046233579516411, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.0001141936591011472, "grad_norm": 4.381883144378662, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8438600301742554, "num_tokens": 113818857.0, "step": 2980 }, { "epoch": 0.37921384047831064, "ewc_loss": 0.030448857694864273, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011405890109017491, "grad_norm": 4.399096965789795, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8466229438781738, "num_tokens": 113853248.0, "step": 2981 }, { "epoch": 0.37934105075690117, "ewc_loss": 0.030492328107357025, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011449358134996146, "grad_norm": 4.411291599273682, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8599698543548584, "num_tokens": 113894206.0, "step": 2982 }, { "epoch": 0.3794682610354917, "ewc_loss": 0.030477093532681465, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011434125190135092, "grad_norm": 4.408387660980225, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8610974550247192, "num_tokens": 113931309.0, "step": 2983 }, { "epoch": 0.37959547131408217, "ewc_loss": 0.030469108372926712, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011426138371462002, "grad_norm": 4.330513954162598, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8435657024383545, "num_tokens": 113976559.0, "step": 2984 }, { "epoch": 0.3797226815926727, "ewc_loss": 0.030441176146268845, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011398207425372675, "grad_norm": 4.527804851531982, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8542608022689819, "num_tokens": 114010814.0, "step": 2985 }, { "epoch": 0.3798498918712632, "ewc_loss": 0.030545674264431, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011502705456223339, "grad_norm": 4.384612560272217, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8729758262634277, "num_tokens": 114049389.0, "step": 2986 }, { "epoch": 0.3799771021498537, "ewc_loss": 0.03037988394498825, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011336914758430794, "grad_norm": 4.409867763519287, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.872252881526947, "num_tokens": 114081463.0, "step": 2987 }, { "epoch": 0.3801043124284442, "ewc_loss": 0.03048127144575119, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011438303044997156, "grad_norm": 4.398689270019531, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8704304695129395, "num_tokens": 114117175.0, "step": 2988 }, { "epoch": 0.38023152270703475, "ewc_loss": 0.030427757650613785, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.0001138478983193636, "grad_norm": 4.375176429748535, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8430089950561523, "num_tokens": 114153764.0, "step": 2989 }, { "epoch": 0.3803587329856252, "ewc_loss": 0.03046768717467785, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011424718104535714, "grad_norm": 4.373344898223877, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8537900447845459, "num_tokens": 114191487.0, "step": 2990 }, { "epoch": 0.38048594326421575, "ewc_loss": 0.030467521399259567, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011424553667893633, "grad_norm": 4.403999328613281, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.8306180238723755, "num_tokens": 114233880.0, "step": 2991 }, { "epoch": 0.3806131535428063, "ewc_loss": 0.030487243086099625, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011444273695815355, "grad_norm": 4.430385112762451, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8523085117340088, "num_tokens": 114269238.0, "step": 2992 }, { "epoch": 0.38074036382139675, "ewc_loss": 0.030480273067951202, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011437304056016728, "grad_norm": 4.341592311859131, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8668864965438843, "num_tokens": 114307645.0, "step": 2993 }, { "epoch": 0.3808675740999873, "ewc_loss": 0.030450504273176193, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011407535203034058, "grad_norm": 4.403306007385254, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8581263422966003, "num_tokens": 114341800.0, "step": 2994 }, { "epoch": 0.3809947843785778, "ewc_loss": 0.030515827238559723, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011472858022898436, "grad_norm": 4.3589863777160645, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8642062544822693, "num_tokens": 114378363.0, "step": 2995 }, { "epoch": 0.3811219946571683, "ewc_loss": 0.030448947101831436, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011405977420508862, "grad_norm": 4.404086112976074, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8433573842048645, "num_tokens": 114414489.0, "step": 2996 }, { "epoch": 0.3812492049357588, "ewc_loss": 0.03052082285284996, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011477855150587857, "grad_norm": 4.436183929443359, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.849957287311554, "num_tokens": 114449197.0, "step": 2997 }, { "epoch": 0.38137641521434934, "ewc_loss": 0.030512895435094833, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.0001146992581197992, "grad_norm": 4.452698230743408, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8525272607803345, "num_tokens": 114478654.0, "step": 2998 }, { "epoch": 0.3815036254929398, "ewc_loss": 0.0305170975625515, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011474129132693633, "grad_norm": 4.311679840087891, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8630582094192505, "num_tokens": 114521807.0, "step": 2999 }, { "epoch": 0.38163083577153034, "ewc_loss": 0.030464280396699905, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011421312956372276, "grad_norm": 4.4093499183654785, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8679200410842896, "num_tokens": 114558297.0, "step": 3000 }, { "epoch": 0.38175804605012087, "ewc_loss": 0.03054797649383545, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011505009024403989, "grad_norm": 4.352478504180908, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8610666990280151, "num_tokens": 114599630.0, "step": 3001 }, { "epoch": 0.38188525632871134, "ewc_loss": 0.030504729598760605, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011461762187536806, "grad_norm": 4.4238600730896, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8516528010368347, "num_tokens": 114638697.0, "step": 3002 }, { "epoch": 0.38201246660730187, "ewc_loss": 0.030558712780475616, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011515744699863717, "grad_norm": 4.393562316894531, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8539769649505615, "num_tokens": 114676106.0, "step": 3003 }, { "epoch": 0.3821396768858924, "ewc_loss": 0.03052428364753723, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.0001148131414083764, "grad_norm": 4.436530590057373, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8522706031799316, "num_tokens": 114712026.0, "step": 3004 }, { "epoch": 0.38226688716448287, "ewc_loss": 0.030546199530363083, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011503230052767321, "grad_norm": 4.395935535430908, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8333194255828857, "num_tokens": 114752348.0, "step": 3005 }, { "epoch": 0.3823940974430734, "ewc_loss": 0.030658002942800522, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011492962948977947, "grad_norm": 4.393021583557129, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8591985702514648, "num_tokens": 114789922.0, "step": 3006 }, { "epoch": 0.38252130772166393, "ewc_loss": 0.03064218908548355, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011477150837890804, "grad_norm": 4.337390422821045, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8488374948501587, "num_tokens": 114838834.0, "step": 3007 }, { "epoch": 0.3826485180002544, "ewc_loss": 0.03063705377280712, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011472014739410952, "grad_norm": 4.4511799812316895, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.8341441750526428, "num_tokens": 114873320.0, "step": 3008 }, { "epoch": 0.38277572827884493, "ewc_loss": 0.03071235492825508, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011547315079951659, "grad_norm": 4.40994930267334, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8660503625869751, "num_tokens": 114908293.0, "step": 3009 }, { "epoch": 0.38290293855743546, "ewc_loss": 0.030612211674451828, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011447172437328845, "grad_norm": 4.365911960601807, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8447755575180054, "num_tokens": 114950083.0, "step": 3010 }, { "epoch": 0.38303014883602593, "ewc_loss": 0.030657773837447166, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011492735211504623, "grad_norm": 4.45836877822876, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8581942319869995, "num_tokens": 114982353.0, "step": 3011 }, { "epoch": 0.38315735911461646, "ewc_loss": 0.030571825802326202, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.0001152885815827176, "grad_norm": 4.396757125854492, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8598310351371765, "num_tokens": 115017668.0, "step": 3012 }, { "epoch": 0.383284569393207, "ewc_loss": 0.030637063086032867, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011472023470560089, "grad_norm": 4.364980697631836, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.866470456123352, "num_tokens": 115055266.0, "step": 3013 }, { "epoch": 0.38341177967179746, "ewc_loss": 0.03052288293838501, "ewc_loss_diag": 1.9073486328125e-05, "ewc_loss_parallel": 0.00011479913518996909, "grad_norm": 4.413331508636475, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8495392799377441, "num_tokens": 115089728.0, "step": 3014 }, { "epoch": 0.383538989950388, "ewc_loss": 0.030691727995872498, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011526689195306972, "grad_norm": 4.352219104766846, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.839489221572876, "num_tokens": 115135192.0, "step": 3015 }, { "epoch": 0.3836662002289785, "ewc_loss": 0.030649937689304352, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011484899732749909, "grad_norm": 4.488616943359375, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8367393612861633, "num_tokens": 115170350.0, "step": 3016 }, { "epoch": 0.383793410507569, "ewc_loss": 0.030759230256080627, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011594191892072558, "grad_norm": 4.383284568786621, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8615976572036743, "num_tokens": 115202724.0, "step": 3017 }, { "epoch": 0.3839206207861595, "ewc_loss": 0.030646715313196182, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011481675755931064, "grad_norm": 4.426372528076172, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8546866178512573, "num_tokens": 115239274.0, "step": 3018 }, { "epoch": 0.38404783106475004, "ewc_loss": 0.03072735294699669, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011562313738977537, "grad_norm": 4.369785308837891, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8625876903533936, "num_tokens": 115278330.0, "step": 3019 }, { "epoch": 0.3841750413433405, "ewc_loss": 0.030686115846037865, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011521077249199152, "grad_norm": 4.400440692901611, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8505992889404297, "num_tokens": 115321258.0, "step": 3020 }, { "epoch": 0.38430225162193105, "ewc_loss": 0.03072550892829895, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011560470738913864, "grad_norm": 4.403756141662598, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8560712337493896, "num_tokens": 115362609.0, "step": 3021 }, { "epoch": 0.3844294619005216, "ewc_loss": 0.030669761821627617, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001150472235167399, "grad_norm": 4.415129661560059, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8282730579376221, "num_tokens": 115405814.0, "step": 3022 }, { "epoch": 0.38455667217911205, "ewc_loss": 0.030674628913402557, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011509588512126356, "grad_norm": 4.4348673820495605, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8489619493484497, "num_tokens": 115440856.0, "step": 3023 }, { "epoch": 0.3846838824577026, "ewc_loss": 0.03071022219955921, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011545183224370703, "grad_norm": 4.389305114746094, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.860568642616272, "num_tokens": 115482582.0, "step": 3024 }, { "epoch": 0.3848110927362931, "ewc_loss": 0.03066314198076725, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011498102685436606, "grad_norm": 4.364426136016846, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.846043050289154, "num_tokens": 115527268.0, "step": 3025 }, { "epoch": 0.3849383030148836, "ewc_loss": 0.03070336952805519, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011538330727489665, "grad_norm": 4.446414947509766, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8491605520248413, "num_tokens": 115563757.0, "step": 3026 }, { "epoch": 0.3850655132934741, "ewc_loss": 0.03075082041323185, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011585780885070562, "grad_norm": 4.379988193511963, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8584116697311401, "num_tokens": 115599629.0, "step": 3027 }, { "epoch": 0.38519272357206463, "ewc_loss": 0.03068462386727333, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011519585677888244, "grad_norm": 4.468220233917236, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8588111400604248, "num_tokens": 115634261.0, "step": 3028 }, { "epoch": 0.3853199338506551, "ewc_loss": 0.030741151422262192, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011576113320188597, "grad_norm": 4.362560272216797, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8673206567764282, "num_tokens": 115671150.0, "step": 3029 }, { "epoch": 0.38544714412924563, "ewc_loss": 0.030667781829833984, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011502744018798694, "grad_norm": 4.360323905944824, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8676384687423706, "num_tokens": 115714328.0, "step": 3030 }, { "epoch": 0.38557435440783616, "ewc_loss": 0.030724868178367615, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011559828999452293, "grad_norm": 4.345920562744141, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8698827028274536, "num_tokens": 115747118.0, "step": 3031 }, { "epoch": 0.3857015646864267, "ewc_loss": 0.030701499432325363, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011536459351191297, "grad_norm": 4.414887428283691, "learning_rate": 1e-06, "loss": 0.5286, "mean_token_accuracy": 0.8350454568862915, "num_tokens": 115790309.0, "step": 3032 }, { "epoch": 0.38582877496501716, "ewc_loss": 0.030851054936647415, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00011563944281078875, "grad_norm": 4.380398750305176, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8663694858551025, "num_tokens": 115825630.0, "step": 3033 }, { "epoch": 0.3859559852436077, "ewc_loss": 0.030815094709396362, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001152798649854958, "grad_norm": 4.425335884094238, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8625442981719971, "num_tokens": 115858848.0, "step": 3034 }, { "epoch": 0.3860831955221982, "ewc_loss": 0.03084075078368187, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00011553642252692953, "grad_norm": 4.365835189819336, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8633028864860535, "num_tokens": 115900024.0, "step": 3035 }, { "epoch": 0.3862104058007887, "ewc_loss": 0.030809899792075157, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00011522790009621531, "grad_norm": 4.428952217102051, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8310261964797974, "num_tokens": 115938899.0, "step": 3036 }, { "epoch": 0.3863376160793792, "ewc_loss": 0.03073446825146675, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001156943035311997, "grad_norm": 4.477719306945801, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8621492981910706, "num_tokens": 115972199.0, "step": 3037 }, { "epoch": 0.38646482635796975, "ewc_loss": 0.030703937634825706, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011538898979779333, "grad_norm": 4.366581916809082, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8648286461830139, "num_tokens": 116011506.0, "step": 3038 }, { "epoch": 0.3865920366365602, "ewc_loss": 0.030659440904855728, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.0001149440067820251, "grad_norm": 4.440943717956543, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8320544958114624, "num_tokens": 116051428.0, "step": 3039 }, { "epoch": 0.38671924691515075, "ewc_loss": 0.030745305120944977, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011580266436794773, "grad_norm": 4.359958171844482, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8631188869476318, "num_tokens": 116090307.0, "step": 3040 }, { "epoch": 0.3868464571937413, "ewc_loss": 0.030654070898890495, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011489031749079004, "grad_norm": 4.427073001861572, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8451828956604004, "num_tokens": 116123381.0, "step": 3041 }, { "epoch": 0.38697366747233175, "ewc_loss": 0.030756400898098946, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011591361544560641, "grad_norm": 4.383849143981934, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8584831953048706, "num_tokens": 116157755.0, "step": 3042 }, { "epoch": 0.3871008777509223, "ewc_loss": 0.03069988079369068, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011534841905813664, "grad_norm": 4.372957706451416, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8559627532958984, "num_tokens": 116197371.0, "step": 3043 }, { "epoch": 0.3872280880295128, "ewc_loss": 0.030736759305000305, "ewc_loss_diag": 1.919269561767578e-05, "ewc_loss_parallel": 0.00011571720824576914, "grad_norm": 4.375284194946289, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8395900726318359, "num_tokens": 116240544.0, "step": 3044 }, { "epoch": 0.3873552983081033, "ewc_loss": 0.030869504436850548, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00011582394654396921, "grad_norm": 4.452966690063477, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8518882989883423, "num_tokens": 116272079.0, "step": 3045 }, { "epoch": 0.3874825085866938, "ewc_loss": 0.03092012368142605, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00011633014219114557, "grad_norm": 4.372433185577393, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8523910045623779, "num_tokens": 116309641.0, "step": 3046 }, { "epoch": 0.38760971886528434, "ewc_loss": 0.030863948166370392, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00011576839460758492, "grad_norm": 4.465160369873047, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8499817252159119, "num_tokens": 116344470.0, "step": 3047 }, { "epoch": 0.3877369291438748, "ewc_loss": 0.030960235744714737, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001167312657344155, "grad_norm": 4.417991638183594, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.858187735080719, "num_tokens": 116380177.0, "step": 3048 }, { "epoch": 0.38786413942246534, "ewc_loss": 0.0308866985142231, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.00011599588469834998, "grad_norm": 4.349143981933594, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8660798072814941, "num_tokens": 116416482.0, "step": 3049 }, { "epoch": 0.38799134970105587, "ewc_loss": 0.03087468445301056, "ewc_loss_diag": 1.9311904907226562e-05, "ewc_loss_parallel": 0.0001158757513621822, "grad_norm": 4.416996955871582, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8415789008140564, "num_tokens": 116456103.0, "step": 3050 }, { "epoch": 0.38811855997964634, "ewc_loss": 0.031049851328134537, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011640671436907724, "grad_norm": 4.341230392456055, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8605468273162842, "num_tokens": 116496810.0, "step": 3051 }, { "epoch": 0.38824577025823687, "ewc_loss": 0.03099765256047249, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011588471534196287, "grad_norm": 4.435436248779297, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8516535758972168, "num_tokens": 116532999.0, "step": 3052 }, { "epoch": 0.3883729805368274, "ewc_loss": 0.03112528845667839, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011716107837855816, "grad_norm": 4.4292778968811035, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8634580373764038, "num_tokens": 116565700.0, "step": 3053 }, { "epoch": 0.38850019081541787, "ewc_loss": 0.03105548396706581, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011646304483292624, "grad_norm": 4.352717399597168, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8592528104782104, "num_tokens": 116602798.0, "step": 3054 }, { "epoch": 0.3886274010940084, "ewc_loss": 0.031073391437530518, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011664212070172653, "grad_norm": 4.442840576171875, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.86679607629776, "num_tokens": 116638743.0, "step": 3055 }, { "epoch": 0.3887546113725989, "ewc_loss": 0.031134851276874542, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011725670628948137, "grad_norm": 4.442711353302002, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8670493364334106, "num_tokens": 116672290.0, "step": 3056 }, { "epoch": 0.3888818216511894, "ewc_loss": 0.031084798276424408, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001167561931652017, "grad_norm": 4.352903842926025, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.856508731842041, "num_tokens": 116712593.0, "step": 3057 }, { "epoch": 0.3890090319297799, "ewc_loss": 0.031082142144441605, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001167296213679947, "grad_norm": 4.470094680786133, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8543046712875366, "num_tokens": 116748663.0, "step": 3058 }, { "epoch": 0.38913624220837045, "ewc_loss": 0.03112507238984108, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011715892469510436, "grad_norm": 4.3301239013671875, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8513232469558716, "num_tokens": 116796018.0, "step": 3059 }, { "epoch": 0.3892634524869609, "ewc_loss": 0.031008267775177956, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011599087883951142, "grad_norm": 4.412917613983154, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8532423973083496, "num_tokens": 116837287.0, "step": 3060 }, { "epoch": 0.38939066276555145, "ewc_loss": 0.031127039343118668, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001171785916085355, "grad_norm": 4.477630615234375, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8390694856643677, "num_tokens": 116872163.0, "step": 3061 }, { "epoch": 0.389517873044142, "ewc_loss": 0.03110354207456112, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011694362183334306, "grad_norm": 4.356368064880371, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8626477718353271, "num_tokens": 116913445.0, "step": 3062 }, { "epoch": 0.38964508332273246, "ewc_loss": 0.031042572110891342, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011633392568910494, "grad_norm": 4.399501323699951, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8464841246604919, "num_tokens": 116954469.0, "step": 3063 }, { "epoch": 0.389772293601323, "ewc_loss": 0.031110581010580063, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001170140239992179, "grad_norm": 4.425180435180664, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.854516863822937, "num_tokens": 116994339.0, "step": 3064 }, { "epoch": 0.3898995038799135, "ewc_loss": 0.03106882981956005, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001165965004474856, "grad_norm": 4.386516571044922, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8676673769950867, "num_tokens": 117030867.0, "step": 3065 }, { "epoch": 0.390026714158504, "ewc_loss": 0.031054656952619553, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001164547647931613, "grad_norm": 4.4170145988464355, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8632674217224121, "num_tokens": 117068309.0, "step": 3066 }, { "epoch": 0.3901539244370945, "ewc_loss": 0.03106602467596531, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011656845163088292, "grad_norm": 4.396081447601318, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8495739102363586, "num_tokens": 117105278.0, "step": 3067 }, { "epoch": 0.39028113471568504, "ewc_loss": 0.031044887378811836, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011635707778623328, "grad_norm": 4.419693470001221, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8476831912994385, "num_tokens": 117143096.0, "step": 3068 }, { "epoch": 0.3904083449942755, "ewc_loss": 0.031107351183891296, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.0001169817041954957, "grad_norm": 4.412377834320068, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8551274538040161, "num_tokens": 117179638.0, "step": 3069 }, { "epoch": 0.39053555527286604, "ewc_loss": 0.03105498105287552, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011645800259429961, "grad_norm": 4.493654251098633, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8675395250320435, "num_tokens": 117211560.0, "step": 3070 }, { "epoch": 0.39066276555145657, "ewc_loss": 0.031103432178497314, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011694252316374332, "grad_norm": 4.4017744064331055, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8472549915313721, "num_tokens": 117250622.0, "step": 3071 }, { "epoch": 0.39078997583004704, "ewc_loss": 0.031149830669164658, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011618581629591063, "grad_norm": 4.733199596405029, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8751108646392822, "num_tokens": 117287600.0, "step": 3072 }, { "epoch": 0.39091718610863757, "ewc_loss": 0.031220169737935066, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011810989963123575, "grad_norm": 4.391236782073975, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8398432731628418, "num_tokens": 117328188.0, "step": 3073 }, { "epoch": 0.3910443963872281, "ewc_loss": 0.03092086873948574, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011511689081089571, "grad_norm": 4.420864582061768, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.863722026348114, "num_tokens": 117367282.0, "step": 3074 }, { "epoch": 0.39117160666581857, "ewc_loss": 0.031079720705747604, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011670539970509708, "grad_norm": 4.396553039550781, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8659393787384033, "num_tokens": 117403376.0, "step": 3075 }, { "epoch": 0.3912988169444091, "ewc_loss": 0.031007863581180573, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011598683340707794, "grad_norm": 4.504850387573242, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8457165360450745, "num_tokens": 117438661.0, "step": 3076 }, { "epoch": 0.39142602722299963, "ewc_loss": 0.0310914758592844, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011682295735226944, "grad_norm": 4.443164348602295, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8592115044593811, "num_tokens": 117477351.0, "step": 3077 }, { "epoch": 0.3915532375015901, "ewc_loss": 0.031028296798467636, "ewc_loss_diag": 1.9431114196777344e-05, "ewc_loss_parallel": 0.00011619117140071467, "grad_norm": 4.41969108581543, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8529839515686035, "num_tokens": 117512773.0, "step": 3078 }, { "epoch": 0.39168044778018063, "ewc_loss": 0.03116765432059765, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011636404087767005, "grad_norm": 4.418858528137207, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.839080810546875, "num_tokens": 117555370.0, "step": 3079 }, { "epoch": 0.39180765805877116, "ewc_loss": 0.031184537336230278, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011653287219814956, "grad_norm": 4.434812068939209, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8617440462112427, "num_tokens": 117594208.0, "step": 3080 }, { "epoch": 0.3919348683373617, "ewc_loss": 0.031180014833807945, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001164876448456198, "grad_norm": 4.3884501457214355, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8581769466400146, "num_tokens": 117632715.0, "step": 3081 }, { "epoch": 0.39206207861595216, "ewc_loss": 0.03115672990679741, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011625480692600831, "grad_norm": 4.4499030113220215, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8507469892501831, "num_tokens": 117669221.0, "step": 3082 }, { "epoch": 0.3921892888945427, "ewc_loss": 0.03121780790388584, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011686557991197333, "grad_norm": 4.410261631011963, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8497105836868286, "num_tokens": 117709559.0, "step": 3083 }, { "epoch": 0.3923164991731332, "ewc_loss": 0.03114689514040947, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011615644325502217, "grad_norm": 4.405013084411621, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8630752563476562, "num_tokens": 117745240.0, "step": 3084 }, { "epoch": 0.3924437094517237, "ewc_loss": 0.03121151216328144, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001168026210507378, "grad_norm": 4.380149841308594, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.857864499092102, "num_tokens": 117787897.0, "step": 3085 }, { "epoch": 0.3925709197303142, "ewc_loss": 0.031204570084810257, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011673319386318326, "grad_norm": 4.45488166809082, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8507389426231384, "num_tokens": 117826862.0, "step": 3086 }, { "epoch": 0.39269813000890474, "ewc_loss": 0.031226810067892075, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011695559078361839, "grad_norm": 4.35794734954834, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8509756326675415, "num_tokens": 117866932.0, "step": 3087 }, { "epoch": 0.3928253402874952, "ewc_loss": 0.03116791322827339, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011636664567049593, "grad_norm": 4.3965606689453125, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.855889081954956, "num_tokens": 117905538.0, "step": 3088 }, { "epoch": 0.39295255056608575, "ewc_loss": 0.031254708766937256, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001172346092062071, "grad_norm": 4.416140079498291, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8432849645614624, "num_tokens": 117945123.0, "step": 3089 }, { "epoch": 0.3930797608446763, "ewc_loss": 0.031203866004943848, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011672617256408557, "grad_norm": 4.40102481842041, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8578693866729736, "num_tokens": 117988743.0, "step": 3090 }, { "epoch": 0.39320697112326675, "ewc_loss": 0.031235158443450928, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011703909694915637, "grad_norm": 4.377374649047852, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8695081472396851, "num_tokens": 118026078.0, "step": 3091 }, { "epoch": 0.3933341814018573, "ewc_loss": 0.031215541064739227, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011684292257996276, "grad_norm": 4.411608695983887, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8670367002487183, "num_tokens": 118060105.0, "step": 3092 }, { "epoch": 0.3934613916804478, "ewc_loss": 0.031247125938534737, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011715875734807923, "grad_norm": 4.496784210205078, "learning_rate": 1e-06, "loss": 0.3705, "mean_token_accuracy": 0.8836609721183777, "num_tokens": 118089677.0, "step": 3093 }, { "epoch": 0.3935886019590383, "ewc_loss": 0.031244512647390366, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011713261483237147, "grad_norm": 4.360569953918457, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8687071204185486, "num_tokens": 118128449.0, "step": 3094 }, { "epoch": 0.3937158122376288, "ewc_loss": 0.031175225973129272, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011643974721664563, "grad_norm": 4.441740989685059, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8648402690887451, "num_tokens": 118168411.0, "step": 3095 }, { "epoch": 0.39384302251621933, "ewc_loss": 0.03127554431557655, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011744294170057401, "grad_norm": 4.395524978637695, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8659305572509766, "num_tokens": 118206097.0, "step": 3096 }, { "epoch": 0.3939702327948098, "ewc_loss": 0.03117620199918747, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001164495333796367, "grad_norm": 4.505674839019775, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8681973218917847, "num_tokens": 118244429.0, "step": 3097 }, { "epoch": 0.39409744307340033, "ewc_loss": 0.03127899765968323, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.0001174774588434957, "grad_norm": 4.446593284606934, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8609758019447327, "num_tokens": 118279125.0, "step": 3098 }, { "epoch": 0.39422465335199086, "ewc_loss": 0.031165461987257004, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011634212569333613, "grad_norm": 4.445236682891846, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8539831638336182, "num_tokens": 118313156.0, "step": 3099 }, { "epoch": 0.39435186363058133, "ewc_loss": 0.031248247250914574, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011716996959876269, "grad_norm": 4.435757160186768, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8424296379089355, "num_tokens": 118349868.0, "step": 3100 }, { "epoch": 0.39447907390917186, "ewc_loss": 0.031209003180265427, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011677752627292648, "grad_norm": 4.3914923667907715, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8465507626533508, "num_tokens": 118394998.0, "step": 3101 }, { "epoch": 0.3946062841877624, "ewc_loss": 0.031221836805343628, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011690586688928306, "grad_norm": 4.444467544555664, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8673500418663025, "num_tokens": 118436002.0, "step": 3102 }, { "epoch": 0.39473349446635286, "ewc_loss": 0.03122565895318985, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011694409477058798, "grad_norm": 4.362686634063721, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8614200949668884, "num_tokens": 118475094.0, "step": 3103 }, { "epoch": 0.3948607047449434, "ewc_loss": 0.031222380697727203, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011691130202962086, "grad_norm": 4.414166450500488, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8538240194320679, "num_tokens": 118514164.0, "step": 3104 }, { "epoch": 0.3949879150235339, "ewc_loss": 0.03126813843846321, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011736889427993447, "grad_norm": 4.417520046234131, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8511093258857727, "num_tokens": 118554271.0, "step": 3105 }, { "epoch": 0.3951151253021244, "ewc_loss": 0.03121739812195301, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011686148354783654, "grad_norm": 4.343633651733398, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8565149307250977, "num_tokens": 118597123.0, "step": 3106 }, { "epoch": 0.3952423355807149, "ewc_loss": 0.031231611967086792, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011700361937982962, "grad_norm": 4.399993896484375, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8503632545471191, "num_tokens": 118636683.0, "step": 3107 }, { "epoch": 0.39536954585930545, "ewc_loss": 0.03129128739237785, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011760036431951448, "grad_norm": 4.506847381591797, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8595554828643799, "num_tokens": 118665770.0, "step": 3108 }, { "epoch": 0.3954967561378959, "ewc_loss": 0.031295448541641235, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011764198279706761, "grad_norm": 4.401366233825684, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8657627701759338, "num_tokens": 118701652.0, "step": 3109 }, { "epoch": 0.39562396641648645, "ewc_loss": 0.031222935765981674, "ewc_loss_diag": 1.9550323486328125e-05, "ewc_loss_parallel": 0.00011691684630932286, "grad_norm": 4.406106948852539, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8625218868255615, "num_tokens": 118741756.0, "step": 3110 }, { "epoch": 0.395751176695077, "ewc_loss": 0.031420182436704636, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011766862735385075, "grad_norm": 4.388092041015625, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8570635318756104, "num_tokens": 118784320.0, "step": 3111 }, { "epoch": 0.39587838697366745, "ewc_loss": 0.031383346766233444, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011730026017175987, "grad_norm": 4.441578388214111, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8500024080276489, "num_tokens": 118821456.0, "step": 3112 }, { "epoch": 0.396005597252258, "ewc_loss": 0.031446170061826706, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011792850273195654, "grad_norm": 4.425909042358398, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8547946214675903, "num_tokens": 118861746.0, "step": 3113 }, { "epoch": 0.3961328075308485, "ewc_loss": 0.031432539224624634, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011779216583818197, "grad_norm": 4.417932987213135, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.86114102602005, "num_tokens": 118900346.0, "step": 3114 }, { "epoch": 0.396260017809439, "ewc_loss": 0.03145315498113632, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011799834464909509, "grad_norm": 4.423118591308594, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8566297292709351, "num_tokens": 118939707.0, "step": 3115 }, { "epoch": 0.3963872280880295, "ewc_loss": 0.03141968697309494, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011766367970267311, "grad_norm": 4.448338508605957, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8554205894470215, "num_tokens": 118971154.0, "step": 3116 }, { "epoch": 0.39651443836662004, "ewc_loss": 0.031458474695682526, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011805152462329715, "grad_norm": 4.402565002441406, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8407133221626282, "num_tokens": 119012344.0, "step": 3117 }, { "epoch": 0.3966416486452105, "ewc_loss": 0.031447187066078186, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011793864541687071, "grad_norm": 4.502666473388672, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.854878306388855, "num_tokens": 119047677.0, "step": 3118 }, { "epoch": 0.39676885892380104, "ewc_loss": 0.03149615600705147, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011842836829600856, "grad_norm": 4.37836217880249, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8693447113037109, "num_tokens": 119086357.0, "step": 3119 }, { "epoch": 0.39689606920239157, "ewc_loss": 0.03140632063150406, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.0001175299912574701, "grad_norm": 4.42514181137085, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8636329770088196, "num_tokens": 119122854.0, "step": 3120 }, { "epoch": 0.39702327948098204, "ewc_loss": 0.03149735927581787, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011844038817798719, "grad_norm": 4.435993194580078, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8636711239814758, "num_tokens": 119155449.0, "step": 3121 }, { "epoch": 0.39715048975957257, "ewc_loss": 0.03146490082144737, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011811579315690324, "grad_norm": 4.396125793457031, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.850887656211853, "num_tokens": 119197287.0, "step": 3122 }, { "epoch": 0.3972777000381631, "ewc_loss": 0.03149138018488884, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011838060891022906, "grad_norm": 4.429686069488525, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8440261483192444, "num_tokens": 119235749.0, "step": 3123 }, { "epoch": 0.39740491031675357, "ewc_loss": 0.03150373697280884, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011850414739456028, "grad_norm": 4.485757827758789, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8522680401802063, "num_tokens": 119267282.0, "step": 3124 }, { "epoch": 0.3975321205953441, "ewc_loss": 0.031525615602731705, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011872296454384923, "grad_norm": 4.408487319946289, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8483560085296631, "num_tokens": 119304387.0, "step": 3125 }, { "epoch": 0.3976593308739346, "ewc_loss": 0.031469620764255524, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011816301412181929, "grad_norm": 4.561146259307861, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8454458713531494, "num_tokens": 119341470.0, "step": 3126 }, { "epoch": 0.3977865411525251, "ewc_loss": 0.0316091850399971, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011955863737966865, "grad_norm": 4.3892059326171875, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8455154299736023, "num_tokens": 119381067.0, "step": 3127 }, { "epoch": 0.3979137514311156, "ewc_loss": 0.031442902982234955, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011789581185439602, "grad_norm": 4.459022521972656, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8566913604736328, "num_tokens": 119415680.0, "step": 3128 }, { "epoch": 0.39804096170970615, "ewc_loss": 0.031586818397045135, "ewc_loss_diag": 1.9669532775878906e-05, "ewc_loss_parallel": 0.00011933495989069343, "grad_norm": 4.429934024810791, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8466042280197144, "num_tokens": 119454300.0, "step": 3129 }, { "epoch": 0.3981681719882967, "ewc_loss": 0.03174009919166565, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011842640378745273, "grad_norm": 4.458108425140381, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8686320185661316, "num_tokens": 119485599.0, "step": 3130 }, { "epoch": 0.39829538226688715, "ewc_loss": 0.03179530054330826, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011897840886376798, "grad_norm": 4.426050662994385, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8415842056274414, "num_tokens": 119525820.0, "step": 3131 }, { "epoch": 0.3984225925454777, "ewc_loss": 0.03177023306488991, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011872773029608652, "grad_norm": 4.387596607208252, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8618463277816772, "num_tokens": 119568216.0, "step": 3132 }, { "epoch": 0.3985498028240682, "ewc_loss": 0.03176858648657799, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011871126480400562, "grad_norm": 4.569268703460693, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.84210205078125, "num_tokens": 119602914.0, "step": 3133 }, { "epoch": 0.3986770131026587, "ewc_loss": 0.03187031298875809, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011972852371400222, "grad_norm": 4.43475341796875, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8522207736968994, "num_tokens": 119637024.0, "step": 3134 }, { "epoch": 0.3988042233812492, "ewc_loss": 0.03168074041604996, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011783278023358434, "grad_norm": 4.400152206420898, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8474462032318115, "num_tokens": 119679265.0, "step": 3135 }, { "epoch": 0.39893143365983974, "ewc_loss": 0.0317828543484211, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011885393178090453, "grad_norm": 4.456953048706055, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8461651802062988, "num_tokens": 119716254.0, "step": 3136 }, { "epoch": 0.3990586439384302, "ewc_loss": 0.03176560625433922, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011868144065374509, "grad_norm": 4.352629661560059, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8491629362106323, "num_tokens": 119761325.0, "step": 3137 }, { "epoch": 0.39918585421702074, "ewc_loss": 0.03174305707216263, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011845595145132393, "grad_norm": 4.4226155281066895, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.878243088722229, "num_tokens": 119795468.0, "step": 3138 }, { "epoch": 0.39931306449561127, "ewc_loss": 0.03177691251039505, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011879452358698472, "grad_norm": 4.380061626434326, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8463930487632751, "num_tokens": 119833804.0, "step": 3139 }, { "epoch": 0.39944027477420174, "ewc_loss": 0.031739164143800735, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011841703963000327, "grad_norm": 4.411353588104248, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8516567349433899, "num_tokens": 119870852.0, "step": 3140 }, { "epoch": 0.39956748505279227, "ewc_loss": 0.0318017452955246, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011904283019248396, "grad_norm": 4.402110576629639, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8585228323936462, "num_tokens": 119909258.0, "step": 3141 }, { "epoch": 0.3996946953313828, "ewc_loss": 0.031768254935741425, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011870792513946071, "grad_norm": 4.477943420410156, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.844032347202301, "num_tokens": 119940701.0, "step": 3142 }, { "epoch": 0.39982190560997327, "ewc_loss": 0.03184541314840317, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011947953316848725, "grad_norm": 4.450306415557861, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8561546802520752, "num_tokens": 119976780.0, "step": 3143 }, { "epoch": 0.3999491158885638, "ewc_loss": 0.031761787831783295, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011864327098010108, "grad_norm": 4.419144153594971, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8480355143547058, "num_tokens": 120017543.0, "step": 3144 }, { "epoch": 0.40007632616715433, "ewc_loss": 0.03181398659944534, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.0001191652481793426, "grad_norm": 4.47188138961792, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8520675897598267, "num_tokens": 120050036.0, "step": 3145 }, { "epoch": 0.4002035364457448, "ewc_loss": 0.031927287578582764, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011907757289009169, "grad_norm": 4.432754993438721, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8699024319648743, "num_tokens": 120084582.0, "step": 3146 }, { "epoch": 0.40033074672433533, "ewc_loss": 0.03188806772232056, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011868535511894152, "grad_norm": 4.446132183074951, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8502885103225708, "num_tokens": 120120218.0, "step": 3147 }, { "epoch": 0.40045795700292586, "ewc_loss": 0.03191535174846649, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011895818897755817, "grad_norm": 4.344301700592041, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8756083846092224, "num_tokens": 120161650.0, "step": 3148 }, { "epoch": 0.40058516728151633, "ewc_loss": 0.031754020601511, "ewc_loss_diag": 1.990795135498047e-05, "ewc_loss_parallel": 0.00011856560740852728, "grad_norm": 4.430880546569824, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8643916845321655, "num_tokens": 120197409.0, "step": 3149 }, { "epoch": 0.40071237756010686, "ewc_loss": 0.031974807381629944, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011955277295783162, "grad_norm": 4.381963729858398, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8428564667701721, "num_tokens": 120240700.0, "step": 3150 }, { "epoch": 0.4008395878386974, "ewc_loss": 0.03191547095775604, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011895941133843735, "grad_norm": 4.369899272918701, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8452770113945007, "num_tokens": 120286741.0, "step": 3151 }, { "epoch": 0.40096679811728786, "ewc_loss": 0.03194952756166458, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011929995525861159, "grad_norm": 4.44858980178833, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8582190275192261, "num_tokens": 120323236.0, "step": 3152 }, { "epoch": 0.4010940083958784, "ewc_loss": 0.03196979686617851, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011950266343774274, "grad_norm": 4.416428565979004, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8548797369003296, "num_tokens": 120362185.0, "step": 3153 }, { "epoch": 0.4012212186744689, "ewc_loss": 0.031961895525455475, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011942362471017987, "grad_norm": 4.455329895019531, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8477277755737305, "num_tokens": 120397524.0, "step": 3154 }, { "epoch": 0.4013484289530594, "ewc_loss": 0.03195994719862938, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011940416879951954, "grad_norm": 4.388356685638428, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8589614033699036, "num_tokens": 120439723.0, "step": 3155 }, { "epoch": 0.4014756392316499, "ewc_loss": 0.03193037211894989, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011910843022633344, "grad_norm": 4.427157402038574, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8593360781669617, "num_tokens": 120483693.0, "step": 3156 }, { "epoch": 0.40160284951024044, "ewc_loss": 0.03194800019264221, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011928466847166419, "grad_norm": 4.434312343597412, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8649454116821289, "num_tokens": 120520558.0, "step": 3157 }, { "epoch": 0.4017300597888309, "ewc_loss": 0.03193600848317146, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011916477524209768, "grad_norm": 4.4402313232421875, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8652704358100891, "num_tokens": 120557901.0, "step": 3158 }, { "epoch": 0.40185727006742145, "ewc_loss": 0.03192313015460968, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011903599079232663, "grad_norm": 4.433687686920166, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8612335920333862, "num_tokens": 120593851.0, "step": 3159 }, { "epoch": 0.401984480346012, "ewc_loss": 0.031919926404953, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011900394747499377, "grad_norm": 4.579493522644043, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8479738235473633, "num_tokens": 120624533.0, "step": 3160 }, { "epoch": 0.40211169062460245, "ewc_loss": 0.03198551759123802, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011965986777795479, "grad_norm": 4.436336517333984, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8517594337463379, "num_tokens": 120660820.0, "step": 3161 }, { "epoch": 0.402238900903193, "ewc_loss": 0.03185329586267471, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011833766620839015, "grad_norm": 4.402223587036133, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8678075075149536, "num_tokens": 120701485.0, "step": 3162 }, { "epoch": 0.4023661111817835, "ewc_loss": 0.03192703053355217, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011907500447705388, "grad_norm": 4.466509819030762, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8644099831581116, "num_tokens": 120736677.0, "step": 3163 }, { "epoch": 0.402493321460374, "ewc_loss": 0.03193775191903114, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011918221571249887, "grad_norm": 4.482143878936768, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.853290319442749, "num_tokens": 120773933.0, "step": 3164 }, { "epoch": 0.4026205317389645, "ewc_loss": 0.031902723014354706, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011883192928507924, "grad_norm": 4.505763530731201, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.860431432723999, "num_tokens": 120804671.0, "step": 3165 }, { "epoch": 0.40274774201755503, "ewc_loss": 0.03194315358996391, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011923621786991134, "grad_norm": 4.470959663391113, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8599433302879333, "num_tokens": 120837042.0, "step": 3166 }, { "epoch": 0.4028749522961455, "ewc_loss": 0.031877804547548294, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011858272046083584, "grad_norm": 4.391484260559082, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8523216247558594, "num_tokens": 120875127.0, "step": 3167 }, { "epoch": 0.40300216257473603, "ewc_loss": 0.03191797435283661, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011898442608071491, "grad_norm": 4.4754791259765625, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8548039197921753, "num_tokens": 120915937.0, "step": 3168 }, { "epoch": 0.40312937285332656, "ewc_loss": 0.031958915293216705, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011939385149162263, "grad_norm": 4.456536293029785, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8398884534835815, "num_tokens": 120953334.0, "step": 3169 }, { "epoch": 0.40325658313191703, "ewc_loss": 0.031897835433483124, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011878304212586954, "grad_norm": 4.4625349044799805, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8563912510871887, "num_tokens": 120986674.0, "step": 3170 }, { "epoch": 0.40338379341050756, "ewc_loss": 0.03194088488817215, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011921353143407032, "grad_norm": 4.394958972930908, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8617283701896667, "num_tokens": 121026305.0, "step": 3171 }, { "epoch": 0.4035110036890981, "ewc_loss": 0.03191957622766495, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011900046956725419, "grad_norm": 4.468883514404297, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8696123361587524, "num_tokens": 121061644.0, "step": 3172 }, { "epoch": 0.40363821396768856, "ewc_loss": 0.0319652259349823, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011945695587201044, "grad_norm": 4.367926597595215, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8519745469093323, "num_tokens": 121102860.0, "step": 3173 }, { "epoch": 0.4037654242462791, "ewc_loss": 0.03191903978586197, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001189950926345773, "grad_norm": 4.502207279205322, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8686593770980835, "num_tokens": 121136566.0, "step": 3174 }, { "epoch": 0.4038926345248696, "ewc_loss": 0.03201792761683464, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011998395348200575, "grad_norm": 4.440083980560303, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8652897477149963, "num_tokens": 121173759.0, "step": 3175 }, { "epoch": 0.4040198448034601, "ewc_loss": 0.03190011903643608, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011880588135682046, "grad_norm": 4.406183242797852, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8560882210731506, "num_tokens": 121219881.0, "step": 3176 }, { "epoch": 0.4041470550820506, "ewc_loss": 0.03194780275225639, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011928271851502359, "grad_norm": 4.489147186279297, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8547527194023132, "num_tokens": 121258719.0, "step": 3177 }, { "epoch": 0.40427426536064115, "ewc_loss": 0.03195897117257118, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011939438991248608, "grad_norm": 4.4844536781311035, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8327867388725281, "num_tokens": 121298013.0, "step": 3178 }, { "epoch": 0.4044014756392316, "ewc_loss": 0.03194383904337883, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011924308637389913, "grad_norm": 4.5197529792785645, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.86799156665802, "num_tokens": 121329615.0, "step": 3179 }, { "epoch": 0.40452868591782215, "ewc_loss": 0.031932052224874496, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011912521586054936, "grad_norm": 4.4416351318359375, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8665597438812256, "num_tokens": 121365199.0, "step": 3180 }, { "epoch": 0.4046558961964127, "ewc_loss": 0.03189009428024292, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011870562593685463, "grad_norm": 4.471695423126221, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8517100811004639, "num_tokens": 121402393.0, "step": 3181 }, { "epoch": 0.4047831064750032, "ewc_loss": 0.03191298618912697, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001189345566672273, "grad_norm": 4.440611362457275, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8561737537384033, "num_tokens": 121439678.0, "step": 3182 }, { "epoch": 0.4049103167535937, "ewc_loss": 0.031890079379081726, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011870548041770235, "grad_norm": 4.479423522949219, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8508436679840088, "num_tokens": 121480781.0, "step": 3183 }, { "epoch": 0.4050375270321842, "ewc_loss": 0.03191056847572327, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011891037866007537, "grad_norm": 4.416039943695068, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8630422949790955, "num_tokens": 121517614.0, "step": 3184 }, { "epoch": 0.40516473731077474, "ewc_loss": 0.03184560686349869, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011826075206045061, "grad_norm": 4.516622543334961, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8525524139404297, "num_tokens": 121552225.0, "step": 3185 }, { "epoch": 0.4052919475893652, "ewc_loss": 0.03192741051316261, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011907880252692848, "grad_norm": 4.435218334197998, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8514238595962524, "num_tokens": 121587177.0, "step": 3186 }, { "epoch": 0.40541915786795574, "ewc_loss": 0.03186681866645813, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011847289715660736, "grad_norm": 4.598415374755859, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8375922441482544, "num_tokens": 121619547.0, "step": 3187 }, { "epoch": 0.40554636814654627, "ewc_loss": 0.03200734779238701, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011987816105829552, "grad_norm": 4.42202615737915, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8629698753356934, "num_tokens": 121657396.0, "step": 3188 }, { "epoch": 0.40567357842513674, "ewc_loss": 0.03183992952108383, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011820397776318714, "grad_norm": 4.454676151275635, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8401882648468018, "num_tokens": 121691867.0, "step": 3189 }, { "epoch": 0.40580078870372727, "ewc_loss": 0.03195839747786522, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011938867100980133, "grad_norm": 4.527429580688477, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8503618240356445, "num_tokens": 121723546.0, "step": 3190 }, { "epoch": 0.4059279989823178, "ewc_loss": 0.031921982765197754, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011902453115908429, "grad_norm": 4.391651630401611, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.855398416519165, "num_tokens": 121764589.0, "step": 3191 }, { "epoch": 0.40605520926090827, "ewc_loss": 0.03190189227461815, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011882360558956861, "grad_norm": 4.508869647979736, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8454688787460327, "num_tokens": 121803281.0, "step": 3192 }, { "epoch": 0.4061824195394988, "ewc_loss": 0.03198712691664696, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011967594764428213, "grad_norm": 4.474539279937744, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8432235717773438, "num_tokens": 121841710.0, "step": 3193 }, { "epoch": 0.4063096298180893, "ewc_loss": 0.03196528181433678, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011945749429287389, "grad_norm": 4.421435832977295, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.853886604309082, "num_tokens": 121881979.0, "step": 3194 }, { "epoch": 0.4064368400966798, "ewc_loss": 0.03196802735328674, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011948496103286743, "grad_norm": 4.437869071960449, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8604147434234619, "num_tokens": 121921599.0, "step": 3195 }, { "epoch": 0.4065640503752703, "ewc_loss": 0.031999677419662476, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011980146518908441, "grad_norm": 4.452434062957764, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8644351959228516, "num_tokens": 121960545.0, "step": 3196 }, { "epoch": 0.40669126065386085, "ewc_loss": 0.031980834901332855, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011961302516283467, "grad_norm": 4.431739807128906, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8370857834815979, "num_tokens": 121999259.0, "step": 3197 }, { "epoch": 0.4068184709324513, "ewc_loss": 0.03198548033833504, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011965949670411646, "grad_norm": 4.42742395401001, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8690966367721558, "num_tokens": 122039066.0, "step": 3198 }, { "epoch": 0.40694568121104185, "ewc_loss": 0.03200312703847885, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011983596778009087, "grad_norm": 4.403085231781006, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.86305832862854, "num_tokens": 122085035.0, "step": 3199 }, { "epoch": 0.4070728914896324, "ewc_loss": 0.031974032521247864, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011954499495914206, "grad_norm": 4.416940689086914, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8698583841323853, "num_tokens": 122128007.0, "step": 3200 }, { "epoch": 0.40720010176822286, "ewc_loss": 0.032003603875637054, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011984074808424339, "grad_norm": 4.490547180175781, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8537682294845581, "num_tokens": 122169623.0, "step": 3201 }, { "epoch": 0.4073273120468134, "ewc_loss": 0.03198699653148651, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011967465252382681, "grad_norm": 4.550562381744385, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8508909940719604, "num_tokens": 122201755.0, "step": 3202 }, { "epoch": 0.4074545223254039, "ewc_loss": 0.03198997303843498, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011970441846642643, "grad_norm": 4.514444351196289, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8673375844955444, "num_tokens": 122233680.0, "step": 3203 }, { "epoch": 0.4075817326039944, "ewc_loss": 0.031958624720573425, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011939094110857695, "grad_norm": 4.576210021972656, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.852886438369751, "num_tokens": 122265107.0, "step": 3204 }, { "epoch": 0.4077089428825849, "ewc_loss": 0.031972482800483704, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011952951172133908, "grad_norm": 4.436328887939453, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8603765368461609, "num_tokens": 122302750.0, "step": 3205 }, { "epoch": 0.40783615316117544, "ewc_loss": 0.03186672925949097, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001184719949378632, "grad_norm": 4.406519412994385, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.847342848777771, "num_tokens": 122346820.0, "step": 3206 }, { "epoch": 0.4079633634397659, "ewc_loss": 0.03196428716182709, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011944757716264576, "grad_norm": 4.5061259269714355, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8602917790412903, "num_tokens": 122383254.0, "step": 3207 }, { "epoch": 0.40809057371835644, "ewc_loss": 0.031984444707632065, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011964913574047387, "grad_norm": 4.479667663574219, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8679267764091492, "num_tokens": 122416693.0, "step": 3208 }, { "epoch": 0.40821778399694697, "ewc_loss": 0.031908068805933, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011888537119375542, "grad_norm": 4.471181869506836, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8619083166122437, "num_tokens": 122452403.0, "step": 3209 }, { "epoch": 0.40834499427553744, "ewc_loss": 0.03194265067577362, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011923119745915756, "grad_norm": 4.486145973205566, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8447288870811462, "num_tokens": 122489685.0, "step": 3210 }, { "epoch": 0.40847220455412797, "ewc_loss": 0.03196612372994423, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011946593440370634, "grad_norm": 4.550230026245117, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8383466005325317, "num_tokens": 122529079.0, "step": 3211 }, { "epoch": 0.4085994148327185, "ewc_loss": 0.03196946904063225, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011949938198085874, "grad_norm": 4.4003071784973145, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8495706915855408, "num_tokens": 122569933.0, "step": 3212 }, { "epoch": 0.40872662511130897, "ewc_loss": 0.03193489462137222, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011915364302694798, "grad_norm": 4.528393268585205, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8357481956481934, "num_tokens": 122607884.0, "step": 3213 }, { "epoch": 0.4088538353898995, "ewc_loss": 0.03204948082566261, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012029949721181765, "grad_norm": 4.417431354522705, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8566842675209045, "num_tokens": 122651968.0, "step": 3214 }, { "epoch": 0.40898104566849003, "ewc_loss": 0.03192214295268059, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001190261245938018, "grad_norm": 4.572384834289551, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8601510524749756, "num_tokens": 122688680.0, "step": 3215 }, { "epoch": 0.4091082559470805, "ewc_loss": 0.03206494450569153, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012045415496686473, "grad_norm": 4.412363529205322, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8619205951690674, "num_tokens": 122728271.0, "step": 3216 }, { "epoch": 0.40923546622567103, "ewc_loss": 0.03194557502865791, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011926043225685135, "grad_norm": 4.542628765106201, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8551071882247925, "num_tokens": 122759691.0, "step": 3217 }, { "epoch": 0.40936267650426156, "ewc_loss": 0.032066844403743744, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012047315249219537, "grad_norm": 4.486852169036865, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.853946328163147, "num_tokens": 122800185.0, "step": 3218 }, { "epoch": 0.40948988678285203, "ewc_loss": 0.03193504735827446, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011915514915017411, "grad_norm": 4.455371856689453, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.865370512008667, "num_tokens": 122839548.0, "step": 3219 }, { "epoch": 0.40961709706144256, "ewc_loss": 0.03196552395820618, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011945993901463225, "grad_norm": 4.444501876831055, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8424587249755859, "num_tokens": 122879621.0, "step": 3220 }, { "epoch": 0.4097443073400331, "ewc_loss": 0.03197307884693146, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011953547800658271, "grad_norm": 4.482428073883057, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8506503105163574, "num_tokens": 122921495.0, "step": 3221 }, { "epoch": 0.40987151761862356, "ewc_loss": 0.032010965049266815, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011991434439551085, "grad_norm": 4.480586528778076, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8631150722503662, "num_tokens": 122957197.0, "step": 3222 }, { "epoch": 0.4099987278972141, "ewc_loss": 0.03199964761734009, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011980118142673746, "grad_norm": 4.4623847007751465, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8376805186271667, "num_tokens": 122999999.0, "step": 3223 }, { "epoch": 0.4101259381758046, "ewc_loss": 0.032010890543460846, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001199136022478342, "grad_norm": 4.416159152984619, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8605093359947205, "num_tokens": 123040402.0, "step": 3224 }, { "epoch": 0.4102531484543951, "ewc_loss": 0.032183900475502014, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012042297748848796, "grad_norm": 4.466392993927002, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8589324951171875, "num_tokens": 123080086.0, "step": 3225 }, { "epoch": 0.4103803587329856, "ewc_loss": 0.03202967345714569, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001201014019898139, "grad_norm": 4.451009750366211, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8570206165313721, "num_tokens": 123120058.0, "step": 3226 }, { "epoch": 0.41050756901157615, "ewc_loss": 0.03203461691737175, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001201508566737175, "grad_norm": 4.459413051605225, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8574580550193787, "num_tokens": 123162003.0, "step": 3227 }, { "epoch": 0.4106347792901666, "ewc_loss": 0.03205231949687004, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001203278879984282, "grad_norm": 4.545095920562744, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8240703344345093, "num_tokens": 123202249.0, "step": 3228 }, { "epoch": 0.41076198956875715, "ewc_loss": 0.03218767046928406, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001204607033287175, "grad_norm": 4.476440906524658, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8494784832000732, "num_tokens": 123237401.0, "step": 3229 }, { "epoch": 0.4108891998473477, "ewc_loss": 0.03201552852988243, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011995996464975178, "grad_norm": 4.544487476348877, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8714460134506226, "num_tokens": 123269364.0, "step": 3230 }, { "epoch": 0.4110164101259382, "ewc_loss": 0.03205948695540428, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012039954890497029, "grad_norm": 4.466382026672363, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8588905334472656, "num_tokens": 123307743.0, "step": 3231 }, { "epoch": 0.4111436204045287, "ewc_loss": 0.0319775715470314, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011958039249293506, "grad_norm": 4.474039077758789, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8615038394927979, "num_tokens": 123340102.0, "step": 3232 }, { "epoch": 0.4112708306831192, "ewc_loss": 0.03206311911344528, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012043588503729552, "grad_norm": 4.491214275360107, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8512980937957764, "num_tokens": 123374956.0, "step": 3233 }, { "epoch": 0.41139804096170973, "ewc_loss": 0.032196082174777985, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012054481339873746, "grad_norm": 4.491481781005859, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8653932809829712, "num_tokens": 123418070.0, "step": 3234 }, { "epoch": 0.4115252512403002, "ewc_loss": 0.032081037759780884, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012061505549354479, "grad_norm": 4.558620452880859, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8538281917572021, "num_tokens": 123453183.0, "step": 3235 }, { "epoch": 0.41165246151889073, "ewc_loss": 0.0320977047085762, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012078172585461289, "grad_norm": 4.445635795593262, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8570985198020935, "num_tokens": 123491526.0, "step": 3236 }, { "epoch": 0.41177967179748126, "ewc_loss": 0.03217814117670059, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001203653882839717, "grad_norm": 4.518509387969971, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8571489453315735, "num_tokens": 123529743.0, "step": 3237 }, { "epoch": 0.41190688207607173, "ewc_loss": 0.03226453438401222, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012122932093916461, "grad_norm": 4.444482803344727, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8590033054351807, "num_tokens": 123573402.0, "step": 3238 }, { "epoch": 0.41203409235466226, "ewc_loss": 0.032213255763053894, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.0001207165332743898, "grad_norm": 4.709898948669434, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8540202379226685, "num_tokens": 123604167.0, "step": 3239 }, { "epoch": 0.4121613026332528, "ewc_loss": 0.03239164128899574, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012250039435457438, "grad_norm": 4.402907371520996, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8527218103408813, "num_tokens": 123648104.0, "step": 3240 }, { "epoch": 0.41228851291184326, "ewc_loss": 0.03211991861462593, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00011978316615568474, "grad_norm": 4.464334964752197, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8458244204521179, "num_tokens": 123689678.0, "step": 3241 }, { "epoch": 0.4124157231904338, "ewc_loss": 0.03234357014298439, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012201969366287813, "grad_norm": 4.464168071746826, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8640564680099487, "num_tokens": 123729257.0, "step": 3242 }, { "epoch": 0.4125429334690243, "ewc_loss": 0.03219040855765343, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012048806820530444, "grad_norm": 4.4629740715026855, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8569502234458923, "num_tokens": 123772251.0, "step": 3243 }, { "epoch": 0.4126701437476148, "ewc_loss": 0.03212212398648262, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012102592882001773, "grad_norm": 4.498905658721924, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8684518337249756, "num_tokens": 123810345.0, "step": 3244 }, { "epoch": 0.4127973540262053, "ewc_loss": 0.03225746005773544, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012115859135519713, "grad_norm": 4.441028118133545, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8526314496994019, "num_tokens": 123855276.0, "step": 3245 }, { "epoch": 0.41292456430479585, "ewc_loss": 0.03222161531448364, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012080015585524961, "grad_norm": 4.441171169281006, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8587875366210938, "num_tokens": 123896613.0, "step": 3246 }, { "epoch": 0.4130517745833863, "ewc_loss": 0.03214609622955322, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012126565707148984, "grad_norm": 4.47733736038208, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8395422697067261, "num_tokens": 123940713.0, "step": 3247 }, { "epoch": 0.41317898486197685, "ewc_loss": 0.032123442739248276, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012103912013117224, "grad_norm": 4.475030899047852, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8455068469047546, "num_tokens": 123981485.0, "step": 3248 }, { "epoch": 0.4133061951405674, "ewc_loss": 0.03211938589811325, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012099853483960032, "grad_norm": 4.533409595489502, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8567402362823486, "num_tokens": 124018159.0, "step": 3249 }, { "epoch": 0.41343340541915785, "ewc_loss": 0.032115619629621506, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012096088903490454, "grad_norm": 4.481760025024414, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8662686347961426, "num_tokens": 124049911.0, "step": 3250 }, { "epoch": 0.4135606156977484, "ewc_loss": 0.0320880189538002, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001206848828587681, "grad_norm": 4.481134414672852, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8763357400894165, "num_tokens": 124084260.0, "step": 3251 }, { "epoch": 0.4136878259763389, "ewc_loss": 0.03213841840624809, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012118887389078736, "grad_norm": 4.824445724487305, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.858761191368103, "num_tokens": 124122741.0, "step": 3252 }, { "epoch": 0.4138150362549294, "ewc_loss": 0.03236066922545433, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012219068594276905, "grad_norm": 4.431413650512695, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8533084392547607, "num_tokens": 124162793.0, "step": 3253 }, { "epoch": 0.4139422465335199, "ewc_loss": 0.031959302723407745, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00011939772957703099, "grad_norm": 4.489770889282227, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8487843871116638, "num_tokens": 124206889.0, "step": 3254 }, { "epoch": 0.41406945681211044, "ewc_loss": 0.0321476049721241, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012128074740758166, "grad_norm": 4.471900463104248, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8669320940971375, "num_tokens": 124244551.0, "step": 3255 }, { "epoch": 0.4141966670907009, "ewc_loss": 0.03206663951277733, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012047109339619055, "grad_norm": 4.441673755645752, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8581651449203491, "num_tokens": 124290485.0, "step": 3256 }, { "epoch": 0.41432387736929144, "ewc_loss": 0.03215078264474869, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001213125215144828, "grad_norm": 4.55109977722168, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8633319139480591, "num_tokens": 124323763.0, "step": 3257 }, { "epoch": 0.41445108764788197, "ewc_loss": 0.03228749334812164, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012145893560955301, "grad_norm": 4.472663879394531, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8564180135726929, "num_tokens": 124363318.0, "step": 3258 }, { "epoch": 0.41457829792647244, "ewc_loss": 0.03214621543884277, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012126682850066572, "grad_norm": 4.47885274887085, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8782849311828613, "num_tokens": 124400845.0, "step": 3259 }, { "epoch": 0.41470550820506297, "ewc_loss": 0.03216557949781418, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001214604708366096, "grad_norm": 4.426501274108887, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.863438606262207, "num_tokens": 124442546.0, "step": 3260 }, { "epoch": 0.4148327184836535, "ewc_loss": 0.03215910866856575, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012139577302150428, "grad_norm": 4.587934970855713, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8513767719268799, "num_tokens": 124477737.0, "step": 3261 }, { "epoch": 0.41495992876224397, "ewc_loss": 0.03225123882293701, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.0001223170547746122, "grad_norm": 4.5103607177734375, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8442320823669434, "num_tokens": 124518081.0, "step": 3262 }, { "epoch": 0.4150871390408345, "ewc_loss": 0.03210880979895592, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012089278607163578, "grad_norm": 4.508896350860596, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8456005454063416, "num_tokens": 124556361.0, "step": 3263 }, { "epoch": 0.415214349319425, "ewc_loss": 0.03217518329620361, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012155653530498967, "grad_norm": 4.500185012817383, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.855860710144043, "num_tokens": 124588490.0, "step": 3264 }, { "epoch": 0.4153415595980155, "ewc_loss": 0.03218346834182739, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012163937935838476, "grad_norm": 4.470042705535889, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8549866676330566, "num_tokens": 124624656.0, "step": 3265 }, { "epoch": 0.415468769876606, "ewc_loss": 0.0321984589099884, "ewc_loss_diag": 2.002716064453125e-05, "ewc_loss_parallel": 0.00012178927136119455, "grad_norm": 4.563337802886963, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8418570756912231, "num_tokens": 124660681.0, "step": 3266 }, { "epoch": 0.41559598015519655, "ewc_loss": 0.0323624387383461, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012220836651977152, "grad_norm": 4.4567999839782715, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8574525713920593, "num_tokens": 124703448.0, "step": 3267 }, { "epoch": 0.415723190433787, "ewc_loss": 0.03230905532836914, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012167455133749172, "grad_norm": 4.50830602645874, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.85434889793396, "num_tokens": 124742078.0, "step": 3268 }, { "epoch": 0.41585040071237755, "ewc_loss": 0.032365042716264725, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012223439989611506, "grad_norm": 4.5466694831848145, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8433910608291626, "num_tokens": 124779894.0, "step": 3269 }, { "epoch": 0.4159776109909681, "ewc_loss": 0.03236673027276993, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012225130922161043, "grad_norm": 4.5165300369262695, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.851578950881958, "num_tokens": 124817918.0, "step": 3270 }, { "epoch": 0.41610482126955856, "ewc_loss": 0.03234010562300682, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.000121985052828677, "grad_norm": 4.438978672027588, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8722793459892273, "num_tokens": 124854839.0, "step": 3271 }, { "epoch": 0.4162320315481491, "ewc_loss": 0.03235670179128647, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012215101742185652, "grad_norm": 4.626817226409912, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8412497043609619, "num_tokens": 124886779.0, "step": 3272 }, { "epoch": 0.4163592418267396, "ewc_loss": 0.032437700778245926, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012296099157538265, "grad_norm": 4.485150337219238, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8645535111427307, "num_tokens": 124927919.0, "step": 3273 }, { "epoch": 0.4164864521053301, "ewc_loss": 0.032293614000082016, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012152011186117306, "grad_norm": 4.579261302947998, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.853263795375824, "num_tokens": 124966475.0, "step": 3274 }, { "epoch": 0.4166136623839206, "ewc_loss": 0.03244122490286827, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012299623631406575, "grad_norm": 4.474099636077881, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8316704630851746, "num_tokens": 125004110.0, "step": 3275 }, { "epoch": 0.41674087266251114, "ewc_loss": 0.032316695898771286, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012175095616839826, "grad_norm": 4.465243816375732, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8695381879806519, "num_tokens": 125043164.0, "step": 3276 }, { "epoch": 0.4168680829411016, "ewc_loss": 0.032403238117694855, "ewc_loss_diag": 2.014636993408203e-05, "ewc_loss_parallel": 0.00012261635856702924, "grad_norm": 4.482398986816406, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8595521450042725, "num_tokens": 125084255.0, "step": 3277 }, { "epoch": 0.41699529321969214, "ewc_loss": 0.032482173293828964, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012218502524774522, "grad_norm": 4.495297431945801, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8692619204521179, "num_tokens": 125124971.0, "step": 3278 }, { "epoch": 0.41712250349828267, "ewc_loss": 0.03250967711210251, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012246005644556135, "grad_norm": 4.55116081237793, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.85274338722229, "num_tokens": 125161470.0, "step": 3279 }, { "epoch": 0.4172497137768732, "ewc_loss": 0.03250517696142197, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012241503281984478, "grad_norm": 4.509184837341309, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8397839069366455, "num_tokens": 125199117.0, "step": 3280 }, { "epoch": 0.41737692405546367, "ewc_loss": 0.032486893236637115, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012223220255691558, "grad_norm": 4.565354824066162, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8372704982757568, "num_tokens": 125238136.0, "step": 3281 }, { "epoch": 0.4175041343340542, "ewc_loss": 0.03251586854457855, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012252196029294282, "grad_norm": 4.445150852203369, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8534896969795227, "num_tokens": 125281041.0, "step": 3282 }, { "epoch": 0.41763134461264473, "ewc_loss": 0.03247685730457306, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012213183799758554, "grad_norm": 4.545199394226074, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8586850762367249, "num_tokens": 125314739.0, "step": 3283 }, { "epoch": 0.4177585548912352, "ewc_loss": 0.03268515318632126, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.0001229941117344424, "grad_norm": 4.533938884735107, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8496189117431641, "num_tokens": 125353129.0, "step": 3284 }, { "epoch": 0.41788576516982573, "ewc_loss": 0.032461874186992645, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012198203330626711, "grad_norm": 4.4598870277404785, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8540758490562439, "num_tokens": 125394092.0, "step": 3285 }, { "epoch": 0.41801297544841626, "ewc_loss": 0.03251710534095764, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.0001225343148689717, "grad_norm": 4.521356105804443, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8498976230621338, "num_tokens": 125438262.0, "step": 3286 }, { "epoch": 0.41814018572700673, "ewc_loss": 0.03251970559358597, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012256033369340003, "grad_norm": 4.472252368927002, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8545227646827698, "num_tokens": 125477842.0, "step": 3287 }, { "epoch": 0.41826739600559726, "ewc_loss": 0.032479800283908844, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012216130562592298, "grad_norm": 4.492509841918945, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8669371008872986, "num_tokens": 125517236.0, "step": 3288 }, { "epoch": 0.4183946062841878, "ewc_loss": 0.032511476427316666, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.0001224780426127836, "grad_norm": 4.451288223266602, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.848947286605835, "num_tokens": 125561598.0, "step": 3289 }, { "epoch": 0.41852181656277826, "ewc_loss": 0.03249238431453705, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.0001222871069330722, "grad_norm": 4.549074649810791, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.85331130027771, "num_tokens": 125599104.0, "step": 3290 }, { "epoch": 0.4186490268413688, "ewc_loss": 0.03255540132522583, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012291727762203664, "grad_norm": 4.539932727813721, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8555648922920227, "num_tokens": 125635286.0, "step": 3291 }, { "epoch": 0.4187762371199593, "ewc_loss": 0.032490819692611694, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012227146362420171, "grad_norm": 4.743338108062744, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8604975938796997, "num_tokens": 125669534.0, "step": 3292 }, { "epoch": 0.4189034473985498, "ewc_loss": 0.03260307013988495, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012339399836491793, "grad_norm": 4.482753753662109, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8545449376106262, "num_tokens": 125707966.0, "step": 3293 }, { "epoch": 0.4190306576771403, "ewc_loss": 0.032374441623687744, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012110767420381308, "grad_norm": 4.586492538452148, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8510576486587524, "num_tokens": 125741448.0, "step": 3294 }, { "epoch": 0.41915786795573085, "ewc_loss": 0.03254977986216545, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012286107812542468, "grad_norm": 4.509926795959473, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8483327627182007, "num_tokens": 125776887.0, "step": 3295 }, { "epoch": 0.4192850782343213, "ewc_loss": 0.03244205564260483, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012178382166894153, "grad_norm": 4.556906700134277, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8601253628730774, "num_tokens": 125807899.0, "step": 3296 }, { "epoch": 0.41941228851291185, "ewc_loss": 0.032551802694797516, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012288131983950734, "grad_norm": 4.48320198059082, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8626168370246887, "num_tokens": 125847711.0, "step": 3297 }, { "epoch": 0.4195394987915024, "ewc_loss": 0.03245772421360016, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012194054579595104, "grad_norm": 4.495418548583984, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8689157962799072, "num_tokens": 125883248.0, "step": 3298 }, { "epoch": 0.41966670907009285, "ewc_loss": 0.032541632652282715, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012277961650397629, "grad_norm": 4.464739799499512, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8441777229309082, "num_tokens": 125925771.0, "step": 3299 }, { "epoch": 0.4197939193486834, "ewc_loss": 0.032506294548511505, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012242620869074017, "grad_norm": 4.464749813079834, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8443425893783569, "num_tokens": 125968815.0, "step": 3300 }, { "epoch": 0.4199211296272739, "ewc_loss": 0.03254269063472748, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012279016664251685, "grad_norm": 4.477404594421387, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8505251407623291, "num_tokens": 126007761.0, "step": 3301 }, { "epoch": 0.4200483399058644, "ewc_loss": 0.03255508095026016, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012291409075260162, "grad_norm": 4.509944915771484, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8367332220077515, "num_tokens": 126047872.0, "step": 3302 }, { "epoch": 0.4201755501844549, "ewc_loss": 0.032589495182037354, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.0001232582435477525, "grad_norm": 4.528155326843262, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8624886274337769, "num_tokens": 126081273.0, "step": 3303 }, { "epoch": 0.42030276046304543, "ewc_loss": 0.03257071599364281, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012307045108173043, "grad_norm": 4.516702175140381, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8406122922897339, "num_tokens": 126121590.0, "step": 3304 }, { "epoch": 0.4204299707416359, "ewc_loss": 0.03259127959609032, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012327608419582248, "grad_norm": 4.502300262451172, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8383357524871826, "num_tokens": 126168396.0, "step": 3305 }, { "epoch": 0.42055718102022643, "ewc_loss": 0.032543666660785675, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012279994552955031, "grad_norm": 4.522670745849609, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8509483337402344, "num_tokens": 126205220.0, "step": 3306 }, { "epoch": 0.42068439129881696, "ewc_loss": 0.03256426751613617, "ewc_loss_diag": 2.0265579223632812e-05, "ewc_loss_parallel": 0.00012300597154535353, "grad_norm": 4.530070781707764, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8499284982681274, "num_tokens": 126239131.0, "step": 3307 }, { "epoch": 0.42081160157740743, "ewc_loss": 0.03268726170063019, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00012301519745960832, "grad_norm": 4.51196813583374, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8571489453315735, "num_tokens": 126271931.0, "step": 3308 }, { "epoch": 0.42093881185599796, "ewc_loss": 0.03278040140867233, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012272590538486838, "grad_norm": 4.476702690124512, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8640553951263428, "num_tokens": 126307730.0, "step": 3309 }, { "epoch": 0.4210660221345885, "ewc_loss": 0.032679133117198944, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00012293391046114266, "grad_norm": 4.513286590576172, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8577221035957336, "num_tokens": 126344528.0, "step": 3310 }, { "epoch": 0.42119323241317896, "ewc_loss": 0.032836902886629105, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012329089804552495, "grad_norm": 4.489136219024658, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8637248277664185, "num_tokens": 126377260.0, "step": 3311 }, { "epoch": 0.4213204426917695, "ewc_loss": 0.03293639421463013, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012306509597692639, "grad_norm": 4.549123287200928, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8519518375396729, "num_tokens": 126409723.0, "step": 3312 }, { "epoch": 0.42144765297036, "ewc_loss": 0.03296920657157898, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001233932562172413, "grad_norm": 4.451704025268555, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.859341025352478, "num_tokens": 126450929.0, "step": 3313 }, { "epoch": 0.4215748632489505, "ewc_loss": 0.032925352454185486, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012295469059608877, "grad_norm": 4.487462043762207, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8647869825363159, "num_tokens": 126486878.0, "step": 3314 }, { "epoch": 0.421702073527541, "ewc_loss": 0.032850231975317, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001234241935890168, "grad_norm": 4.551062107086182, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8433698415756226, "num_tokens": 126522884.0, "step": 3315 }, { "epoch": 0.42182928380613155, "ewc_loss": 0.03297848999500275, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012348608288448304, "grad_norm": 4.510139465332031, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8566085696220398, "num_tokens": 126560848.0, "step": 3316 }, { "epoch": 0.421956494084722, "ewc_loss": 0.032972630113363266, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001234274823218584, "grad_norm": 4.576262474060059, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8577505350112915, "num_tokens": 126592947.0, "step": 3317 }, { "epoch": 0.42208370436331255, "ewc_loss": 0.033008821308612823, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001237893884535879, "grad_norm": 4.493658065795898, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8661007285118103, "num_tokens": 126630631.0, "step": 3318 }, { "epoch": 0.4222109146419031, "ewc_loss": 0.032929785549640656, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012299904483370483, "grad_norm": 4.470952987670898, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8525309562683105, "num_tokens": 126673395.0, "step": 3319 }, { "epoch": 0.42233812492049355, "ewc_loss": 0.03298281878232956, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012352934572845697, "grad_norm": 4.518085956573486, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8716881275177002, "num_tokens": 126711504.0, "step": 3320 }, { "epoch": 0.4224653351990841, "ewc_loss": 0.03298818692564964, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012358302774373442, "grad_norm": 4.463082313537598, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.868476390838623, "num_tokens": 126753217.0, "step": 3321 }, { "epoch": 0.4225925454776746, "ewc_loss": 0.032910339534282684, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012280458759050816, "grad_norm": 4.457702159881592, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8619872331619263, "num_tokens": 126792982.0, "step": 3322 }, { "epoch": 0.4227197557562651, "ewc_loss": 0.03295133635401726, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012321454414632171, "grad_norm": 4.6102447509765625, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8501186966896057, "num_tokens": 126823771.0, "step": 3323 }, { "epoch": 0.4228469660348556, "ewc_loss": 0.033020246773958206, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012390365009196103, "grad_norm": 4.43361234664917, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8595505952835083, "num_tokens": 126867137.0, "step": 3324 }, { "epoch": 0.42297417631344614, "ewc_loss": 0.0328841358423233, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012254253670107573, "grad_norm": 4.508606433868408, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8619349598884583, "num_tokens": 126902680.0, "step": 3325 }, { "epoch": 0.4231013865920366, "ewc_loss": 0.032889243215322495, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012381431588437408, "grad_norm": 4.518928050994873, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.86860191822052, "num_tokens": 126941037.0, "step": 3326 }, { "epoch": 0.42322859687062714, "ewc_loss": 0.032782189548015594, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001227437605848536, "grad_norm": 4.5107502937316895, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8598657846450806, "num_tokens": 126980009.0, "step": 3327 }, { "epoch": 0.42335580714921767, "ewc_loss": 0.03295358270406723, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012323699775151908, "grad_norm": 4.485986709594727, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.868567705154419, "num_tokens": 127020022.0, "step": 3328 }, { "epoch": 0.42348301742780814, "ewc_loss": 0.03281958028674126, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001231176865985617, "grad_norm": 4.507596969604492, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8371239900588989, "num_tokens": 127060500.0, "step": 3329 }, { "epoch": 0.42361022770639867, "ewc_loss": 0.03282977268099785, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012321959366090596, "grad_norm": 4.496998310089111, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8527186512947083, "num_tokens": 127095886.0, "step": 3330 }, { "epoch": 0.4237374379849892, "ewc_loss": 0.03281319513916969, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001230538182426244, "grad_norm": 4.585029125213623, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8462482690811157, "num_tokens": 127134334.0, "step": 3331 }, { "epoch": 0.4238646482635797, "ewc_loss": 0.0328814759850502, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001237366523128003, "grad_norm": 4.5730156898498535, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8487005829811096, "num_tokens": 127174288.0, "step": 3332 }, { "epoch": 0.4239918585421702, "ewc_loss": 0.03279875963926315, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012290947779547423, "grad_norm": 4.534050464630127, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8532335758209229, "num_tokens": 127210311.0, "step": 3333 }, { "epoch": 0.4241190688207607, "ewc_loss": 0.032811325043439865, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012303511903155595, "grad_norm": 4.430100917816162, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8774677515029907, "num_tokens": 127252813.0, "step": 3334 }, { "epoch": 0.42424627909935125, "ewc_loss": 0.03277216851711273, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012264354154467583, "grad_norm": 4.58371639251709, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8444986343383789, "num_tokens": 127285847.0, "step": 3335 }, { "epoch": 0.4243734893779417, "ewc_loss": 0.032895661890506744, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001238785043824464, "grad_norm": 4.574172019958496, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.870546817779541, "num_tokens": 127325360.0, "step": 3336 }, { "epoch": 0.42450069965653225, "ewc_loss": 0.0327797532081604, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012271941523067653, "grad_norm": 4.457781791687012, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8598183393478394, "num_tokens": 127366446.0, "step": 3337 }, { "epoch": 0.4246279099351228, "ewc_loss": 0.032759033143520355, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012251221050973982, "grad_norm": 4.534027576446533, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8656191229820251, "num_tokens": 127402608.0, "step": 3338 }, { "epoch": 0.42475512021371326, "ewc_loss": 0.032858408987522125, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012350597535260022, "grad_norm": 4.483996868133545, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8622677326202393, "num_tokens": 127445355.0, "step": 3339 }, { "epoch": 0.4248823304923038, "ewc_loss": 0.032779760658741, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001227195025421679, "grad_norm": 4.548027992248535, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8503519892692566, "num_tokens": 127480523.0, "step": 3340 }, { "epoch": 0.4250095407708943, "ewc_loss": 0.03287309408187866, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012365281872916967, "grad_norm": 4.495707988739014, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8760281801223755, "num_tokens": 127520787.0, "step": 3341 }, { "epoch": 0.4251367510494848, "ewc_loss": 0.032806988805532455, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012299175432417542, "grad_norm": 4.484263896942139, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8525418043136597, "num_tokens": 127565576.0, "step": 3342 }, { "epoch": 0.4252639613280753, "ewc_loss": 0.032829806208610535, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001232199283549562, "grad_norm": 4.557775020599365, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8670370578765869, "num_tokens": 127599144.0, "step": 3343 }, { "epoch": 0.42539117160666584, "ewc_loss": 0.032879043370485306, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012371230695862323, "grad_norm": 4.506368160247803, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8749388456344604, "num_tokens": 127637361.0, "step": 3344 }, { "epoch": 0.4255183818852563, "ewc_loss": 0.032800786197185516, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012292974861338735, "grad_norm": 4.533578872680664, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8463582992553711, "num_tokens": 127675139.0, "step": 3345 }, { "epoch": 0.42564559216384684, "ewc_loss": 0.032842956483364105, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012335143401287496, "grad_norm": 4.459722995758057, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8622032403945923, "num_tokens": 127718822.0, "step": 3346 }, { "epoch": 0.42577280244243737, "ewc_loss": 0.032784733921289444, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012276922643650323, "grad_norm": 4.475917339324951, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8606773018836975, "num_tokens": 127763332.0, "step": 3347 }, { "epoch": 0.42590001272102784, "ewc_loss": 0.032844021916389465, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012336211511865258, "grad_norm": 4.5227813720703125, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8686568140983582, "num_tokens": 127800346.0, "step": 3348 }, { "epoch": 0.42602722299961837, "ewc_loss": 0.03283651918172836, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012328708544373512, "grad_norm": 4.545497894287109, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8540021777153015, "num_tokens": 127845142.0, "step": 3349 }, { "epoch": 0.4261544332782089, "ewc_loss": 0.03283482417464256, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012327010335866362, "grad_norm": 4.582638740539551, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.839225709438324, "num_tokens": 127881131.0, "step": 3350 }, { "epoch": 0.4262816435567994, "ewc_loss": 0.03285077214241028, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001234295777976513, "grad_norm": 4.600910186767578, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8706169128417969, "num_tokens": 127917941.0, "step": 3351 }, { "epoch": 0.4264088538353899, "ewc_loss": 0.03264149650931358, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00012255753972567618, "grad_norm": 4.493122100830078, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8442169427871704, "num_tokens": 127963345.0, "step": 3352 }, { "epoch": 0.42653606411398043, "ewc_loss": 0.032618582248687744, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00012232839071657509, "grad_norm": 4.517035961151123, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8660275340080261, "num_tokens": 128004839.0, "step": 3353 }, { "epoch": 0.4266632743925709, "ewc_loss": 0.032771360129117966, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012263546523172408, "grad_norm": 4.513914585113525, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8507659435272217, "num_tokens": 128042709.0, "step": 3354 }, { "epoch": 0.42679048467116143, "ewc_loss": 0.03264065459370613, "ewc_loss_diag": 2.0384788513183594e-05, "ewc_loss_parallel": 0.00012254911416675895, "grad_norm": 4.572170734405518, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8413459062576294, "num_tokens": 128085069.0, "step": 3355 }, { "epoch": 0.42691769494975196, "ewc_loss": 0.03280907869338989, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012301266542635858, "grad_norm": 4.539932727813721, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8511394262313843, "num_tokens": 128124300.0, "step": 3356 }, { "epoch": 0.42704490522834243, "ewc_loss": 0.03275003284215927, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012242219236213714, "grad_norm": 4.466984272003174, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8540925979614258, "num_tokens": 128164410.0, "step": 3357 }, { "epoch": 0.42717211550693296, "ewc_loss": 0.03280555084347725, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001229773770319298, "grad_norm": 4.535522937774658, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8664699792861938, "num_tokens": 128206524.0, "step": 3358 }, { "epoch": 0.4272993257855235, "ewc_loss": 0.03284623473882675, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012338421947788447, "grad_norm": 4.51094913482666, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8481327891349792, "num_tokens": 128246387.0, "step": 3359 }, { "epoch": 0.42742653606411396, "ewc_loss": 0.032829031348228455, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012321218673605472, "grad_norm": 4.500705718994141, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8668226003646851, "num_tokens": 128287794.0, "step": 3360 }, { "epoch": 0.4275537463427045, "ewc_loss": 0.03287059813737869, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012362786219455302, "grad_norm": 4.5477447509765625, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8541597723960876, "num_tokens": 128328467.0, "step": 3361 }, { "epoch": 0.427680956621295, "ewc_loss": 0.03283657878637314, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012328765296842903, "grad_norm": 4.553499221801758, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.865665078163147, "num_tokens": 128364319.0, "step": 3362 }, { "epoch": 0.4278081668998855, "ewc_loss": 0.032867901027202606, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012360088294371963, "grad_norm": 4.539649486541748, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8517401814460754, "num_tokens": 128403892.0, "step": 3363 }, { "epoch": 0.427935377178476, "ewc_loss": 0.03283695504069328, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001232914364663884, "grad_norm": 4.504951477050781, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.8368910551071167, "num_tokens": 128446760.0, "step": 3364 }, { "epoch": 0.42806258745706655, "ewc_loss": 0.03283298388123512, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012325170973781496, "grad_norm": 4.549759387969971, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8324871063232422, "num_tokens": 128485498.0, "step": 3365 }, { "epoch": 0.428189797735657, "ewc_loss": 0.03287959843873978, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012371783668641, "grad_norm": 4.49137544631958, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.844840407371521, "num_tokens": 128527871.0, "step": 3366 }, { "epoch": 0.42831700801424755, "ewc_loss": 0.03286033868789673, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001235252566402778, "grad_norm": 4.549775123596191, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8429110050201416, "num_tokens": 128567792.0, "step": 3367 }, { "epoch": 0.4284442182928381, "ewc_loss": 0.032922592014074326, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012414780212566257, "grad_norm": 4.51400089263916, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8460089564323425, "num_tokens": 128607217.0, "step": 3368 }, { "epoch": 0.42857142857142855, "ewc_loss": 0.03288166597485542, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012373852950986475, "grad_norm": 4.5068182945251465, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8626869916915894, "num_tokens": 128642787.0, "step": 3369 }, { "epoch": 0.4286986388500191, "ewc_loss": 0.0329388789832592, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012431065260898322, "grad_norm": 4.507615566253662, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8675260543823242, "num_tokens": 128680462.0, "step": 3370 }, { "epoch": 0.4288258491286096, "ewc_loss": 0.03292521834373474, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012417406833264977, "grad_norm": 4.639316558837891, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8528405427932739, "num_tokens": 128715101.0, "step": 3371 }, { "epoch": 0.4289530594072001, "ewc_loss": 0.03298420086503029, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001247638720087707, "grad_norm": 4.517385482788086, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8484710454940796, "num_tokens": 128749601.0, "step": 3372 }, { "epoch": 0.4290802696857906, "ewc_loss": 0.032886482775211334, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001237866817973554, "grad_norm": 4.553467273712158, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.852676272392273, "num_tokens": 128788744.0, "step": 3373 }, { "epoch": 0.42920747996438113, "ewc_loss": 0.032986223697662354, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012478411372285336, "grad_norm": 4.467743396759033, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8522749543190002, "num_tokens": 128828102.0, "step": 3374 }, { "epoch": 0.4293346902429716, "ewc_loss": 0.032920338213443756, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012412526120897382, "grad_norm": 4.562816619873047, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8505725860595703, "num_tokens": 128863610.0, "step": 3375 }, { "epoch": 0.42946190052156213, "ewc_loss": 0.03299931809306145, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001249150518560782, "grad_norm": 4.5290207862854, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8706717491149902, "num_tokens": 128898836.0, "step": 3376 }, { "epoch": 0.42958911080015266, "ewc_loss": 0.032936595380306244, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012428783520590514, "grad_norm": 4.51890230178833, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8571602702140808, "num_tokens": 128934821.0, "step": 3377 }, { "epoch": 0.42971632107874314, "ewc_loss": 0.03297578543424606, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001246797328349203, "grad_norm": 4.544226169586182, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8546895980834961, "num_tokens": 128966914.0, "step": 3378 }, { "epoch": 0.42984353135733366, "ewc_loss": 0.0329928956925869, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001248508197022602, "grad_norm": 4.462935924530029, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8623620867729187, "num_tokens": 129008305.0, "step": 3379 }, { "epoch": 0.4299707416359242, "ewc_loss": 0.032982923090457916, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012475110997911543, "grad_norm": 4.529785633087158, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8768976330757141, "num_tokens": 129043331.0, "step": 3380 }, { "epoch": 0.4300979519145147, "ewc_loss": 0.03317240998148918, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001254252711078152, "grad_norm": 4.4905877113342285, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.861210823059082, "num_tokens": 129083513.0, "step": 3381 }, { "epoch": 0.4302251621931052, "ewc_loss": 0.03309926390647888, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001246937899850309, "grad_norm": 4.63594388961792, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8554774522781372, "num_tokens": 129113463.0, "step": 3382 }, { "epoch": 0.4303523724716957, "ewc_loss": 0.03322698175907135, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012597101158462465, "grad_norm": 4.552012920379639, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8324673175811768, "num_tokens": 129153080.0, "step": 3383 }, { "epoch": 0.43047958275028625, "ewc_loss": 0.033115312457084656, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012485429760999978, "grad_norm": 4.546834945678711, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8530887365341187, "num_tokens": 129193338.0, "step": 3384 }, { "epoch": 0.4306067930288767, "ewc_loss": 0.0331529825925827, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001252310030395165, "grad_norm": 4.540910243988037, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8464305400848389, "num_tokens": 129234816.0, "step": 3385 }, { "epoch": 0.43073400330746725, "ewc_loss": 0.033136509358882904, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012506624625530094, "grad_norm": 4.504648685455322, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8595762252807617, "num_tokens": 129275228.0, "step": 3386 }, { "epoch": 0.4308612135860578, "ewc_loss": 0.03312773257493973, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012497851275838912, "grad_norm": 4.5559611320495605, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.837866485118866, "num_tokens": 129309747.0, "step": 3387 }, { "epoch": 0.43098842386464825, "ewc_loss": 0.03317306563258171, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012543181946966797, "grad_norm": 4.539882183074951, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8584537506103516, "num_tokens": 129342580.0, "step": 3388 }, { "epoch": 0.4311156341432388, "ewc_loss": 0.03314383327960968, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001251394860446453, "grad_norm": 4.6127190589904785, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8517483472824097, "num_tokens": 129376599.0, "step": 3389 }, { "epoch": 0.4312428444218293, "ewc_loss": 0.03307422250509262, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012566409714054316, "grad_norm": 4.544094562530518, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8533263206481934, "num_tokens": 129414783.0, "step": 3390 }, { "epoch": 0.4313700547004198, "ewc_loss": 0.03299758583307266, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012489773507695645, "grad_norm": 4.549476623535156, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8547816276550293, "num_tokens": 129450287.0, "step": 3391 }, { "epoch": 0.4314972649790103, "ewc_loss": 0.03305312991142273, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012545316712930799, "grad_norm": 4.59867000579834, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8374113440513611, "num_tokens": 129486733.0, "step": 3392 }, { "epoch": 0.43162447525760084, "ewc_loss": 0.033031824976205826, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.0001252401270903647, "grad_norm": 4.547070026397705, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8557919859886169, "num_tokens": 129524392.0, "step": 3393 }, { "epoch": 0.4317516855361913, "ewc_loss": 0.03301459178328514, "ewc_loss_diag": 2.0503997802734375e-05, "ewc_loss_parallel": 0.00012506778875831515, "grad_norm": 4.556745529174805, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8413057327270508, "num_tokens": 129558452.0, "step": 3394 }, { "epoch": 0.43187889581478184, "ewc_loss": 0.033172305673360825, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000125424237921834, "grad_norm": 4.505903720855713, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8543232083320618, "num_tokens": 129597076.0, "step": 3395 }, { "epoch": 0.43200610609337237, "ewc_loss": 0.033130910247564316, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012501027958933264, "grad_norm": 4.512291431427002, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8505644202232361, "num_tokens": 129635861.0, "step": 3396 }, { "epoch": 0.43213331637196284, "ewc_loss": 0.03319917991757393, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012569296814035624, "grad_norm": 4.520442962646484, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8512754440307617, "num_tokens": 129673760.0, "step": 3397 }, { "epoch": 0.43226052665055337, "ewc_loss": 0.03316523879766464, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012535357382148504, "grad_norm": 4.49631404876709, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8574416637420654, "num_tokens": 129710730.0, "step": 3398 }, { "epoch": 0.4323877369291439, "ewc_loss": 0.03320426493883133, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001257438416359946, "grad_norm": 4.559238910675049, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8528232574462891, "num_tokens": 129748724.0, "step": 3399 }, { "epoch": 0.43251494720773437, "ewc_loss": 0.033232465386390686, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012602581409737468, "grad_norm": 4.562575340270996, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8681259155273438, "num_tokens": 129787478.0, "step": 3400 }, { "epoch": 0.4326421574863249, "ewc_loss": 0.033203087747097015, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012573206913657486, "grad_norm": 4.50385856628418, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8661050796508789, "num_tokens": 129829852.0, "step": 3401 }, { "epoch": 0.4327693677649154, "ewc_loss": 0.03315963223576546, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012529749074019492, "grad_norm": 4.484679698944092, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8545578122138977, "num_tokens": 129869279.0, "step": 3402 }, { "epoch": 0.4328965780435059, "ewc_loss": 0.03320595622062683, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012576073640957475, "grad_norm": 4.604560852050781, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8496386408805847, "num_tokens": 129903347.0, "step": 3403 }, { "epoch": 0.4330237883220964, "ewc_loss": 0.03322914242744446, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012599260662682354, "grad_norm": 4.500110626220703, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.866012454032898, "num_tokens": 129940028.0, "step": 3404 }, { "epoch": 0.43315099860068695, "ewc_loss": 0.03311879560351372, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012488912034314126, "grad_norm": 4.53648567199707, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.864365816116333, "num_tokens": 129976438.0, "step": 3405 }, { "epoch": 0.4332782088792774, "ewc_loss": 0.03318626806139946, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012556384899653494, "grad_norm": 4.468348026275635, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8607149720191956, "num_tokens": 130015698.0, "step": 3406 }, { "epoch": 0.43340541915786795, "ewc_loss": 0.03314879164099693, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012518907897174358, "grad_norm": 4.513162136077881, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8708381652832031, "num_tokens": 130056000.0, "step": 3407 }, { "epoch": 0.4335326294364585, "ewc_loss": 0.033220499753952026, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012590616825036705, "grad_norm": 4.514703273773193, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8533973693847656, "num_tokens": 130096990.0, "step": 3408 }, { "epoch": 0.43365983971504896, "ewc_loss": 0.033199481666088104, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012569599493872374, "grad_norm": 4.553178310394287, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8656566143035889, "num_tokens": 130128556.0, "step": 3409 }, { "epoch": 0.4337870499936395, "ewc_loss": 0.03320689871907234, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001257701514987275, "grad_norm": 4.462401390075684, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8630836009979248, "num_tokens": 130170607.0, "step": 3410 }, { "epoch": 0.43391426027223, "ewc_loss": 0.033186957240104675, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012557076115626842, "grad_norm": 4.566566467285156, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.851839005947113, "num_tokens": 130212511.0, "step": 3411 }, { "epoch": 0.4340414705508205, "ewc_loss": 0.03323658928275108, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012606705422513187, "grad_norm": 4.575644493103027, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8491672277450562, "num_tokens": 130249246.0, "step": 3412 }, { "epoch": 0.434168680829411, "ewc_loss": 0.03318590298295021, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012556021101772785, "grad_norm": 4.5081892013549805, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8756040930747986, "num_tokens": 130291250.0, "step": 3413 }, { "epoch": 0.43429589110800154, "ewc_loss": 0.03315969556570053, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012529813102446496, "grad_norm": 4.598308563232422, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8364490270614624, "num_tokens": 130327094.0, "step": 3414 }, { "epoch": 0.434423101386592, "ewc_loss": 0.033304985612630844, "ewc_loss_diag": 2.0742416381835938e-05, "ewc_loss_parallel": 0.00012553032138384879, "grad_norm": 4.507111549377441, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8475386500358582, "num_tokens": 130366482.0, "step": 3415 }, { "epoch": 0.43455031166518254, "ewc_loss": 0.03314613178372383, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012516249262262136, "grad_norm": 4.525952339172363, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8555343151092529, "num_tokens": 130407416.0, "step": 3416 }, { "epoch": 0.43467752194377307, "ewc_loss": 0.03315243124961853, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012522547331172973, "grad_norm": 4.4919586181640625, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8620842695236206, "num_tokens": 130447125.0, "step": 3417 }, { "epoch": 0.43480473222236354, "ewc_loss": 0.03313485160470009, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012504970072768629, "grad_norm": 4.532068252563477, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8621845841407776, "num_tokens": 130480955.0, "step": 3418 }, { "epoch": 0.43493194250095407, "ewc_loss": 0.03318510204553604, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012555220746435225, "grad_norm": 4.554682731628418, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.868716835975647, "num_tokens": 130511771.0, "step": 3419 }, { "epoch": 0.4350591527795446, "ewc_loss": 0.033169060945510864, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012539175804704428, "grad_norm": 4.5123772621154785, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8544602394104004, "num_tokens": 130550080.0, "step": 3420 }, { "epoch": 0.4351863630581351, "ewc_loss": 0.03315233439207077, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012522452743723989, "grad_norm": 4.521151542663574, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8451983332633972, "num_tokens": 130588109.0, "step": 3421 }, { "epoch": 0.4353135733367256, "ewc_loss": 0.033206306397914886, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012576425797306, "grad_norm": 4.5302934646606445, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8426079750061035, "num_tokens": 130626832.0, "step": 3422 }, { "epoch": 0.43544078361531613, "ewc_loss": 0.0332140177488327, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012584133946802467, "grad_norm": 4.543380260467529, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8602774143218994, "num_tokens": 130662202.0, "step": 3423 }, { "epoch": 0.4355679938939066, "ewc_loss": 0.03320540487766266, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012575523578561842, "grad_norm": 4.509477138519287, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8515183925628662, "num_tokens": 130700747.0, "step": 3424 }, { "epoch": 0.43569520417249713, "ewc_loss": 0.03319191187620163, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000125620310427621, "grad_norm": 4.495258331298828, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8542435169219971, "num_tokens": 130742849.0, "step": 3425 }, { "epoch": 0.43582241445108766, "ewc_loss": 0.03321705013513565, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012587165110744536, "grad_norm": 4.498321056365967, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8700752854347229, "num_tokens": 130780156.0, "step": 3426 }, { "epoch": 0.43594962472967813, "ewc_loss": 0.03321162238717079, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000125817401567474, "grad_norm": 4.4695658683776855, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8564924597740173, "num_tokens": 130825824.0, "step": 3427 }, { "epoch": 0.43607683500826866, "ewc_loss": 0.03321388363838196, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012584000069182366, "grad_norm": 4.556593418121338, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8642032146453857, "num_tokens": 130863603.0, "step": 3428 }, { "epoch": 0.4362040452868592, "ewc_loss": 0.03324798122048378, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012618099572136998, "grad_norm": 4.54391622543335, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8577752113342285, "num_tokens": 130906689.0, "step": 3429 }, { "epoch": 0.4363312555654497, "ewc_loss": 0.033207014203071594, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.000125771330203861, "grad_norm": 4.523588180541992, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8536780476570129, "num_tokens": 130947785.0, "step": 3430 }, { "epoch": 0.4364584658440402, "ewc_loss": 0.03319710120558739, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012567218800541013, "grad_norm": 4.520852565765381, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8662582039833069, "num_tokens": 130985489.0, "step": 3431 }, { "epoch": 0.4365856761226307, "ewc_loss": 0.0332065224647522, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012576639710459858, "grad_norm": 4.517963886260986, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8650435209274292, "num_tokens": 131025274.0, "step": 3432 }, { "epoch": 0.43671288640122125, "ewc_loss": 0.033160895109176636, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012531013635452837, "grad_norm": 4.498374938964844, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8554325103759766, "num_tokens": 131060132.0, "step": 3433 }, { "epoch": 0.4368400966798117, "ewc_loss": 0.033203668892383575, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012573786079883575, "grad_norm": 4.500731468200684, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8451778888702393, "num_tokens": 131105109.0, "step": 3434 }, { "epoch": 0.43696730695840225, "ewc_loss": 0.033223748207092285, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012593863357324153, "grad_norm": 4.545718193054199, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8722488284111023, "num_tokens": 131142094.0, "step": 3435 }, { "epoch": 0.4370945172369928, "ewc_loss": 0.033221885561943054, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012592003622557968, "grad_norm": 4.614194393157959, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8555458784103394, "num_tokens": 131174648.0, "step": 3436 }, { "epoch": 0.43722172751558325, "ewc_loss": 0.033244963735342026, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012615080049727112, "grad_norm": 4.494570732116699, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8715708255767822, "num_tokens": 131213447.0, "step": 3437 }, { "epoch": 0.4373489377941738, "ewc_loss": 0.03313794732093811, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012508065265137702, "grad_norm": 4.598572731018066, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8527528643608093, "num_tokens": 131249622.0, "step": 3438 }, { "epoch": 0.4374761480727643, "ewc_loss": 0.03330256789922714, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012672685261350125, "grad_norm": 4.493814945220947, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8718199133872986, "num_tokens": 131285883.0, "step": 3439 }, { "epoch": 0.4376033583513548, "ewc_loss": 0.03314819931983948, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012518317089416087, "grad_norm": 4.537387847900391, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8599104285240173, "num_tokens": 131322774.0, "step": 3440 }, { "epoch": 0.4377305686299453, "ewc_loss": 0.033258795738220215, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001262891455553472, "grad_norm": 4.488924980163574, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.868924617767334, "num_tokens": 131364513.0, "step": 3441 }, { "epoch": 0.43785777890853583, "ewc_loss": 0.033179815858602524, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.0001254993403563276, "grad_norm": 4.544295310974121, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8481749296188354, "num_tokens": 131402299.0, "step": 3442 }, { "epoch": 0.4379849891871263, "ewc_loss": 0.03328816220164299, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012658280320465565, "grad_norm": 4.471480369567871, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8643759489059448, "num_tokens": 131447830.0, "step": 3443 }, { "epoch": 0.43811219946571683, "ewc_loss": 0.033427558839321136, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001255353563465178, "grad_norm": 4.608981609344482, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.849053680896759, "num_tokens": 131481058.0, "step": 3444 }, { "epoch": 0.43823940974430736, "ewc_loss": 0.03357353061437607, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012699508806690574, "grad_norm": 4.580101013183594, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8534000515937805, "num_tokens": 131518376.0, "step": 3445 }, { "epoch": 0.43836662002289783, "ewc_loss": 0.03323071449995041, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012600829359143972, "grad_norm": 4.577712535858154, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8427823781967163, "num_tokens": 131553702.0, "step": 3446 }, { "epoch": 0.43849383030148836, "ewc_loss": 0.033278390765190125, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012648507254198194, "grad_norm": 4.586946964263916, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8645851612091064, "num_tokens": 131587026.0, "step": 3447 }, { "epoch": 0.4386210405800789, "ewc_loss": 0.03324391320347786, "ewc_loss_diag": 2.0623207092285156e-05, "ewc_loss_parallel": 0.00012614030856639147, "grad_norm": 4.520290374755859, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8633767366409302, "num_tokens": 131621145.0, "step": 3448 }, { "epoch": 0.43874825085866936, "ewc_loss": 0.03347856551408768, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012604543007910252, "grad_norm": 4.489837169647217, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.863406777381897, "num_tokens": 131661618.0, "step": 3449 }, { "epoch": 0.4388754611372599, "ewc_loss": 0.03350934758782387, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012635324674192816, "grad_norm": 4.601640224456787, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8515459895133972, "num_tokens": 131693840.0, "step": 3450 }, { "epoch": 0.4390026714158504, "ewc_loss": 0.03356001898646355, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001268599444301799, "grad_norm": 4.539825916290283, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8595668077468872, "num_tokens": 131730856.0, "step": 3451 }, { "epoch": 0.4391298816944409, "ewc_loss": 0.033502306789159775, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001262828300241381, "grad_norm": 4.5616774559021, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8328066468238831, "num_tokens": 131769172.0, "step": 3452 }, { "epoch": 0.4392570919730314, "ewc_loss": 0.03357207030057907, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.000126980448840186, "grad_norm": 4.550673484802246, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.859492838382721, "num_tokens": 131810481.0, "step": 3453 }, { "epoch": 0.43938430225162195, "ewc_loss": 0.03352166712284088, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012647644325625151, "grad_norm": 4.605270862579346, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8589528799057007, "num_tokens": 131850742.0, "step": 3454 }, { "epoch": 0.4395115125302124, "ewc_loss": 0.03356383740901947, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012689815775956959, "grad_norm": 4.602876663208008, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.854871392250061, "num_tokens": 131886675.0, "step": 3455 }, { "epoch": 0.43963872280880295, "ewc_loss": 0.03352809697389603, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012654073361773044, "grad_norm": 4.559612274169922, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8629452586174011, "num_tokens": 131926831.0, "step": 3456 }, { "epoch": 0.4397659330873935, "ewc_loss": 0.033520542085170746, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012646519462577999, "grad_norm": 4.598700046539307, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8559689521789551, "num_tokens": 131962265.0, "step": 3457 }, { "epoch": 0.43989314336598395, "ewc_loss": 0.033548399806022644, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001267437619389966, "grad_norm": 4.595836639404297, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8514971733093262, "num_tokens": 131999906.0, "step": 3458 }, { "epoch": 0.4400203536445745, "ewc_loss": 0.03354595601558685, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012671931472141296, "grad_norm": 4.632729530334473, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8729119300842285, "num_tokens": 132031330.0, "step": 3459 }, { "epoch": 0.440147563923165, "ewc_loss": 0.03354932367801666, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012675301695708185, "grad_norm": 4.583806991577148, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8560119867324829, "num_tokens": 132065049.0, "step": 3460 }, { "epoch": 0.4402747742017555, "ewc_loss": 0.033653080463409424, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.0001265698520001024, "grad_norm": 4.508745193481445, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8619288206100464, "num_tokens": 132100993.0, "step": 3461 }, { "epoch": 0.440401984480346, "ewc_loss": 0.03353266417980194, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012658641207963228, "grad_norm": 4.5852532386779785, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8555271029472351, "num_tokens": 132138169.0, "step": 3462 }, { "epoch": 0.44052919475893654, "ewc_loss": 0.033611997961997986, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012737975339405239, "grad_norm": 4.605246543884277, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8633749485015869, "num_tokens": 132170498.0, "step": 3463 }, { "epoch": 0.440656405037527, "ewc_loss": 0.03357885032892227, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012704827531706542, "grad_norm": 4.550498008728027, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8388639688491821, "num_tokens": 132213583.0, "step": 3464 }, { "epoch": 0.44078361531611754, "ewc_loss": 0.03355204313993454, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012678018538281322, "grad_norm": 4.4992594718933105, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8523152470588684, "num_tokens": 132255427.0, "step": 3465 }, { "epoch": 0.44091082559470807, "ewc_loss": 0.03352409601211548, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012650071585085243, "grad_norm": 4.537159442901611, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8545887470245361, "num_tokens": 132295795.0, "step": 3466 }, { "epoch": 0.44103803587329854, "ewc_loss": 0.03361184149980545, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001273781672352925, "grad_norm": 4.566946506500244, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8394503593444824, "num_tokens": 132336881.0, "step": 3467 }, { "epoch": 0.44116524615188907, "ewc_loss": 0.03357464820146561, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012700623483397067, "grad_norm": 4.522167205810547, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8571692705154419, "num_tokens": 132375136.0, "step": 3468 }, { "epoch": 0.4412924564304796, "ewc_loss": 0.033546317368745804, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012672293814830482, "grad_norm": 4.551654815673828, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8543537259101868, "num_tokens": 132412893.0, "step": 3469 }, { "epoch": 0.44141966670907007, "ewc_loss": 0.03368835896253586, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012692266318481416, "grad_norm": 4.658361911773682, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8513993620872498, "num_tokens": 132444360.0, "step": 3470 }, { "epoch": 0.4415468769876606, "ewc_loss": 0.03360707312822342, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001273305097129196, "grad_norm": 4.586026668548584, "learning_rate": 1e-06, "loss": 0.5561, "mean_token_accuracy": 0.8320233821868896, "num_tokens": 132485022.0, "step": 3471 }, { "epoch": 0.4416740872662511, "ewc_loss": 0.033542610704898834, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.00012668585986830294, "grad_norm": 4.584014415740967, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.841977596282959, "num_tokens": 132522800.0, "step": 3472 }, { "epoch": 0.4418012975448416, "ewc_loss": 0.03354920446872711, "ewc_loss_diag": 2.086162567138672e-05, "ewc_loss_parallel": 0.0001267518091481179, "grad_norm": 4.51409387588501, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8586360216140747, "num_tokens": 132560527.0, "step": 3473 }, { "epoch": 0.4419285078234321, "ewc_loss": 0.03368101269006729, "ewc_loss_diag": 2.09808349609375e-05, "ewc_loss_parallel": 0.00012684920511674136, "grad_norm": 4.544722557067871, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8546521663665771, "num_tokens": 132601075.0, "step": 3474 }, { "epoch": 0.44205571810202265, "ewc_loss": 0.033797331154346466, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012679168139584363, "grad_norm": 4.554385185241699, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8468631505966187, "num_tokens": 132639851.0, "step": 3475 }, { "epoch": 0.4421829283806131, "ewc_loss": 0.03380689397454262, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.000126887287478894, "grad_norm": 4.5065507888793945, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8589901328086853, "num_tokens": 132679849.0, "step": 3476 }, { "epoch": 0.44231013865920366, "ewc_loss": 0.033824726939201355, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012706563575193286, "grad_norm": 4.531946659088135, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8612934350967407, "num_tokens": 132721020.0, "step": 3477 }, { "epoch": 0.4424373489377942, "ewc_loss": 0.03386234492063522, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001274418318644166, "grad_norm": 4.595296382904053, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.843458890914917, "num_tokens": 132760933.0, "step": 3478 }, { "epoch": 0.44256455921638466, "ewc_loss": 0.033866409212350845, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001274824608117342, "grad_norm": 4.499900817871094, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8671298027038574, "num_tokens": 132800627.0, "step": 3479 }, { "epoch": 0.4426917694949752, "ewc_loss": 0.03379417583346367, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001267601182917133, "grad_norm": 4.577883720397949, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8562705516815186, "num_tokens": 132833232.0, "step": 3480 }, { "epoch": 0.4428189797735657, "ewc_loss": 0.033893562853336334, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012775397044606507, "grad_norm": 4.52145528793335, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8546048402786255, "num_tokens": 132874670.0, "step": 3481 }, { "epoch": 0.44294619005215624, "ewc_loss": 0.03380407392978668, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012685908586718142, "grad_norm": 4.552482604980469, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8589587807655334, "num_tokens": 132916187.0, "step": 3482 }, { "epoch": 0.4430734003307467, "ewc_loss": 0.033892299979925156, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001277413684874773, "grad_norm": 4.557734489440918, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8599064350128174, "num_tokens": 132956948.0, "step": 3483 }, { "epoch": 0.44320061060933724, "ewc_loss": 0.03385031968355179, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012732156028505415, "grad_norm": 4.592526435852051, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8617916107177734, "num_tokens": 132995213.0, "step": 3484 }, { "epoch": 0.44332782088792777, "ewc_loss": 0.033889465034008026, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012771300680469722, "grad_norm": 4.5697021484375, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8537655472755432, "num_tokens": 133038850.0, "step": 3485 }, { "epoch": 0.44345503116651824, "ewc_loss": 0.033845700323581696, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012727534340228885, "grad_norm": 4.5122246742248535, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8586367964744568, "num_tokens": 133080766.0, "step": 3486 }, { "epoch": 0.44358224144510877, "ewc_loss": 0.033822786062955856, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012704622349701822, "grad_norm": 4.611688137054443, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8529219627380371, "num_tokens": 133120845.0, "step": 3487 }, { "epoch": 0.4437094517236993, "ewc_loss": 0.033865075558423996, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012746910215355456, "grad_norm": 4.523075103759766, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8537143468856812, "num_tokens": 133166011.0, "step": 3488 }, { "epoch": 0.4438366620022898, "ewc_loss": 0.03382456302642822, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012706400593742728, "grad_norm": 4.595131874084473, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8540856838226318, "num_tokens": 133203507.0, "step": 3489 }, { "epoch": 0.4439638722808803, "ewc_loss": 0.03386715054512024, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012748986773658544, "grad_norm": 4.630502700805664, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8330246210098267, "num_tokens": 133240554.0, "step": 3490 }, { "epoch": 0.44409108255947083, "ewc_loss": 0.03385724127292633, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012739076919388026, "grad_norm": 4.714208602905273, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8391531705856323, "num_tokens": 133274987.0, "step": 3491 }, { "epoch": 0.4442182928380613, "ewc_loss": 0.033858075737953186, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012739913654513657, "grad_norm": 4.557126998901367, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8289510011672974, "num_tokens": 133319287.0, "step": 3492 }, { "epoch": 0.44434550311665183, "ewc_loss": 0.03377559781074524, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012657433398999274, "grad_norm": 4.64121675491333, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8605535626411438, "num_tokens": 133353670.0, "step": 3493 }, { "epoch": 0.44447271339524236, "ewc_loss": 0.033869463950395584, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001275129907298833, "grad_norm": 4.5702385902404785, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8569855093955994, "num_tokens": 133391950.0, "step": 3494 }, { "epoch": 0.44459992367383283, "ewc_loss": 0.03376549482345581, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012647328549064696, "grad_norm": 4.548311233520508, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.861735463142395, "num_tokens": 133424758.0, "step": 3495 }, { "epoch": 0.44472713395242336, "ewc_loss": 0.033822957426309586, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012704794062301517, "grad_norm": 4.582028865814209, "learning_rate": 1e-06, "loss": 0.3718, "mean_token_accuracy": 0.8839478492736816, "num_tokens": 133459869.0, "step": 3496 }, { "epoch": 0.4448543442310139, "ewc_loss": 0.0338677279651165, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001274956448469311, "grad_norm": 4.65531063079834, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8557358980178833, "num_tokens": 133494598.0, "step": 3497 }, { "epoch": 0.44498155450960436, "ewc_loss": 0.033831436187028885, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012713273463305086, "grad_norm": 4.512770652770996, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8649806976318359, "num_tokens": 133536403.0, "step": 3498 }, { "epoch": 0.4451087647881949, "ewc_loss": 0.03380764275789261, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012689479626715183, "grad_norm": 4.681012153625488, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8350969552993774, "num_tokens": 133567319.0, "step": 3499 }, { "epoch": 0.4452359750667854, "ewc_loss": 0.03391318768262863, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012795024667866528, "grad_norm": 4.57481050491333, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8485004901885986, "num_tokens": 133604829.0, "step": 3500 }, { "epoch": 0.4453631853453759, "ewc_loss": 0.03377921134233475, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012661049549933523, "grad_norm": 4.563467979431152, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8561705350875854, "num_tokens": 133638373.0, "step": 3501 }, { "epoch": 0.4454903956239664, "ewc_loss": 0.033854153007268906, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012735989002976567, "grad_norm": 4.614808082580566, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8669421076774597, "num_tokens": 133674110.0, "step": 3502 }, { "epoch": 0.44561760590255695, "ewc_loss": 0.033828943967819214, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001271077780984342, "grad_norm": 4.534994125366211, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8576000928878784, "num_tokens": 133711005.0, "step": 3503 }, { "epoch": 0.4457448161811474, "ewc_loss": 0.033804431557655334, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012686268019024283, "grad_norm": 4.527640342712402, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.868390679359436, "num_tokens": 133746258.0, "step": 3504 }, { "epoch": 0.44587202645973795, "ewc_loss": 0.033852700144052505, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012734535266645253, "grad_norm": 4.5127272605896, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8646813035011292, "num_tokens": 133790695.0, "step": 3505 }, { "epoch": 0.4459992367383285, "ewc_loss": 0.0338534452021122, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012735280324704945, "grad_norm": 4.54489803314209, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8531297445297241, "num_tokens": 133830181.0, "step": 3506 }, { "epoch": 0.44612644701691895, "ewc_loss": 0.033879972994327545, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012761808466166258, "grad_norm": 4.533369064331055, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8607547283172607, "num_tokens": 133866849.0, "step": 3507 }, { "epoch": 0.4462536572955095, "ewc_loss": 0.0338425412774086, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012724379485007375, "grad_norm": 4.594974994659424, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8565897941589355, "num_tokens": 133906812.0, "step": 3508 }, { "epoch": 0.4463808675741, "ewc_loss": 0.03389132022857666, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001277315750485286, "grad_norm": 4.497488975524902, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8498450517654419, "num_tokens": 133948956.0, "step": 3509 }, { "epoch": 0.4465080778526905, "ewc_loss": 0.03383113443851471, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012712969328276813, "grad_norm": 4.589352130889893, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8397865295410156, "num_tokens": 133988764.0, "step": 3510 }, { "epoch": 0.446635288131281, "ewc_loss": 0.03392554819583893, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001280738360946998, "grad_norm": 4.508261680603027, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8614788055419922, "num_tokens": 134028148.0, "step": 3511 }, { "epoch": 0.44676249840987153, "ewc_loss": 0.03384172171354294, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001272355584660545, "grad_norm": 4.6042609214782715, "learning_rate": 1e-06, "loss": 0.5403, "mean_token_accuracy": 0.8321858048439026, "num_tokens": 134065909.0, "step": 3512 }, { "epoch": 0.446889708688462, "ewc_loss": 0.03395836800336838, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012840202543884516, "grad_norm": 4.520507335662842, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8628900647163391, "num_tokens": 134108342.0, "step": 3513 }, { "epoch": 0.44701691896705253, "ewc_loss": 0.03383301571011543, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012714852346107364, "grad_norm": 4.574398994445801, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8552123308181763, "num_tokens": 134145928.0, "step": 3514 }, { "epoch": 0.44714412924564306, "ewc_loss": 0.03393426910042763, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012816106027457863, "grad_norm": 4.521849155426025, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8524037599563599, "num_tokens": 134185955.0, "step": 3515 }, { "epoch": 0.44727133952423354, "ewc_loss": 0.03386970981955528, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001275154500035569, "grad_norm": 4.587896347045898, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8618934154510498, "num_tokens": 134225829.0, "step": 3516 }, { "epoch": 0.44739854980282406, "ewc_loss": 0.033935558050870895, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012817393871955574, "grad_norm": 4.549570083618164, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.846855878829956, "num_tokens": 134261381.0, "step": 3517 }, { "epoch": 0.4475257600814146, "ewc_loss": 0.03388785943388939, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012769695604220033, "grad_norm": 4.576628684997559, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8645539283752441, "num_tokens": 134294697.0, "step": 3518 }, { "epoch": 0.44765297036000506, "ewc_loss": 0.03395528718829155, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001283712190343067, "grad_norm": 4.533680438995361, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8769098520278931, "num_tokens": 134334349.0, "step": 3519 }, { "epoch": 0.4477801806385956, "ewc_loss": 0.03389682620763779, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001277866103919223, "grad_norm": 4.551616191864014, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8452813625335693, "num_tokens": 134373381.0, "step": 3520 }, { "epoch": 0.4479073909171861, "ewc_loss": 0.03392672538757324, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012808562314603478, "grad_norm": 4.627496242523193, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.861417829990387, "num_tokens": 134407952.0, "step": 3521 }, { "epoch": 0.4480346011957766, "ewc_loss": 0.033944882452487946, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012826717284042388, "grad_norm": 4.491996765136719, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8466088771820068, "num_tokens": 134458862.0, "step": 3522 }, { "epoch": 0.4481618114743671, "ewc_loss": 0.033888254314661026, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012770089961122721, "grad_norm": 4.618804454803467, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8536555767059326, "num_tokens": 134494007.0, "step": 3523 }, { "epoch": 0.44828902175295765, "ewc_loss": 0.03399309888482094, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001287493359996006, "grad_norm": 4.610199928283691, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8422404527664185, "num_tokens": 134534195.0, "step": 3524 }, { "epoch": 0.4484162320315481, "ewc_loss": 0.03393707051873207, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012818907271139324, "grad_norm": 4.6078691482543945, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8633565902709961, "num_tokens": 134570241.0, "step": 3525 }, { "epoch": 0.44854344231013865, "ewc_loss": 0.03396562859416008, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001284746394958347, "grad_norm": 4.519083023071289, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8551352620124817, "num_tokens": 134611904.0, "step": 3526 }, { "epoch": 0.4486706525887292, "ewc_loss": 0.03392239660024643, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012804231664631516, "grad_norm": 4.664788722991943, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8460519313812256, "num_tokens": 134644426.0, "step": 3527 }, { "epoch": 0.44879786286731965, "ewc_loss": 0.03400559350848198, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001288743078475818, "grad_norm": 4.549932479858398, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.864581286907196, "num_tokens": 134678979.0, "step": 3528 }, { "epoch": 0.4489250731459102, "ewc_loss": 0.03389408439397812, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012775918003171682, "grad_norm": 4.564293384552002, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8588707447052002, "num_tokens": 134717270.0, "step": 3529 }, { "epoch": 0.4490522834245007, "ewc_loss": 0.0339897982776165, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012871633225586265, "grad_norm": 4.590572357177734, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8568615913391113, "num_tokens": 134755464.0, "step": 3530 }, { "epoch": 0.44917949370309124, "ewc_loss": 0.03397573530673981, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012857571709901094, "grad_norm": 4.524616718292236, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8490253686904907, "num_tokens": 134793737.0, "step": 3531 }, { "epoch": 0.4493067039816817, "ewc_loss": 0.03393220156431198, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001281403674511239, "grad_norm": 4.546600341796875, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8513040542602539, "num_tokens": 134834332.0, "step": 3532 }, { "epoch": 0.44943391426027224, "ewc_loss": 0.03400636464357376, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012888202036265284, "grad_norm": 4.584907054901123, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8674488067626953, "num_tokens": 134871027.0, "step": 3533 }, { "epoch": 0.44956112453886277, "ewc_loss": 0.03398900106549263, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012870837235823274, "grad_norm": 4.554923057556152, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8569020628929138, "num_tokens": 134908529.0, "step": 3534 }, { "epoch": 0.44968833481745324, "ewc_loss": 0.03396423161029816, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.0001284606842091307, "grad_norm": 4.5848517417907715, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8662142753601074, "num_tokens": 134944177.0, "step": 3535 }, { "epoch": 0.44981554509604377, "ewc_loss": 0.03397450968623161, "ewc_loss_diag": 2.110004425048828e-05, "ewc_loss_parallel": 0.00012856344983447343, "grad_norm": 4.610067844390869, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8560259342193604, "num_tokens": 134977928.0, "step": 3536 }, { "epoch": 0.4499427553746343, "ewc_loss": 0.03414037451148033, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012900140427518636, "grad_norm": 4.576948165893555, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8563054800033569, "num_tokens": 135017554.0, "step": 3537 }, { "epoch": 0.45006996565322477, "ewc_loss": 0.034095313400030136, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012855078966822475, "grad_norm": 4.5391740798950195, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.862830638885498, "num_tokens": 135058694.0, "step": 3538 }, { "epoch": 0.4501971759318153, "ewc_loss": 0.03409275785088539, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001285252219531685, "grad_norm": 4.614933967590332, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8592167496681213, "num_tokens": 135094037.0, "step": 3539 }, { "epoch": 0.4503243862104058, "ewc_loss": 0.034182194620370865, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.00012941959721501917, "grad_norm": 4.589059829711914, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8505082130432129, "num_tokens": 135132758.0, "step": 3540 }, { "epoch": 0.4504515964889963, "ewc_loss": 0.034118134528398514, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.000128778992802836, "grad_norm": 4.607588291168213, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8679300546646118, "num_tokens": 135169398.0, "step": 3541 }, { "epoch": 0.4505788067675868, "ewc_loss": 0.034117866307497025, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.0001287763298023492, "grad_norm": 4.651512145996094, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8408259153366089, "num_tokens": 135204312.0, "step": 3542 }, { "epoch": 0.45070601704617735, "ewc_loss": 0.034264497458934784, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012902192247565836, "grad_norm": 4.576351165771484, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8539976477622986, "num_tokens": 135246287.0, "step": 3543 }, { "epoch": 0.4508332273247678, "ewc_loss": 0.03416728228330612, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012804976722691208, "grad_norm": 4.570026397705078, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8586989045143127, "num_tokens": 135280970.0, "step": 3544 }, { "epoch": 0.45096043760335836, "ewc_loss": 0.03411897271871567, "ewc_loss_diag": 2.1219253540039062e-05, "ewc_loss_parallel": 0.000128787403809838, "grad_norm": 4.56724739074707, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8782404661178589, "num_tokens": 135317873.0, "step": 3545 }, { "epoch": 0.4510876478819489, "ewc_loss": 0.034241192042827606, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001287888881051913, "grad_norm": 4.629845142364502, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8635044693946838, "num_tokens": 135354994.0, "step": 3546 }, { "epoch": 0.45121485816053936, "ewc_loss": 0.034229449927806854, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012867143959738314, "grad_norm": 4.6541242599487305, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8511691689491272, "num_tokens": 135393050.0, "step": 3547 }, { "epoch": 0.4513420684391299, "ewc_loss": 0.03424195945262909, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012879655696451664, "grad_norm": 4.534458160400391, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8614015579223633, "num_tokens": 135431704.0, "step": 3548 }, { "epoch": 0.4514692787177204, "ewc_loss": 0.034171778708696365, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012809474719688296, "grad_norm": 4.5968828201293945, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8345365524291992, "num_tokens": 135471306.0, "step": 3549 }, { "epoch": 0.4515964889963109, "ewc_loss": 0.034246645867824554, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012884339957963675, "grad_norm": 4.584375858306885, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8439897298812866, "num_tokens": 135509647.0, "step": 3550 }, { "epoch": 0.4517236992749014, "ewc_loss": 0.03424973785877228, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012887432239949703, "grad_norm": 4.578268051147461, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.864231288433075, "num_tokens": 135550469.0, "step": 3551 }, { "epoch": 0.45185090955349194, "ewc_loss": 0.03422752395272255, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012865220196545124, "grad_norm": 4.543669700622559, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8559520244598389, "num_tokens": 135589290.0, "step": 3552 }, { "epoch": 0.4519781198320824, "ewc_loss": 0.03425010293722153, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012887798948213458, "grad_norm": 4.634555816650391, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8503106832504272, "num_tokens": 135624416.0, "step": 3553 }, { "epoch": 0.45210533011067294, "ewc_loss": 0.034275591373443604, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012913288082927465, "grad_norm": 4.553643226623535, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8606781959533691, "num_tokens": 135662316.0, "step": 3554 }, { "epoch": 0.45223254038926347, "ewc_loss": 0.03425437584519386, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012892071390524507, "grad_norm": 4.587146759033203, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.858224093914032, "num_tokens": 135701739.0, "step": 3555 }, { "epoch": 0.45235975066785394, "ewc_loss": 0.03429321572184563, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001293091190746054, "grad_norm": 4.579618453979492, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8530499339103699, "num_tokens": 135741372.0, "step": 3556 }, { "epoch": 0.45248696094644447, "ewc_loss": 0.03423158451914787, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012869280180893838, "grad_norm": 4.603618144989014, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8786684274673462, "num_tokens": 135779945.0, "step": 3557 }, { "epoch": 0.452614171225035, "ewc_loss": 0.03424651920795441, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001288421481149271, "grad_norm": 4.602449893951416, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8508104085922241, "num_tokens": 135813179.0, "step": 3558 }, { "epoch": 0.4527413815036255, "ewc_loss": 0.034251049160957336, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012888746277894825, "grad_norm": 4.5769429206848145, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8599948883056641, "num_tokens": 135855147.0, "step": 3559 }, { "epoch": 0.452868591782216, "ewc_loss": 0.03423673287034035, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012874428648501635, "grad_norm": 4.568404674530029, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8645755648612976, "num_tokens": 135892990.0, "step": 3560 }, { "epoch": 0.45299580206080653, "ewc_loss": 0.03422531485557556, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012863009760621935, "grad_norm": 4.588342666625977, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8748762607574463, "num_tokens": 135926911.0, "step": 3561 }, { "epoch": 0.453123012339397, "ewc_loss": 0.034258127212524414, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012895824329461902, "grad_norm": 4.5545573234558105, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8650292158126831, "num_tokens": 135963530.0, "step": 3562 }, { "epoch": 0.45325022261798753, "ewc_loss": 0.03421655669808388, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001285425096284598, "grad_norm": 4.571829319000244, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8472258448600769, "num_tokens": 136000677.0, "step": 3563 }, { "epoch": 0.45337743289657806, "ewc_loss": 0.034271422773599625, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012909117504023015, "grad_norm": 4.605342388153076, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8464099168777466, "num_tokens": 136037283.0, "step": 3564 }, { "epoch": 0.45350464317516853, "ewc_loss": 0.03424610570073128, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012883801537100226, "grad_norm": 4.69615364074707, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8396387696266174, "num_tokens": 136071454.0, "step": 3565 }, { "epoch": 0.45363185345375906, "ewc_loss": 0.034285977482795715, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001292367378482595, "grad_norm": 4.535641670227051, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8455319404602051, "num_tokens": 136105638.0, "step": 3566 }, { "epoch": 0.4537590637323496, "ewc_loss": 0.034188736230134964, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001282643061131239, "grad_norm": 4.506891250610352, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8602322340011597, "num_tokens": 136147968.0, "step": 3567 }, { "epoch": 0.45388627401094006, "ewc_loss": 0.03428046405315399, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012918160064145923, "grad_norm": 4.569918632507324, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8577144145965576, "num_tokens": 136190468.0, "step": 3568 }, { "epoch": 0.4540134842895306, "ewc_loss": 0.03429482877254486, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012932525714859366, "grad_norm": 4.532327175140381, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8609756827354431, "num_tokens": 136229363.0, "step": 3569 }, { "epoch": 0.4541406945681211, "ewc_loss": 0.03427278250455856, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001291047956328839, "grad_norm": 4.546615123748779, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8504391312599182, "num_tokens": 136268834.0, "step": 3570 }, { "epoch": 0.4542679048467116, "ewc_loss": 0.03432386368513107, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012961559696123004, "grad_norm": 4.596851825714111, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.833238422870636, "num_tokens": 136306761.0, "step": 3571 }, { "epoch": 0.4543951151253021, "ewc_loss": 0.03430813550949097, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012945830530952662, "grad_norm": 4.5379109382629395, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8556853532791138, "num_tokens": 136347736.0, "step": 3572 }, { "epoch": 0.45452232540389265, "ewc_loss": 0.03432171046733856, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012959406012669206, "grad_norm": 4.6647186279296875, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8597772121429443, "num_tokens": 136381009.0, "step": 3573 }, { "epoch": 0.4546495356824831, "ewc_loss": 0.03438221663236618, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001301990996580571, "grad_norm": 4.554516315460205, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8671301603317261, "num_tokens": 136418050.0, "step": 3574 }, { "epoch": 0.45477674596107365, "ewc_loss": 0.03427417203783989, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012911867816001177, "grad_norm": 4.5857625007629395, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8389173150062561, "num_tokens": 136459109.0, "step": 3575 }, { "epoch": 0.4549039562396642, "ewc_loss": 0.03433713689446449, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012974832498002797, "grad_norm": 4.567336082458496, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8521549701690674, "num_tokens": 136497993.0, "step": 3576 }, { "epoch": 0.45503116651825465, "ewc_loss": 0.03428281843662262, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.0001292051310883835, "grad_norm": 4.585273742675781, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8596667051315308, "num_tokens": 136534446.0, "step": 3577 }, { "epoch": 0.4551583767968452, "ewc_loss": 0.03444988280534744, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012965509085915983, "grad_norm": 4.626875877380371, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8400554656982422, "num_tokens": 136571117.0, "step": 3578 }, { "epoch": 0.4552855870754357, "ewc_loss": 0.03431399166584015, "ewc_loss_diag": 2.1338462829589844e-05, "ewc_loss_parallel": 0.00012951689132023603, "grad_norm": 4.622455596923828, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.863598108291626, "num_tokens": 136601706.0, "step": 3579 }, { "epoch": 0.4554127973540262, "ewc_loss": 0.03441530466079712, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012930929369758815, "grad_norm": 4.630768299102783, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8562002182006836, "num_tokens": 136639735.0, "step": 3580 }, { "epoch": 0.4555400076326167, "ewc_loss": 0.034445326775312424, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012960951426066458, "grad_norm": 4.615689754486084, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.84976726770401, "num_tokens": 136675647.0, "step": 3581 }, { "epoch": 0.45566721791120723, "ewc_loss": 0.0344214141368866, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001293704117415473, "grad_norm": 4.54762077331543, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8646496534347534, "num_tokens": 136713361.0, "step": 3582 }, { "epoch": 0.45579442818979776, "ewc_loss": 0.03438221663236618, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001289783976972103, "grad_norm": 4.5958251953125, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8552325367927551, "num_tokens": 136751440.0, "step": 3583 }, { "epoch": 0.45592163846838824, "ewc_loss": 0.03447531536221504, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012990940012969077, "grad_norm": 4.630610942840576, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8521556258201599, "num_tokens": 136786078.0, "step": 3584 }, { "epoch": 0.45604884874697876, "ewc_loss": 0.034440066665410995, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012955690908711404, "grad_norm": 4.522960662841797, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8645296096801758, "num_tokens": 136828313.0, "step": 3585 }, { "epoch": 0.4561760590255693, "ewc_loss": 0.03443414345383644, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001294976973440498, "grad_norm": 4.62187385559082, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8547887206077576, "num_tokens": 136862549.0, "step": 3586 }, { "epoch": 0.45630326930415976, "ewc_loss": 0.03451443463563919, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013030061381869018, "grad_norm": 4.553348541259766, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8591850996017456, "num_tokens": 136906423.0, "step": 3587 }, { "epoch": 0.4564304795827503, "ewc_loss": 0.034421827644109726, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012937451538164169, "grad_norm": 4.638999938964844, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8452237844467163, "num_tokens": 136941550.0, "step": 3588 }, { "epoch": 0.4565576898613408, "ewc_loss": 0.03465122729539871, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013044782099314034, "grad_norm": 4.567124366760254, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8602852821350098, "num_tokens": 136980234.0, "step": 3589 }, { "epoch": 0.4566849001399313, "ewc_loss": 0.03444884717464447, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012964472989551723, "grad_norm": 4.645909786224365, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8601784706115723, "num_tokens": 137023991.0, "step": 3590 }, { "epoch": 0.4568121104185218, "ewc_loss": 0.034524399787187576, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013040023623034358, "grad_norm": 4.577476978302002, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8476394414901733, "num_tokens": 137063261.0, "step": 3591 }, { "epoch": 0.45693932069711235, "ewc_loss": 0.03441382199525833, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001292944507440552, "grad_norm": 4.548616409301758, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.867089033126831, "num_tokens": 137103935.0, "step": 3592 }, { "epoch": 0.4570665309757028, "ewc_loss": 0.03446109592914581, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001297672133659944, "grad_norm": 4.610956192016602, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8566117286682129, "num_tokens": 137140194.0, "step": 3593 }, { "epoch": 0.45719374125429335, "ewc_loss": 0.03448165953159332, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012997286103200167, "grad_norm": 4.596088409423828, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8475700616836548, "num_tokens": 137179086.0, "step": 3594 }, { "epoch": 0.4573209515328839, "ewc_loss": 0.03444938361644745, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012965007044840604, "grad_norm": 4.602806568145752, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8626748323440552, "num_tokens": 137216887.0, "step": 3595 }, { "epoch": 0.45744816181147435, "ewc_loss": 0.03448732569813728, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013002949708607048, "grad_norm": 4.562542915344238, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8720163106918335, "num_tokens": 137255029.0, "step": 3596 }, { "epoch": 0.4575753720900649, "ewc_loss": 0.03446928784251213, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012984912609681487, "grad_norm": 4.61590576171875, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8599597215652466, "num_tokens": 137291399.0, "step": 3597 }, { "epoch": 0.4577025823686554, "ewc_loss": 0.03450874611735344, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013024371583014727, "grad_norm": 4.596046447753906, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8613170385360718, "num_tokens": 137326569.0, "step": 3598 }, { "epoch": 0.4578297926472459, "ewc_loss": 0.03448760136961937, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013003226194996387, "grad_norm": 4.56201696395874, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8523414134979248, "num_tokens": 137370035.0, "step": 3599 }, { "epoch": 0.4579570029258364, "ewc_loss": 0.03447675332427025, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001299237774219364, "grad_norm": 4.625283718109131, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8528885841369629, "num_tokens": 137407969.0, "step": 3600 }, { "epoch": 0.45808421320442694, "ewc_loss": 0.03451215475797653, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013027778186369687, "grad_norm": 4.598008155822754, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.855242133140564, "num_tokens": 137447256.0, "step": 3601 }, { "epoch": 0.4582114234830174, "ewc_loss": 0.03444909304380417, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012964718916919082, "grad_norm": 4.659066200256348, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8430531620979309, "num_tokens": 137479762.0, "step": 3602 }, { "epoch": 0.45833863376160794, "ewc_loss": 0.034522753208875656, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001303837780142203, "grad_norm": 4.6068549156188965, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8494421243667603, "num_tokens": 137521823.0, "step": 3603 }, { "epoch": 0.45846584404019847, "ewc_loss": 0.034412093460559845, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001292772067245096, "grad_norm": 4.620364189147949, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8550301790237427, "num_tokens": 137558174.0, "step": 3604 }, { "epoch": 0.45859305431878894, "ewc_loss": 0.03448091447353363, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012996539589948952, "grad_norm": 4.605724811553955, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8329353332519531, "num_tokens": 137597265.0, "step": 3605 }, { "epoch": 0.45872026459737947, "ewc_loss": 0.03443741053342819, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012953035184182227, "grad_norm": 4.56149435043335, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8669356107711792, "num_tokens": 137635954.0, "step": 3606 }, { "epoch": 0.45884747487597, "ewc_loss": 0.034443870186805725, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012959496234543622, "grad_norm": 4.602997779846191, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8589767217636108, "num_tokens": 137676137.0, "step": 3607 }, { "epoch": 0.45897468515456047, "ewc_loss": 0.034466709941625595, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012982335465494543, "grad_norm": 4.600602626800537, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8598026037216187, "num_tokens": 137716723.0, "step": 3608 }, { "epoch": 0.459101895433151, "ewc_loss": 0.03444816917181015, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012963793415110558, "grad_norm": 4.623602390289307, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8395596146583557, "num_tokens": 137752435.0, "step": 3609 }, { "epoch": 0.4592291057117415, "ewc_loss": 0.03445736691355705, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001297299168072641, "grad_norm": 4.621640205383301, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.847712516784668, "num_tokens": 137786684.0, "step": 3610 }, { "epoch": 0.459356315990332, "ewc_loss": 0.034479182213544846, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012994807912036777, "grad_norm": 4.567676067352295, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8479816913604736, "num_tokens": 137830668.0, "step": 3611 }, { "epoch": 0.4594835262689225, "ewc_loss": 0.034452613443136215, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012968239025212824, "grad_norm": 4.629349708557129, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8635470271110535, "num_tokens": 137865262.0, "step": 3612 }, { "epoch": 0.45961073654751305, "ewc_loss": 0.03448143228888512, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012997057638131082, "grad_norm": 4.612530708312988, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.863763153553009, "num_tokens": 137898867.0, "step": 3613 }, { "epoch": 0.4597379468261035, "ewc_loss": 0.03447504714131355, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012990672257728875, "grad_norm": 4.612823009490967, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.857791543006897, "num_tokens": 137935108.0, "step": 3614 }, { "epoch": 0.45986515710469406, "ewc_loss": 0.0344909131526947, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013006539666093886, "grad_norm": 4.6081719398498535, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8676753640174866, "num_tokens": 137977714.0, "step": 3615 }, { "epoch": 0.4599923673832846, "ewc_loss": 0.034473828971385956, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012989452807232738, "grad_norm": 4.736954212188721, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8682884573936462, "num_tokens": 138006345.0, "step": 3616 }, { "epoch": 0.46011957766187506, "ewc_loss": 0.03453072905540466, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013046353706158698, "grad_norm": 4.612395763397217, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8570059537887573, "num_tokens": 138045106.0, "step": 3617 }, { "epoch": 0.4602467879404656, "ewc_loss": 0.03442971408367157, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012945338676217943, "grad_norm": 4.672510147094727, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8508906364440918, "num_tokens": 138079993.0, "step": 3618 }, { "epoch": 0.4603739982190561, "ewc_loss": 0.034505460411310196, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013021085760556161, "grad_norm": 4.566254615783691, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8602192997932434, "num_tokens": 138118555.0, "step": 3619 }, { "epoch": 0.4605012084976466, "ewc_loss": 0.03441689908504486, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012932525714859366, "grad_norm": 4.615767478942871, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.864943265914917, "num_tokens": 138155096.0, "step": 3620 }, { "epoch": 0.4606284187762371, "ewc_loss": 0.034543104469776154, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013058731565251946, "grad_norm": 4.609729766845703, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8586956858634949, "num_tokens": 138194529.0, "step": 3621 }, { "epoch": 0.46075562905482764, "ewc_loss": 0.03449073061347008, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013006354856770486, "grad_norm": 5.225326061248779, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8310576677322388, "num_tokens": 138235520.0, "step": 3622 }, { "epoch": 0.4608828393334181, "ewc_loss": 0.03486304730176926, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013378674339037389, "grad_norm": 4.583703994750977, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8671808242797852, "num_tokens": 138271391.0, "step": 3623 }, { "epoch": 0.46101004961200864, "ewc_loss": 0.034199729561805725, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012715355842374265, "grad_norm": 4.5784173011779785, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8517548441886902, "num_tokens": 138313923.0, "step": 3624 }, { "epoch": 0.46113725989059917, "ewc_loss": 0.03457152470946312, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.000130871485453099, "grad_norm": 4.555285453796387, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.848873496055603, "num_tokens": 138355992.0, "step": 3625 }, { "epoch": 0.46126447016918964, "ewc_loss": 0.03442469611763954, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00012940321175847203, "grad_norm": 4.6096110343933105, "learning_rate": 1e-06, "loss": 0.394, "mean_token_accuracy": 0.8768229484558105, "num_tokens": 138393074.0, "step": 3626 }, { "epoch": 0.4613916804477802, "ewc_loss": 0.03452346473932266, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.0001303909084526822, "grad_norm": 4.635982990264893, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8638891577720642, "num_tokens": 138427854.0, "step": 3627 }, { "epoch": 0.4615188907263707, "ewc_loss": 0.0344945527613163, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013010177644900978, "grad_norm": 4.5859880447387695, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8659300208091736, "num_tokens": 138467359.0, "step": 3628 }, { "epoch": 0.4616461010049612, "ewc_loss": 0.03464251756668091, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013036072778049856, "grad_norm": 4.6971235275268555, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8446393609046936, "num_tokens": 138505355.0, "step": 3629 }, { "epoch": 0.4617733112835517, "ewc_loss": 0.03458213806152344, "ewc_loss_diag": 2.1457672119140625e-05, "ewc_loss_parallel": 0.00013097762712277472, "grad_norm": 4.571938514709473, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8610169291496277, "num_tokens": 138543703.0, "step": 3630 }, { "epoch": 0.46190052156214223, "ewc_loss": 0.03459727391600609, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012990827963221818, "grad_norm": 4.607244968414307, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8492894172668457, "num_tokens": 138579860.0, "step": 3631 }, { "epoch": 0.46202773184073276, "ewc_loss": 0.034711334854364395, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013104888785164803, "grad_norm": 4.685068130493164, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8672938346862793, "num_tokens": 138615755.0, "step": 3632 }, { "epoch": 0.46215494211932323, "ewc_loss": 0.03466765582561493, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013061208301223814, "grad_norm": 4.672327041625977, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8517663478851318, "num_tokens": 138647607.0, "step": 3633 }, { "epoch": 0.46228215239791376, "ewc_loss": 0.034638017416000366, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013031573325861245, "grad_norm": 4.696674346923828, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8619616627693176, "num_tokens": 138679284.0, "step": 3634 }, { "epoch": 0.4624093626765043, "ewc_loss": 0.03470127657055855, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013094830501358956, "grad_norm": 4.597876071929932, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8665124773979187, "num_tokens": 138719690.0, "step": 3635 }, { "epoch": 0.46253657295509476, "ewc_loss": 0.03460153564810753, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012995090219192207, "grad_norm": 4.607935428619385, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.846028208732605, "num_tokens": 138758031.0, "step": 3636 }, { "epoch": 0.4626637832336853, "ewc_loss": 0.03466084599494934, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013054402370471507, "grad_norm": 4.633640289306641, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8567454814910889, "num_tokens": 138797283.0, "step": 3637 }, { "epoch": 0.4627909935122758, "ewc_loss": 0.03465821593999863, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013051771384198219, "grad_norm": 4.611390113830566, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8538328409194946, "num_tokens": 138834672.0, "step": 3638 }, { "epoch": 0.4629182037908663, "ewc_loss": 0.03465604782104492, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013049600238446146, "grad_norm": 4.6048078536987305, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8680945038795471, "num_tokens": 138871105.0, "step": 3639 }, { "epoch": 0.4630454140694568, "ewc_loss": 0.03467726707458496, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013070821296423674, "grad_norm": 4.642833709716797, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8584817051887512, "num_tokens": 138903516.0, "step": 3640 }, { "epoch": 0.46317262434804735, "ewc_loss": 0.034944698214530945, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013094113091938198, "grad_norm": 4.601600646972656, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8555337190628052, "num_tokens": 138942021.0, "step": 3641 }, { "epoch": 0.4632998346266378, "ewc_loss": 0.03487778455018997, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013027197564952075, "grad_norm": 4.636029243469238, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8661655187606812, "num_tokens": 138978210.0, "step": 3642 }, { "epoch": 0.46342704490522835, "ewc_loss": 0.03494672104716301, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001309613580815494, "grad_norm": 4.607223033905029, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8599165678024292, "num_tokens": 139019122.0, "step": 3643 }, { "epoch": 0.4635542551838189, "ewc_loss": 0.034907497465610504, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013056909665465355, "grad_norm": 8.600786209106445, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8677681088447571, "num_tokens": 139062187.0, "step": 3644 }, { "epoch": 0.46368146546240935, "ewc_loss": 0.0388563871383667, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00017005800327751786, "grad_norm": 5.302435398101807, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8763815760612488, "num_tokens": 139102960.0, "step": 3645 }, { "epoch": 0.4638086757409999, "ewc_loss": 0.033840619027614594, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012234174937475473, "grad_norm": 4.3722381591796875, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8588890433311462, "num_tokens": 139137302.0, "step": 3646 }, { "epoch": 0.4639358860195904, "ewc_loss": 0.03516967594623566, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013563230459112674, "grad_norm": 4.938584804534912, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8387296795845032, "num_tokens": 139177195.0, "step": 3647 }, { "epoch": 0.4640630962981809, "ewc_loss": 0.03512522578239441, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013518780178856105, "grad_norm": 4.633344650268555, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8541979193687439, "num_tokens": 139214722.0, "step": 3648 }, { "epoch": 0.4641903065767714, "ewc_loss": 0.03465089946985245, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013044453226029873, "grad_norm": 4.710643768310547, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8559930324554443, "num_tokens": 139248569.0, "step": 3649 }, { "epoch": 0.46431751685536193, "ewc_loss": 0.0349489264190197, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013342482270672917, "grad_norm": 4.633590221405029, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8638290166854858, "num_tokens": 139289141.0, "step": 3650 }, { "epoch": 0.4644447271339524, "ewc_loss": 0.03469788283109665, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013091438449919224, "grad_norm": 4.629910945892334, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8741835355758667, "num_tokens": 139330791.0, "step": 3651 }, { "epoch": 0.46457193741254293, "ewc_loss": 0.034816157072782516, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001320971205132082, "grad_norm": 4.713005542755127, "learning_rate": 1e-06, "loss": 0.5568, "mean_token_accuracy": 0.8251181840896606, "num_tokens": 139373009.0, "step": 3652 }, { "epoch": 0.46469914769113346, "ewc_loss": 0.03475445136427879, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013148006109986454, "grad_norm": 4.679742336273193, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8564107418060303, "num_tokens": 139408894.0, "step": 3653 }, { "epoch": 0.46482635796972394, "ewc_loss": 0.034699417650699615, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013092973676975816, "grad_norm": 4.644588470458984, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.85572749376297, "num_tokens": 139444040.0, "step": 3654 }, { "epoch": 0.46495356824831446, "ewc_loss": 0.03469909727573395, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013092650624457747, "grad_norm": 4.636134147644043, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8636714816093445, "num_tokens": 139479675.0, "step": 3655 }, { "epoch": 0.465080778526905, "ewc_loss": 0.03468979895114899, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013083353405818343, "grad_norm": 4.643152236938477, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8648924827575684, "num_tokens": 139521792.0, "step": 3656 }, { "epoch": 0.46520798880549546, "ewc_loss": 0.03469140827655792, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013084961392451078, "grad_norm": 4.592229843139648, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8636824488639832, "num_tokens": 139563471.0, "step": 3657 }, { "epoch": 0.465335199084086, "ewc_loss": 0.03468438237905502, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013077937182970345, "grad_norm": 4.675078392028809, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8579503297805786, "num_tokens": 139604702.0, "step": 3658 }, { "epoch": 0.4654624093626765, "ewc_loss": 0.034715935587882996, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001310948864556849, "grad_norm": 4.621051788330078, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8557171821594238, "num_tokens": 139644573.0, "step": 3659 }, { "epoch": 0.465589619641267, "ewc_loss": 0.03462742641568184, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001302098244195804, "grad_norm": 4.66617488861084, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8438311815261841, "num_tokens": 139680924.0, "step": 3660 }, { "epoch": 0.4657168299198575, "ewc_loss": 0.03468315303325653, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013076706090942025, "grad_norm": 4.645627975463867, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8497556447982788, "num_tokens": 139724369.0, "step": 3661 }, { "epoch": 0.46584404019844805, "ewc_loss": 0.034642960876226425, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.000130365151562728, "grad_norm": 4.5835442543029785, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8572741150856018, "num_tokens": 139769412.0, "step": 3662 }, { "epoch": 0.4659712504770385, "ewc_loss": 0.03463340178132057, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013026956003159285, "grad_norm": 4.696492671966553, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8334234356880188, "num_tokens": 139808066.0, "step": 3663 }, { "epoch": 0.46609846075562905, "ewc_loss": 0.03467947617173195, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013073031732346863, "grad_norm": 8.549972534179688, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8631364703178406, "num_tokens": 139851169.0, "step": 3664 }, { "epoch": 0.4662256710342196, "ewc_loss": 0.03857719153165817, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00016726605826988816, "grad_norm": 5.305235385894775, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8675464391708374, "num_tokens": 139890550.0, "step": 3665 }, { "epoch": 0.46635288131281005, "ewc_loss": 0.03377406299114227, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00012167615932412446, "grad_norm": 4.315171241760254, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8312630653381348, "num_tokens": 139931496.0, "step": 3666 }, { "epoch": 0.4664800915914006, "ewc_loss": 0.03511270880699158, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001350626553175971, "grad_norm": 4.970026016235352, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.848239004611969, "num_tokens": 139965872.0, "step": 3667 }, { "epoch": 0.4666073018699911, "ewc_loss": 0.03519277647137642, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013586330169346184, "grad_norm": 4.6745476722717285, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8424563407897949, "num_tokens": 140000394.0, "step": 3668 }, { "epoch": 0.4667345121485816, "ewc_loss": 0.034671179950237274, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001306473568547517, "grad_norm": 4.715002536773682, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8413500785827637, "num_tokens": 140039967.0, "step": 3669 }, { "epoch": 0.4668617224271721, "ewc_loss": 0.03493942692875862, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013332981325220317, "grad_norm": 4.651381015777588, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8575233221054077, "num_tokens": 140075739.0, "step": 3670 }, { "epoch": 0.46698893270576264, "ewc_loss": 0.03476748988032341, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001316104462603107, "grad_norm": 4.742934226989746, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8608185648918152, "num_tokens": 140110016.0, "step": 3671 }, { "epoch": 0.4671161429843531, "ewc_loss": 0.034908927977085114, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001330248051090166, "grad_norm": 4.666031837463379, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8478531837463379, "num_tokens": 140147905.0, "step": 3672 }, { "epoch": 0.46724335326294364, "ewc_loss": 0.03477034717798233, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013163899711798877, "grad_norm": 4.682921886444092, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8524949550628662, "num_tokens": 140182284.0, "step": 3673 }, { "epoch": 0.46737056354153417, "ewc_loss": 0.03482380509376526, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.0001321736053796485, "grad_norm": 4.604141712188721, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8499236106872559, "num_tokens": 140220669.0, "step": 3674 }, { "epoch": 0.46749777382012464, "ewc_loss": 0.03477223217487335, "ewc_loss_diag": 2.1576881408691406e-05, "ewc_loss_parallel": 0.00013165788550395519, "grad_norm": 4.713542461395264, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8426047563552856, "num_tokens": 140258399.0, "step": 3675 }, { "epoch": 0.46762498409871517, "ewc_loss": 0.035117559134960175, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013266972382552922, "grad_norm": 6.11728048324585, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8711879253387451, "num_tokens": 140296015.0, "step": 3676 }, { "epoch": 0.4677521943773057, "ewc_loss": 0.03612842410802841, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00014277840091381222, "grad_norm": 4.715258598327637, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8471488952636719, "num_tokens": 140338618.0, "step": 3677 }, { "epoch": 0.46787940465589617, "ewc_loss": 0.034541912376880646, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00012691326264757663, "grad_norm": 4.6503520011901855, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8617749214172363, "num_tokens": 140379151.0, "step": 3678 }, { "epoch": 0.4680066149344867, "ewc_loss": 0.035194918513298035, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013344334729481488, "grad_norm": 4.654921531677246, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8430690169334412, "num_tokens": 140417740.0, "step": 3679 }, { "epoch": 0.4681338252130772, "ewc_loss": 0.03489149361848831, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013040905469097197, "grad_norm": 4.69965124130249, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8527935743331909, "num_tokens": 140452454.0, "step": 3680 }, { "epoch": 0.46826103549166775, "ewc_loss": 0.03503879904747009, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013188213051762432, "grad_norm": 4.656627655029297, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.833655595779419, "num_tokens": 140492981.0, "step": 3681 }, { "epoch": 0.4683882457702582, "ewc_loss": 0.034832146018743515, "ewc_loss_diag": 2.1696090698242188e-05, "ewc_loss_parallel": 0.00013103631499689072, "grad_norm": 4.6162567138671875, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8452856540679932, "num_tokens": 140534131.0, "step": 3682 }, { "epoch": 0.46851545604884876, "ewc_loss": 0.0350242480635643, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013173662591725588, "grad_norm": 4.664758205413818, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8494817018508911, "num_tokens": 140576761.0, "step": 3683 }, { "epoch": 0.4686426663274393, "ewc_loss": 0.03506370633840561, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013213120109867305, "grad_norm": 4.726268768310547, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.836431622505188, "num_tokens": 140609538.0, "step": 3684 }, { "epoch": 0.46876987660602976, "ewc_loss": 0.035070937126874924, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.0001322035095654428, "grad_norm": 4.681268692016602, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.852486252784729, "num_tokens": 140641802.0, "step": 3685 }, { "epoch": 0.4688970868846203, "ewc_loss": 0.03503602743148804, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013185439456719905, "grad_norm": 4.628833770751953, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8450276255607605, "num_tokens": 140685832.0, "step": 3686 }, { "epoch": 0.4690242971632108, "ewc_loss": 0.03501557186245918, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013164985284674913, "grad_norm": 4.670132637023926, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8442876935005188, "num_tokens": 140725333.0, "step": 3687 }, { "epoch": 0.4691515074418013, "ewc_loss": 0.035069018602371216, "ewc_loss_diag": 2.181529998779297e-05, "ewc_loss_parallel": 0.00013218431558925658, "grad_norm": 4.686577796936035, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8533679246902466, "num_tokens": 140760838.0, "step": 3688 }, { "epoch": 0.4692787177203918, "ewc_loss": 0.03517569229006767, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013203035632614046, "grad_norm": 4.646626949310303, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8453274965286255, "num_tokens": 140797367.0, "step": 3689 }, { "epoch": 0.46940592799898234, "ewc_loss": 0.03516284003853798, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013190183381084353, "grad_norm": 4.573862075805664, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.854788601398468, "num_tokens": 140839778.0, "step": 3690 }, { "epoch": 0.4695331382775728, "ewc_loss": 0.03518418222665787, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013211525219958276, "grad_norm": 4.644586086273193, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8509358167648315, "num_tokens": 140880340.0, "step": 3691 }, { "epoch": 0.46966034855616334, "ewc_loss": 0.03527246415615082, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013299805868882686, "grad_norm": 4.728348255157471, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8491262197494507, "num_tokens": 140915451.0, "step": 3692 }, { "epoch": 0.46978755883475387, "ewc_loss": 0.035242415964603424, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000132697619847022, "grad_norm": 4.687480926513672, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8522344827651978, "num_tokens": 140956967.0, "step": 3693 }, { "epoch": 0.46991476911334434, "ewc_loss": 0.03522054851055145, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013247891911305487, "grad_norm": 4.593842506408691, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8707823157310486, "num_tokens": 140996640.0, "step": 3694 }, { "epoch": 0.47004197939193487, "ewc_loss": 0.0352095291018486, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001323687465628609, "grad_norm": 4.712451934814453, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8541609048843384, "num_tokens": 141036094.0, "step": 3695 }, { "epoch": 0.4701691896705254, "ewc_loss": 0.035256072878837585, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000132834175019525, "grad_norm": 4.747289180755615, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8459495902061462, "num_tokens": 141069033.0, "step": 3696 }, { "epoch": 0.4702963999491159, "ewc_loss": 0.035209983587265015, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013237327220849693, "grad_norm": 4.625053405761719, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8729995489120483, "num_tokens": 141111075.0, "step": 3697 }, { "epoch": 0.4704236102277064, "ewc_loss": 0.03514719754457474, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001317454152740538, "grad_norm": 4.674026012420654, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8709427118301392, "num_tokens": 141145270.0, "step": 3698 }, { "epoch": 0.47055082050629693, "ewc_loss": 0.03519195318222046, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001321929885307327, "grad_norm": 4.625792980194092, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8708263635635376, "num_tokens": 141182132.0, "step": 3699 }, { "epoch": 0.4706780307848874, "ewc_loss": 0.03515169396996498, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013179038069210947, "grad_norm": 4.631133079528809, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8571332097053528, "num_tokens": 141217455.0, "step": 3700 }, { "epoch": 0.47080524106347793, "ewc_loss": 0.035205110907554626, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001323245232924819, "grad_norm": 4.678189277648926, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8555347919464111, "num_tokens": 141249680.0, "step": 3701 }, { "epoch": 0.47093245134206846, "ewc_loss": 0.035202134400606155, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013229477917775512, "grad_norm": 4.579259872436523, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8557922840118408, "num_tokens": 141292085.0, "step": 3702 }, { "epoch": 0.47105966162065893, "ewc_loss": 0.035158783197402954, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001318612921750173, "grad_norm": 4.60983943939209, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8637304306030273, "num_tokens": 141331455.0, "step": 3703 }, { "epoch": 0.47118687189924946, "ewc_loss": 0.035206932574510574, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013234275684226304, "grad_norm": 4.6442790031433105, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8556773662567139, "num_tokens": 141365623.0, "step": 3704 }, { "epoch": 0.47131408217784, "ewc_loss": 0.03521200641989708, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001323935139225796, "grad_norm": 4.692321300506592, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8467217683792114, "num_tokens": 141399033.0, "step": 3705 }, { "epoch": 0.47144129245643046, "ewc_loss": 0.03522419556975365, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013251538621261716, "grad_norm": 4.711269855499268, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8632051348686218, "num_tokens": 141430436.0, "step": 3706 }, { "epoch": 0.471568502735021, "ewc_loss": 0.03521835431456566, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001324569748248905, "grad_norm": 4.6697611808776855, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8732891082763672, "num_tokens": 141463169.0, "step": 3707 }, { "epoch": 0.4716957130136115, "ewc_loss": 0.035211123526096344, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001323846954619512, "grad_norm": 4.689521312713623, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8750580549240112, "num_tokens": 141495524.0, "step": 3708 }, { "epoch": 0.471822923292202, "ewc_loss": 0.03521377965807915, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013241123815532774, "grad_norm": 4.576474666595459, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8674471378326416, "num_tokens": 141533099.0, "step": 3709 }, { "epoch": 0.4719501335707925, "ewc_loss": 0.03517822176218033, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013205564755480736, "grad_norm": 4.637393474578857, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8615462779998779, "num_tokens": 141576389.0, "step": 3710 }, { "epoch": 0.47207734384938305, "ewc_loss": 0.03523191064596176, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013259252591524273, "grad_norm": 4.643251895904541, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8593909740447998, "num_tokens": 141611870.0, "step": 3711 }, { "epoch": 0.4722045541279735, "ewc_loss": 0.03523964062333107, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013266985479276627, "grad_norm": 4.65997838973999, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8673883676528931, "num_tokens": 141648539.0, "step": 3712 }, { "epoch": 0.47233176440656405, "ewc_loss": 0.035218849778175354, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013246193702798337, "grad_norm": 4.625898361206055, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8603993654251099, "num_tokens": 141688549.0, "step": 3713 }, { "epoch": 0.4724589746851546, "ewc_loss": 0.03524169325828552, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001326903875451535, "grad_norm": 4.666633129119873, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.865595281124115, "num_tokens": 141725294.0, "step": 3714 }, { "epoch": 0.47258618496374505, "ewc_loss": 0.03523798659443855, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013265330926515162, "grad_norm": 4.696251392364502, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8406929969787598, "num_tokens": 141756761.0, "step": 3715 }, { "epoch": 0.4727133952423356, "ewc_loss": 0.03527645394206047, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001330379891442135, "grad_norm": 4.6864471435546875, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8413798809051514, "num_tokens": 141790417.0, "step": 3716 }, { "epoch": 0.4728406055209261, "ewc_loss": 0.03523869067430496, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013266035239212215, "grad_norm": 4.598720073699951, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8678402900695801, "num_tokens": 141830749.0, "step": 3717 }, { "epoch": 0.4729678157995166, "ewc_loss": 0.035242706537246704, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013270050112623721, "grad_norm": 4.648723125457764, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8610506057739258, "num_tokens": 141868433.0, "step": 3718 }, { "epoch": 0.4730950260781071, "ewc_loss": 0.03525803983211517, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013285383465699852, "grad_norm": 4.593117713928223, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8744194507598877, "num_tokens": 141907140.0, "step": 3719 }, { "epoch": 0.47322223635669763, "ewc_loss": 0.03525650501251221, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013283846783451736, "grad_norm": 4.6255927085876465, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8708924055099487, "num_tokens": 141947436.0, "step": 3720 }, { "epoch": 0.4733494466352881, "ewc_loss": 0.03526070713996887, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013288049376569688, "grad_norm": 4.665297031402588, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8499729633331299, "num_tokens": 141986876.0, "step": 3721 }, { "epoch": 0.47347665691387864, "ewc_loss": 0.03530730679631233, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013334651885088533, "grad_norm": 4.649725914001465, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8604838848114014, "num_tokens": 142025806.0, "step": 3722 }, { "epoch": 0.47360386719246916, "ewc_loss": 0.03523959219455719, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013266937457956374, "grad_norm": 4.727879047393799, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8569797277450562, "num_tokens": 142057932.0, "step": 3723 }, { "epoch": 0.47373107747105964, "ewc_loss": 0.03531424701213837, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013341588783077896, "grad_norm": 4.667015075683594, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8615553379058838, "num_tokens": 142088880.0, "step": 3724 }, { "epoch": 0.47385828774965016, "ewc_loss": 0.03520806506276131, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001323540782323107, "grad_norm": 4.633060932159424, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.870957612991333, "num_tokens": 142124959.0, "step": 3725 }, { "epoch": 0.4739854980282407, "ewc_loss": 0.03525598347187042, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013283328735269606, "grad_norm": 4.678086280822754, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8442744016647339, "num_tokens": 142160667.0, "step": 3726 }, { "epoch": 0.47411270830683117, "ewc_loss": 0.03531358391046524, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013340928126126528, "grad_norm": 4.64983606338501, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.857183575630188, "num_tokens": 142205017.0, "step": 3727 }, { "epoch": 0.4742399185854217, "ewc_loss": 0.03525320440530777, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013280546409077942, "grad_norm": 4.671383857727051, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8631934523582458, "num_tokens": 142241479.0, "step": 3728 }, { "epoch": 0.4743671288640122, "ewc_loss": 0.03527569770812988, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001330303930444643, "grad_norm": 4.6463422775268555, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8583114743232727, "num_tokens": 142280881.0, "step": 3729 }, { "epoch": 0.4744943391426027, "ewc_loss": 0.0352601632475853, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001328750659013167, "grad_norm": 4.598825454711914, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8648874163627625, "num_tokens": 142328248.0, "step": 3730 }, { "epoch": 0.4746215494211932, "ewc_loss": 0.035243500024080276, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013270843192003667, "grad_norm": 4.717459201812744, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8418210744857788, "num_tokens": 142364872.0, "step": 3731 }, { "epoch": 0.47474875969978375, "ewc_loss": 0.035301074385643005, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.000133284178446047, "grad_norm": 4.65005350112915, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8397067189216614, "num_tokens": 142400494.0, "step": 3732 }, { "epoch": 0.4748759699783743, "ewc_loss": 0.03520955890417099, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013236903760116547, "grad_norm": 4.694332599639893, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.855488657951355, "num_tokens": 142433739.0, "step": 3733 }, { "epoch": 0.47500318025696475, "ewc_loss": 0.0352797769010067, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013307121116667986, "grad_norm": 4.593478679656982, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8659605979919434, "num_tokens": 142469777.0, "step": 3734 }, { "epoch": 0.4751303905355553, "ewc_loss": 0.03523264825344086, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013259991828817874, "grad_norm": 4.68380069732666, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8477331399917603, "num_tokens": 142510052.0, "step": 3735 }, { "epoch": 0.4752576008141458, "ewc_loss": 0.035331591963768005, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013358936121221632, "grad_norm": 4.619767665863037, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8458867073059082, "num_tokens": 142545689.0, "step": 3736 }, { "epoch": 0.4753848110927363, "ewc_loss": 0.03528014197945595, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013307484914548695, "grad_norm": 4.643082618713379, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8561025261878967, "num_tokens": 142585916.0, "step": 3737 }, { "epoch": 0.4755120213713268, "ewc_loss": 0.03630091995000839, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00013351699453778565, "grad_norm": 35.30643081665039, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8644866347312927, "num_tokens": 142623019.0, "step": 3738 }, { "epoch": 0.47563923164991734, "ewc_loss": 0.04770119488239288, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00025606469716876745, "grad_norm": 7.004817485809326, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.853479266166687, "num_tokens": 142660986.0, "step": 3739 }, { "epoch": 0.4757664419285078, "ewc_loss": 0.03802415728569031, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00015807358431629837, "grad_norm": 8.355890274047852, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8613971471786499, "num_tokens": 142702876.0, "step": 3740 }, { "epoch": 0.47589365220709834, "ewc_loss": 0.03729204088449478, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001507524575572461, "grad_norm": 5.182934761047363, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8450818061828613, "num_tokens": 142744082.0, "step": 3741 }, { "epoch": 0.47602086248568887, "ewc_loss": 0.03554418683052063, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001344946213066578, "grad_norm": 4.713720798492432, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.838748574256897, "num_tokens": 142777511.0, "step": 3742 }, { "epoch": 0.47614807276427934, "ewc_loss": 0.03578655421733856, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013691827189177275, "grad_norm": 4.9854416847229, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.851559042930603, "num_tokens": 142818882.0, "step": 3743 }, { "epoch": 0.47627528304286987, "ewc_loss": 0.03571019694209099, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013615470379590988, "grad_norm": 4.8307108879089355, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8608510494232178, "num_tokens": 142852771.0, "step": 3744 }, { "epoch": 0.4764024933214604, "ewc_loss": 0.0354466438293457, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013351916277315468, "grad_norm": 4.810564041137695, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8580139875411987, "num_tokens": 142888398.0, "step": 3745 }, { "epoch": 0.47652970360005087, "ewc_loss": 0.035450078547000885, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013355350529309362, "grad_norm": 4.755586624145508, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8730814456939697, "num_tokens": 142924789.0, "step": 3746 }, { "epoch": 0.4766569138786414, "ewc_loss": 0.035419948399066925, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013325222244020551, "grad_norm": 4.7777018547058105, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8641848564147949, "num_tokens": 142958843.0, "step": 3747 }, { "epoch": 0.4767841241572319, "ewc_loss": 0.03544103354215622, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013346307969186455, "grad_norm": 4.773158073425293, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8681433200836182, "num_tokens": 142996408.0, "step": 3748 }, { "epoch": 0.4769113344358224, "ewc_loss": 0.03526753932237625, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013294884411152452, "grad_norm": 4.664397716522217, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8492217659950256, "num_tokens": 143040031.0, "step": 3749 }, { "epoch": 0.4770385447144129, "ewc_loss": 0.03526085615158081, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013288200716488063, "grad_norm": 4.73633337020874, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8347146511077881, "num_tokens": 143081648.0, "step": 3750 }, { "epoch": 0.47716575499300345, "ewc_loss": 0.035316742956638336, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001334408443653956, "grad_norm": 4.736923694610596, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8352717161178589, "num_tokens": 143118913.0, "step": 3751 }, { "epoch": 0.4772929652715939, "ewc_loss": 0.03526618331670761, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013293528172653168, "grad_norm": 4.705090522766113, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8663152456283569, "num_tokens": 143153858.0, "step": 3752 }, { "epoch": 0.47742017555018446, "ewc_loss": 0.03524499014019966, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013272333308123052, "grad_norm": 4.673887252807617, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.868130624294281, "num_tokens": 143190977.0, "step": 3753 }, { "epoch": 0.477547385828775, "ewc_loss": 0.03527062386274338, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013297967961989343, "grad_norm": 4.668074607849121, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.848950982093811, "num_tokens": 143228009.0, "step": 3754 }, { "epoch": 0.47767459610736546, "ewc_loss": 0.035261474549770355, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013288820628076792, "grad_norm": 4.654983043670654, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8468765020370483, "num_tokens": 143269820.0, "step": 3755 }, { "epoch": 0.477801806385956, "ewc_loss": 0.035234928131103516, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001326227211393416, "grad_norm": 4.688366413116455, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8615272045135498, "num_tokens": 143303441.0, "step": 3756 }, { "epoch": 0.4779290166645465, "ewc_loss": 0.035316161811351776, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001334350381512195, "grad_norm": 4.693413257598877, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8496894836425781, "num_tokens": 143338748.0, "step": 3757 }, { "epoch": 0.478056226943137, "ewc_loss": 0.03527187556028366, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001329921797150746, "grad_norm": 4.6478495597839355, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8572389483451843, "num_tokens": 143373900.0, "step": 3758 }, { "epoch": 0.4781834372217275, "ewc_loss": 0.035293884575366974, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013321227743290365, "grad_norm": 4.720767974853516, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8517489433288574, "num_tokens": 143411190.0, "step": 3759 }, { "epoch": 0.47831064750031804, "ewc_loss": 0.03533956781029701, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013366910570766777, "grad_norm": 4.62885856628418, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8529696464538574, "num_tokens": 143449470.0, "step": 3760 }, { "epoch": 0.4784378577789085, "ewc_loss": 0.03526189550757408, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.0001328923972323537, "grad_norm": 4.677063465118408, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8630026578903198, "num_tokens": 143485868.0, "step": 3761 }, { "epoch": 0.47856506805749904, "ewc_loss": 0.03533878177404404, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013366124767344445, "grad_norm": 4.668281078338623, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8659613132476807, "num_tokens": 143520334.0, "step": 3762 }, { "epoch": 0.47869227833608957, "ewc_loss": 0.03531467914581299, "ewc_loss_diag": 2.193450927734375e-05, "ewc_loss_parallel": 0.00013342020974960178, "grad_norm": 4.624948024749756, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8500110507011414, "num_tokens": 143557752.0, "step": 3763 }, { "epoch": 0.47881948861468004, "ewc_loss": 0.0354732908308506, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013378565199673176, "grad_norm": 4.8936896324157715, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8567266464233398, "num_tokens": 143593385.0, "step": 3764 }, { "epoch": 0.4789466988932706, "ewc_loss": 0.035577043890953064, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013482317444868386, "grad_norm": 4.629090309143066, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8366029858589172, "num_tokens": 143631975.0, "step": 3765 }, { "epoch": 0.4790739091718611, "ewc_loss": 0.03539744019508362, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013302714796736836, "grad_norm": 4.746870994567871, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8402902483940125, "num_tokens": 143668382.0, "step": 3766 }, { "epoch": 0.4792011194504516, "ewc_loss": 0.03558380529284477, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013489079719875008, "grad_norm": 4.663863658905029, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8439406752586365, "num_tokens": 143706623.0, "step": 3767 }, { "epoch": 0.4793283297290421, "ewc_loss": 0.03557703644037247, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013360241428017616, "grad_norm": 4.642828464508057, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8614197969436646, "num_tokens": 143745664.0, "step": 3768 }, { "epoch": 0.47945554000763263, "ewc_loss": 0.03551686182618141, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013422136544249952, "grad_norm": 4.728883743286133, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8473410606384277, "num_tokens": 143780761.0, "step": 3769 }, { "epoch": 0.4795827502862231, "ewc_loss": 0.03557440638542175, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013479679182637483, "grad_norm": 4.598971366882324, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8616397380828857, "num_tokens": 143825298.0, "step": 3770 }, { "epoch": 0.47970996056481363, "ewc_loss": 0.035493843257427216, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001339911832474172, "grad_norm": 4.655198574066162, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.876483142375946, "num_tokens": 143859855.0, "step": 3771 }, { "epoch": 0.47983717084340416, "ewc_loss": 0.03558649867773056, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013491771824192256, "grad_norm": 4.672223091125488, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8701357245445251, "num_tokens": 143898553.0, "step": 3772 }, { "epoch": 0.47996438112199463, "ewc_loss": 0.035524945706129074, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001343022013315931, "grad_norm": 4.71050500869751, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8538506031036377, "num_tokens": 143935362.0, "step": 3773 }, { "epoch": 0.48009159140058516, "ewc_loss": 0.035663872957229614, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013447075616568327, "grad_norm": 4.585798740386963, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8578723073005676, "num_tokens": 143978692.0, "step": 3774 }, { "epoch": 0.4802188016791757, "ewc_loss": 0.03562279790639877, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001340600138064474, "grad_norm": 8.614763259887695, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8643166422843933, "num_tokens": 144020951.0, "step": 3775 }, { "epoch": 0.48034601195776616, "ewc_loss": 0.03981590270996094, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00017599103739485145, "grad_norm": 5.383357524871826, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8654494881629944, "num_tokens": 144058341.0, "step": 3776 }, { "epoch": 0.4804732222363567, "ewc_loss": 0.03485642746090889, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001263963058590889, "grad_norm": 4.4022955894470215, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8499628305435181, "num_tokens": 144098232.0, "step": 3777 }, { "epoch": 0.4806004325149472, "ewc_loss": 0.03620964661240578, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013992849562782794, "grad_norm": 4.978192329406738, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8521745800971985, "num_tokens": 144137537.0, "step": 3778 }, { "epoch": 0.4807276427935377, "ewc_loss": 0.03619812801480293, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013981331721879542, "grad_norm": 4.616754055023193, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.863054633140564, "num_tokens": 144178459.0, "step": 3779 }, { "epoch": 0.4808548530721282, "ewc_loss": 0.03555261343717575, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013457887689583004, "grad_norm": 4.807697296142578, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8515492677688599, "num_tokens": 144219257.0, "step": 3780 }, { "epoch": 0.48098206335071875, "ewc_loss": 0.035910774022340775, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013816046703141183, "grad_norm": 4.667572021484375, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8625788688659668, "num_tokens": 144262232.0, "step": 3781 }, { "epoch": 0.4811092736293093, "ewc_loss": 0.03551324084401131, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001341851457254961, "grad_norm": 4.733648300170898, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8348896503448486, "num_tokens": 144297144.0, "step": 3782 }, { "epoch": 0.48123648390789975, "ewc_loss": 0.035739876329898834, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001364514755550772, "grad_norm": 4.718111038208008, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8579415082931519, "num_tokens": 144336078.0, "step": 3783 }, { "epoch": 0.4813636941864903, "ewc_loss": 0.03557102382183075, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001347629731753841, "grad_norm": 4.671274662017822, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8494977355003357, "num_tokens": 144380162.0, "step": 3784 }, { "epoch": 0.4814909044650808, "ewc_loss": 0.03555995598435402, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013465230586007237, "grad_norm": 4.730012893676758, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.842679500579834, "num_tokens": 144416735.0, "step": 3785 }, { "epoch": 0.4816181147436713, "ewc_loss": 0.03560996800661087, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013515242608264089, "grad_norm": 4.696130275726318, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8466204404830933, "num_tokens": 144452780.0, "step": 3786 }, { "epoch": 0.4817453250222618, "ewc_loss": 0.03554020822048187, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013445483637042344, "grad_norm": 4.622954845428467, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8623920679092407, "num_tokens": 144494155.0, "step": 3787 }, { "epoch": 0.48187253530085233, "ewc_loss": 0.03551563620567322, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.000134209098177962, "grad_norm": 4.634859561920166, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8663421869277954, "num_tokens": 144533989.0, "step": 3788 }, { "epoch": 0.4819997455794428, "ewc_loss": 0.035532478243112564, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001343775074928999, "grad_norm": 4.674089431762695, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8529701828956604, "num_tokens": 144577409.0, "step": 3789 }, { "epoch": 0.48212695585803333, "ewc_loss": 0.03549264371395111, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013397916336543858, "grad_norm": 4.653536319732666, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8485702276229858, "num_tokens": 144618501.0, "step": 3790 }, { "epoch": 0.48225416613662386, "ewc_loss": 0.03548606485128403, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013391338870860636, "grad_norm": 4.628319263458252, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8639249801635742, "num_tokens": 144659454.0, "step": 3791 }, { "epoch": 0.48238137641521434, "ewc_loss": 0.03549995645880699, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013405230129137635, "grad_norm": 4.671679496765137, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8628746867179871, "num_tokens": 144695580.0, "step": 3792 }, { "epoch": 0.48250858669380486, "ewc_loss": 0.03551911562681198, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013424389180727303, "grad_norm": 4.672613143920898, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8392637372016907, "num_tokens": 144735586.0, "step": 3793 }, { "epoch": 0.4826357969723954, "ewc_loss": 0.035543881356716156, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013449156540445983, "grad_norm": 4.647959232330322, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8575998544692993, "num_tokens": 144776371.0, "step": 3794 }, { "epoch": 0.48276300725098586, "ewc_loss": 0.03552081808447838, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013426091754809022, "grad_norm": 4.693060398101807, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8506467342376709, "num_tokens": 144816724.0, "step": 3795 }, { "epoch": 0.4828902175295764, "ewc_loss": 0.03555756062269211, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013462832430377603, "grad_norm": 4.698482513427734, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.862774133682251, "num_tokens": 144854591.0, "step": 3796 }, { "epoch": 0.4830174278081669, "ewc_loss": 0.035535890609025955, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013441163173411041, "grad_norm": 4.657499313354492, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8581607937812805, "num_tokens": 144893694.0, "step": 3797 }, { "epoch": 0.4831446380867574, "ewc_loss": 0.03551088646054268, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001341615861747414, "grad_norm": 4.687244415283203, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8534766435623169, "num_tokens": 144932985.0, "step": 3798 }, { "epoch": 0.4832718483653479, "ewc_loss": 0.035552673041820526, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013457944442052394, "grad_norm": 4.68253755569458, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8575189709663391, "num_tokens": 144967823.0, "step": 3799 }, { "epoch": 0.48339905864393845, "ewc_loss": 0.03555900231003761, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013464275980368257, "grad_norm": 4.611295223236084, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8775736689567566, "num_tokens": 145007750.0, "step": 3800 }, { "epoch": 0.4835262689225289, "ewc_loss": 0.035622600466012955, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013405804929789156, "grad_norm": 4.68031644821167, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8511756658554077, "num_tokens": 145048596.0, "step": 3801 }, { "epoch": 0.48365347920111945, "ewc_loss": 0.035576097667217255, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013481371570378542, "grad_norm": 4.886598587036133, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8469898104667664, "num_tokens": 145080868.0, "step": 3802 }, { "epoch": 0.48378068947971, "ewc_loss": 0.03560476005077362, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013510033022612333, "grad_norm": 4.6787943840026855, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8651817440986633, "num_tokens": 145115865.0, "step": 3803 }, { "epoch": 0.48390789975830045, "ewc_loss": 0.0354594811797142, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013364755432121456, "grad_norm": 4.663006782531738, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8701321482658386, "num_tokens": 145151748.0, "step": 3804 }, { "epoch": 0.484035110036891, "ewc_loss": 0.035546042025089264, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013451317499857396, "grad_norm": 4.665055274963379, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8561820387840271, "num_tokens": 145187074.0, "step": 3805 }, { "epoch": 0.4841623203154815, "ewc_loss": 0.03552573174238205, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013431007391773164, "grad_norm": 4.683698654174805, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8749738931655884, "num_tokens": 145224855.0, "step": 3806 }, { "epoch": 0.484289530594072, "ewc_loss": 0.03557046502828598, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013475737068802118, "grad_norm": 4.641430854797363, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8659656643867493, "num_tokens": 145266538.0, "step": 3807 }, { "epoch": 0.4844167408726625, "ewc_loss": 0.035516928881406784, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013422203483060002, "grad_norm": 4.683931350708008, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8629271984100342, "num_tokens": 145303560.0, "step": 3808 }, { "epoch": 0.48454395115125304, "ewc_loss": 0.03556745499372482, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001347272627754137, "grad_norm": 4.6266655921936035, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8574323654174805, "num_tokens": 145345985.0, "step": 3809 }, { "epoch": 0.4846711614298435, "ewc_loss": 0.035549014806747437, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013454286090563983, "grad_norm": 4.7406415939331055, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.847150444984436, "num_tokens": 145382132.0, "step": 3810 }, { "epoch": 0.48479837170843404, "ewc_loss": 0.035628486424684525, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013533759920392185, "grad_norm": 4.6919684410095215, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8477582931518555, "num_tokens": 145418992.0, "step": 3811 }, { "epoch": 0.48492558198702457, "ewc_loss": 0.03568066656589508, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013463871437124908, "grad_norm": 4.738117218017578, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8554302453994751, "num_tokens": 145455892.0, "step": 3812 }, { "epoch": 0.48505279226561504, "ewc_loss": 0.03561900556087494, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013524279347620904, "grad_norm": 4.657336235046387, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8552306890487671, "num_tokens": 145497217.0, "step": 3813 }, { "epoch": 0.48518000254420557, "ewc_loss": 0.03553064167499542, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013435912842396647, "grad_norm": 4.663789749145508, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8640774488449097, "num_tokens": 145533822.0, "step": 3814 }, { "epoch": 0.4853072128227961, "ewc_loss": 0.03572880104184151, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013512004807125777, "grad_norm": 4.658050060272217, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8766639232635498, "num_tokens": 145572967.0, "step": 3815 }, { "epoch": 0.48543442310138657, "ewc_loss": 0.03555770590901375, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.0001346297940472141, "grad_norm": 4.659862041473389, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8716809749603271, "num_tokens": 145609436.0, "step": 3816 }, { "epoch": 0.4855616333799771, "ewc_loss": 0.03572298586368561, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013506189861800522, "grad_norm": 4.69486665725708, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8690189123153687, "num_tokens": 145645083.0, "step": 3817 }, { "epoch": 0.4856888436585676, "ewc_loss": 0.03571955859661102, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013502759975381196, "grad_norm": 4.839422225952148, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8393398523330688, "num_tokens": 145680249.0, "step": 3818 }, { "epoch": 0.4858160539371581, "ewc_loss": 0.035638950765132904, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013544224202632904, "grad_norm": 4.665334224700928, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8626338243484497, "num_tokens": 145718569.0, "step": 3819 }, { "epoch": 0.4859432642157486, "ewc_loss": 0.03562147542834282, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001340467861155048, "grad_norm": 4.746860027313232, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8415970802307129, "num_tokens": 145751868.0, "step": 3820 }, { "epoch": 0.48607047449433916, "ewc_loss": 0.03578904643654823, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013572249736171216, "grad_norm": 4.67517614364624, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8620715737342834, "num_tokens": 145787881.0, "step": 3821 }, { "epoch": 0.48619768477292963, "ewc_loss": 0.03576111048460007, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013422241318039596, "grad_norm": 5.217000484466553, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8633609414100647, "num_tokens": 145824010.0, "step": 3822 }, { "epoch": 0.48632489505152016, "ewc_loss": 0.03602222725749016, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001380543108098209, "grad_norm": 4.6514716148376465, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8644728660583496, "num_tokens": 145860981.0, "step": 3823 }, { "epoch": 0.4864521053301107, "ewc_loss": 0.03541196510195732, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013195167412050068, "grad_norm": 4.66606330871582, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8618980646133423, "num_tokens": 145900687.0, "step": 3824 }, { "epoch": 0.48657931560870116, "ewc_loss": 0.0357431024312973, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013526306429412216, "grad_norm": 4.7103352546691895, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8483867645263672, "num_tokens": 145938888.0, "step": 3825 }, { "epoch": 0.4867065258872917, "ewc_loss": 0.03562340885400772, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013406614016275853, "grad_norm": 4.60623836517334, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8538795113563538, "num_tokens": 145987060.0, "step": 3826 }, { "epoch": 0.4868337361658822, "ewc_loss": 0.035644154995679855, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001342735777143389, "grad_norm": 4.743894100189209, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8522056341171265, "num_tokens": 146026447.0, "step": 3827 }, { "epoch": 0.4869609464444727, "ewc_loss": 0.035716503858566284, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013499708438757807, "grad_norm": 4.63739013671875, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8594428300857544, "num_tokens": 146067099.0, "step": 3828 }, { "epoch": 0.4870881567230632, "ewc_loss": 0.03563304990530014, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013416254660114646, "grad_norm": 4.642524242401123, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8446545004844666, "num_tokens": 146110767.0, "step": 3829 }, { "epoch": 0.48721536700165374, "ewc_loss": 0.035711344331502914, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013494548329617828, "grad_norm": 4.7115159034729, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8497551083564758, "num_tokens": 146147603.0, "step": 3830 }, { "epoch": 0.48734257728024427, "ewc_loss": 0.035689372569322586, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013472576392814517, "grad_norm": 4.643779754638672, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8676967620849609, "num_tokens": 146188821.0, "step": 3831 }, { "epoch": 0.48746978755883474, "ewc_loss": 0.03569118306040764, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013474386651068926, "grad_norm": 4.647503852844238, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8544800281524658, "num_tokens": 146231886.0, "step": 3832 }, { "epoch": 0.48759699783742527, "ewc_loss": 0.03570793941617012, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013491141726262867, "grad_norm": 4.685758113861084, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8344014883041382, "num_tokens": 146271965.0, "step": 3833 }, { "epoch": 0.4877242081160158, "ewc_loss": 0.03571654111146927, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013499743363354355, "grad_norm": 4.69814395904541, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8444792032241821, "num_tokens": 146316686.0, "step": 3834 }, { "epoch": 0.4878514183946063, "ewc_loss": 0.03558126091957092, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013486534589901567, "grad_norm": 4.661557674407959, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8477121591567993, "num_tokens": 146355098.0, "step": 3835 }, { "epoch": 0.4879786286731968, "ewc_loss": 0.03569009527564049, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013473298167809844, "grad_norm": 4.698296546936035, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8627501726150513, "num_tokens": 146390666.0, "step": 3836 }, { "epoch": 0.48810583895178733, "ewc_loss": 0.03576013818383217, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013543342356570065, "grad_norm": 4.714594841003418, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8645839691162109, "num_tokens": 146426740.0, "step": 3837 }, { "epoch": 0.4882330492303778, "ewc_loss": 0.03569989651441574, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001348309888271615, "grad_norm": 4.697354793548584, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8480565547943115, "num_tokens": 146462668.0, "step": 3838 }, { "epoch": 0.48836025950896833, "ewc_loss": 0.03560873121023178, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013514004240278155, "grad_norm": 4.698675632476807, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8493840098381042, "num_tokens": 146500160.0, "step": 3839 }, { "epoch": 0.48848746978755886, "ewc_loss": 0.035719133913517, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001350233651464805, "grad_norm": 4.639877796173096, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8521441221237183, "num_tokens": 146542963.0, "step": 3840 }, { "epoch": 0.48861468006614933, "ewc_loss": 0.035718511790037155, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013501713692676276, "grad_norm": 4.672375679016113, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8633053302764893, "num_tokens": 146580649.0, "step": 3841 }, { "epoch": 0.48874189034473986, "ewc_loss": 0.035721853375434875, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013505056267604232, "grad_norm": 4.660923480987549, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8644848465919495, "num_tokens": 146620823.0, "step": 3842 }, { "epoch": 0.4888691006233304, "ewc_loss": 0.03573291748762131, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013516118633560836, "grad_norm": 4.653714656829834, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8677276372909546, "num_tokens": 146663700.0, "step": 3843 }, { "epoch": 0.48899631090192086, "ewc_loss": 0.03574288636445999, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001352608815068379, "grad_norm": 4.71360445022583, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8528356552124023, "num_tokens": 146698211.0, "step": 3844 }, { "epoch": 0.4891235211805114, "ewc_loss": 0.03573412075638771, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013517323532141745, "grad_norm": 4.630281448364258, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8556820750236511, "num_tokens": 146738380.0, "step": 3845 }, { "epoch": 0.4892507314591019, "ewc_loss": 0.035629987716674805, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013535263133235276, "grad_norm": 4.691646575927734, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8602390885353088, "num_tokens": 146775413.0, "step": 3846 }, { "epoch": 0.4893779417376924, "ewc_loss": 0.0356445387005806, "ewc_loss_diag": 2.205371856689453e-05, "ewc_loss_parallel": 0.00013549812138080597, "grad_norm": 4.655483722686768, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.859443724155426, "num_tokens": 146814774.0, "step": 3847 }, { "epoch": 0.4895051520162829, "ewc_loss": 0.03571026772260666, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013493470032699406, "grad_norm": 4.640425682067871, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8672529458999634, "num_tokens": 146853957.0, "step": 3848 }, { "epoch": 0.48963236229487345, "ewc_loss": 0.03574736416339874, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013530567230191082, "grad_norm": 4.7360334396362305, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8496352434158325, "num_tokens": 146889060.0, "step": 3849 }, { "epoch": 0.4897595725734639, "ewc_loss": 0.03579247370362282, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001357567816739902, "grad_norm": 4.6417365074157715, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8669739961624146, "num_tokens": 146929849.0, "step": 3850 }, { "epoch": 0.48988678285205445, "ewc_loss": 0.035716574639081955, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013499778287950903, "grad_norm": 4.680746555328369, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8585716485977173, "num_tokens": 146968376.0, "step": 3851 }, { "epoch": 0.490013993130645, "ewc_loss": 0.0358041487634182, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013587353168986738, "grad_norm": 4.731081962585449, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8420034646987915, "num_tokens": 147002871.0, "step": 3852 }, { "epoch": 0.49014120340923545, "ewc_loss": 0.03579439967870712, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013577604840975255, "grad_norm": 4.679408073425293, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8527017831802368, "num_tokens": 147039412.0, "step": 3853 }, { "epoch": 0.490268413687826, "ewc_loss": 0.03589823096990585, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013559364015236497, "grad_norm": 4.632381439208984, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8610032796859741, "num_tokens": 147080307.0, "step": 3854 }, { "epoch": 0.4903956239664165, "ewc_loss": 0.03590938448905945, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013570515147875994, "grad_norm": 4.676955223083496, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8643895983695984, "num_tokens": 147120131.0, "step": 3855 }, { "epoch": 0.490522834245007, "ewc_loss": 0.03595976531505585, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013620899699162692, "grad_norm": 4.727487087249756, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8515439033508301, "num_tokens": 147155760.0, "step": 3856 }, { "epoch": 0.4906500445235975, "ewc_loss": 0.035958755761384964, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001361988834105432, "grad_norm": 4.650071144104004, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8615732789039612, "num_tokens": 147195406.0, "step": 3857 }, { "epoch": 0.49077725480218803, "ewc_loss": 0.03595972806215286, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013620858953800052, "grad_norm": 4.715580940246582, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8705976009368896, "num_tokens": 147235688.0, "step": 3858 }, { "epoch": 0.4909044650807785, "ewc_loss": 0.03598480671644211, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001364593772450462, "grad_norm": 4.688686847686768, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8652534484863281, "num_tokens": 147271575.0, "step": 3859 }, { "epoch": 0.49103167535936904, "ewc_loss": 0.03595729172229767, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013618424418382347, "grad_norm": 4.93937873840332, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8540734052658081, "num_tokens": 147315473.0, "step": 3860 }, { "epoch": 0.49115888563795956, "ewc_loss": 0.03603539243340492, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013696524547412992, "grad_norm": 4.60354471206665, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8614962697029114, "num_tokens": 147359058.0, "step": 3861 }, { "epoch": 0.49128609591655004, "ewc_loss": 0.03580880165100098, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001346993667539209, "grad_norm": 4.731956958770752, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8620420694351196, "num_tokens": 147394443.0, "step": 3862 }, { "epoch": 0.49141330619514056, "ewc_loss": 0.03591220825910568, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013695409870706499, "grad_norm": 4.65990686416626, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.853107750415802, "num_tokens": 147436225.0, "step": 3863 }, { "epoch": 0.4915405164737311, "ewc_loss": 0.03576970472931862, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013552908785641193, "grad_norm": 4.7067036628723145, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8683741092681885, "num_tokens": 147473449.0, "step": 3864 }, { "epoch": 0.49166772675232157, "ewc_loss": 0.03585473448038101, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013637938536703587, "grad_norm": 4.695034027099609, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8557425737380981, "num_tokens": 147510330.0, "step": 3865 }, { "epoch": 0.4917949370309121, "ewc_loss": 0.035805560648441315, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013588763249572366, "grad_norm": 4.683058738708496, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8691685795783997, "num_tokens": 147544683.0, "step": 3866 }, { "epoch": 0.4919221473095026, "ewc_loss": 0.03585778921842575, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013640991528518498, "grad_norm": 4.663841724395752, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8754954934120178, "num_tokens": 147589330.0, "step": 3867 }, { "epoch": 0.4920493575880931, "ewc_loss": 0.03583144396543503, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013614646741189063, "grad_norm": 4.723178386688232, "learning_rate": 1e-06, "loss": 0.5638, "mean_token_accuracy": 0.8273034691810608, "num_tokens": 147626864.0, "step": 3868 }, { "epoch": 0.4921765678666836, "ewc_loss": 0.03587908297777176, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013662288256455213, "grad_norm": 4.699314117431641, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.839436411857605, "num_tokens": 147672006.0, "step": 3869 }, { "epoch": 0.49230377814527415, "ewc_loss": 0.03583477810025215, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013617980584967881, "grad_norm": 4.660566329956055, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8479210138320923, "num_tokens": 147717388.0, "step": 3870 }, { "epoch": 0.4924309884238646, "ewc_loss": 0.03582444787025452, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001360764872515574, "grad_norm": 4.693161964416504, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8464908003807068, "num_tokens": 147759827.0, "step": 3871 }, { "epoch": 0.49255819870245515, "ewc_loss": 0.03583439812064171, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001361760077998042, "grad_norm": 4.79276180267334, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8385355472564697, "num_tokens": 147793742.0, "step": 3872 }, { "epoch": 0.4926854089810457, "ewc_loss": 0.03588040918111801, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013663612480740994, "grad_norm": 4.751001834869385, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8674736022949219, "num_tokens": 147829881.0, "step": 3873 }, { "epoch": 0.49281261925963615, "ewc_loss": 0.03581593930721283, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001359914313070476, "grad_norm": 4.6870436668396, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8427361249923706, "num_tokens": 147875051.0, "step": 3874 }, { "epoch": 0.4929398295382267, "ewc_loss": 0.03591633215546608, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013577465142589062, "grad_norm": 4.7008795738220215, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8507024049758911, "num_tokens": 147912786.0, "step": 3875 }, { "epoch": 0.4930670398168172, "ewc_loss": 0.03584597259759903, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013629176828544587, "grad_norm": 4.6872053146362305, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8360581398010254, "num_tokens": 147954981.0, "step": 3876 }, { "epoch": 0.4931942500954077, "ewc_loss": 0.035930532962083817, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013591664901468903, "grad_norm": 4.739049434661865, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8497109413146973, "num_tokens": 147992179.0, "step": 3877 }, { "epoch": 0.4933214603739982, "ewc_loss": 0.035867027938365936, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001365023199468851, "grad_norm": 4.707309722900391, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8621450066566467, "num_tokens": 148029866.0, "step": 3878 }, { "epoch": 0.49344867065258874, "ewc_loss": 0.03582674264907837, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.000136099464725703, "grad_norm": 4.712989807128906, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8569525480270386, "num_tokens": 148070057.0, "step": 3879 }, { "epoch": 0.4935758809311792, "ewc_loss": 0.03599829971790314, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013659433170687407, "grad_norm": 4.7149858474731445, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.861996054649353, "num_tokens": 148108461.0, "step": 3880 }, { "epoch": 0.49370309120976974, "ewc_loss": 0.03604283928871155, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013703970762435347, "grad_norm": 4.739455223083496, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8504596948623657, "num_tokens": 148147609.0, "step": 3881 }, { "epoch": 0.49383030148836027, "ewc_loss": 0.03601418435573578, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013675315130967647, "grad_norm": 4.662198543548584, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8514758348464966, "num_tokens": 148190475.0, "step": 3882 }, { "epoch": 0.4939575117669508, "ewc_loss": 0.03597337007522583, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013634504284709692, "grad_norm": 4.792543888092041, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8696054220199585, "num_tokens": 148221187.0, "step": 3883 }, { "epoch": 0.49408472204554127, "ewc_loss": 0.036116741597652435, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013777875574305654, "grad_norm": 4.695032596588135, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8653940558433533, "num_tokens": 148260916.0, "step": 3884 }, { "epoch": 0.4942119323241318, "ewc_loss": 0.03597880154848099, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013639935059472919, "grad_norm": 4.75547981262207, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.863150417804718, "num_tokens": 148295591.0, "step": 3885 }, { "epoch": 0.4943391426027223, "ewc_loss": 0.036059457808732986, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013720591960009187, "grad_norm": 4.710695743560791, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8519220352172852, "num_tokens": 148333445.0, "step": 3886 }, { "epoch": 0.4944663528813128, "ewc_loss": 0.03609584644436836, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001363490882795304, "grad_norm": 4.765958786010742, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8667245507240295, "num_tokens": 148367951.0, "step": 3887 }, { "epoch": 0.4945935631599033, "ewc_loss": 0.03604981303215027, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013710945495404303, "grad_norm": 4.721012592315674, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8490975499153137, "num_tokens": 148408491.0, "step": 3888 }, { "epoch": 0.49472077343849385, "ewc_loss": 0.036001987755298615, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013663120626006275, "grad_norm": 4.793549537658691, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8576251268386841, "num_tokens": 148443883.0, "step": 3889 }, { "epoch": 0.4948479837170843, "ewc_loss": 0.03605012595653534, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013711258361581713, "grad_norm": 4.750637054443359, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8591040372848511, "num_tokens": 148480511.0, "step": 3890 }, { "epoch": 0.49497519399567486, "ewc_loss": 0.03584561496973038, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.00013628820306621492, "grad_norm": 4.761783123016357, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.851675808429718, "num_tokens": 148514759.0, "step": 3891 }, { "epoch": 0.4951024042742654, "ewc_loss": 0.03590122610330582, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001368442754028365, "grad_norm": 4.711551189422607, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8589868545532227, "num_tokens": 148553209.0, "step": 3892 }, { "epoch": 0.49522961455285586, "ewc_loss": 0.035962313413619995, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013623446284327656, "grad_norm": 4.7451019287109375, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.848495364189148, "num_tokens": 148590387.0, "step": 3893 }, { "epoch": 0.4953568248314464, "ewc_loss": 0.03590552508831024, "ewc_loss_diag": 2.2172927856445312e-05, "ewc_loss_parallel": 0.0001368872617604211, "grad_norm": 4.760246276855469, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8558691143989563, "num_tokens": 148625201.0, "step": 3894 }, { "epoch": 0.4954840351100369, "ewc_loss": 0.03600340709090233, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001366453943774104, "grad_norm": 4.79771089553833, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8603695631027222, "num_tokens": 148665989.0, "step": 3895 }, { "epoch": 0.4956112453886274, "ewc_loss": 0.03605160489678383, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013712736836168915, "grad_norm": 4.758603572845459, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.860647976398468, "num_tokens": 148704839.0, "step": 3896 }, { "epoch": 0.4957384556672179, "ewc_loss": 0.035945214331150055, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013606349239125848, "grad_norm": 4.736168384552002, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.867721438407898, "num_tokens": 148740523.0, "step": 3897 }, { "epoch": 0.49586566594580844, "ewc_loss": 0.03599202632904053, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013653159840032458, "grad_norm": 4.734313011169434, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8716660737991333, "num_tokens": 148779103.0, "step": 3898 }, { "epoch": 0.4959928762243989, "ewc_loss": 0.03596433252096176, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001362546463496983, "grad_norm": 4.7282633781433105, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8682088255882263, "num_tokens": 148814659.0, "step": 3899 }, { "epoch": 0.49612008650298944, "ewc_loss": 0.03595675155520439, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013617884542327374, "grad_norm": 4.664227485656738, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.877755880355835, "num_tokens": 148856864.0, "step": 3900 }, { "epoch": 0.49624729678157997, "ewc_loss": 0.03592881187796593, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001358994486508891, "grad_norm": 4.706028461456299, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.863688051700592, "num_tokens": 148890309.0, "step": 3901 }, { "epoch": 0.49637450706017044, "ewc_loss": 0.0360250361263752, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013686167949344963, "grad_norm": 4.7539849281311035, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8423870801925659, "num_tokens": 148926662.0, "step": 3902 }, { "epoch": 0.496501717338761, "ewc_loss": 0.03603208437561989, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013693218352273107, "grad_norm": 4.777534008026123, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8446311354637146, "num_tokens": 148963032.0, "step": 3903 }, { "epoch": 0.4966289276173515, "ewc_loss": 0.036141250282526016, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001368031371384859, "grad_norm": 4.762611389160156, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8641120195388794, "num_tokens": 148996684.0, "step": 3904 }, { "epoch": 0.496756137895942, "ewc_loss": 0.036027081310749054, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013688215403817594, "grad_norm": 4.704874038696289, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8575951457023621, "num_tokens": 149032735.0, "step": 3905 }, { "epoch": 0.4968833481745325, "ewc_loss": 0.03601478040218353, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.0001367591175949201, "grad_norm": 4.737136363983154, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.859338104724884, "num_tokens": 149069415.0, "step": 3906 }, { "epoch": 0.49701055845312303, "ewc_loss": 0.0361749641597271, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013714026135858148, "grad_norm": 4.697633266448975, "learning_rate": 1e-06, "loss": 0.5109, "mean_token_accuracy": 0.8431662321090698, "num_tokens": 149115042.0, "step": 3907 }, { "epoch": 0.4971377687317135, "ewc_loss": 0.03623703122138977, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013654024223797023, "grad_norm": 4.738948345184326, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8664019703865051, "num_tokens": 149149767.0, "step": 3908 }, { "epoch": 0.49726497901030403, "ewc_loss": 0.036261703819036484, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013678695540875196, "grad_norm": 4.679327964782715, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.851110577583313, "num_tokens": 149193000.0, "step": 3909 }, { "epoch": 0.49739218928889456, "ewc_loss": 0.03627948462963104, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013696475070901215, "grad_norm": 4.758236408233643, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8547151684761047, "num_tokens": 149226963.0, "step": 3910 }, { "epoch": 0.49751939956748503, "ewc_loss": 0.03606574982404709, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013726881297770888, "grad_norm": 4.706742286682129, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.852969765663147, "num_tokens": 149268139.0, "step": 3911 }, { "epoch": 0.49764660984607556, "ewc_loss": 0.0362735278904438, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013690520427189767, "grad_norm": 4.781523704528809, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8649421334266663, "num_tokens": 149305882.0, "step": 3912 }, { "epoch": 0.4977738201246661, "ewc_loss": 0.036328524351119995, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013745516480412334, "grad_norm": 4.701638698577881, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8535811901092529, "num_tokens": 149344119.0, "step": 3913 }, { "epoch": 0.49790103040325656, "ewc_loss": 0.03597802296280861, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.000136391565320082, "grad_norm": 4.701181411743164, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8479889631271362, "num_tokens": 149384102.0, "step": 3914 }, { "epoch": 0.4980282406818471, "ewc_loss": 0.03633301705121994, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013750011567026377, "grad_norm": 4.681429386138916, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8543264865875244, "num_tokens": 149425677.0, "step": 3915 }, { "epoch": 0.4981554509604376, "ewc_loss": 0.036254920065402985, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013671914348378778, "grad_norm": 4.758987903594971, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8536614775657654, "num_tokens": 149463275.0, "step": 3916 }, { "epoch": 0.4982826612390281, "ewc_loss": 0.03636258468031883, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013779578148387372, "grad_norm": 4.706231117248535, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8724733591079712, "num_tokens": 149503950.0, "step": 3917 }, { "epoch": 0.4984098715176186, "ewc_loss": 0.03624714910984039, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013664142170455307, "grad_norm": 4.711464881896973, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8488264083862305, "num_tokens": 149541192.0, "step": 3918 }, { "epoch": 0.49853708179620915, "ewc_loss": 0.03607514128088951, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013736274559050798, "grad_norm": 4.691208839416504, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8548437356948853, "num_tokens": 149581784.0, "step": 3919 }, { "epoch": 0.4986642920747996, "ewc_loss": 0.03631465509533882, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013731647050008178, "grad_norm": 4.706859588623047, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8630490303039551, "num_tokens": 149619312.0, "step": 3920 }, { "epoch": 0.49879150235339015, "ewc_loss": 0.036053046584129333, "ewc_loss_diag": 2.2292137145996094e-05, "ewc_loss_parallel": 0.00013714177475776523, "grad_norm": 4.7214035987854, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8545491099357605, "num_tokens": 149657443.0, "step": 3921 }, { "epoch": 0.4989187126319807, "ewc_loss": 0.036331452429294586, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001374844287056476, "grad_norm": 4.796185493469238, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8695065975189209, "num_tokens": 149694992.0, "step": 3922 }, { "epoch": 0.49904592291057115, "ewc_loss": 0.036367569118738174, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013784560724161565, "grad_norm": 4.777220726013184, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8529970645904541, "num_tokens": 149729554.0, "step": 3923 }, { "epoch": 0.4991731331891617, "ewc_loss": 0.036296360194683075, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013713350926991552, "grad_norm": 4.702359676361084, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8633872270584106, "num_tokens": 149770051.0, "step": 3924 }, { "epoch": 0.4993003434677522, "ewc_loss": 0.03627575933933258, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013692751235794276, "grad_norm": 4.774340629577637, "learning_rate": 1e-06, "loss": 0.5597, "mean_token_accuracy": 0.8264610767364502, "num_tokens": 149806724.0, "step": 3925 }, { "epoch": 0.4994275537463427, "ewc_loss": 0.036368753761053085, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013785746705252677, "grad_norm": 4.692963123321533, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8488287925720215, "num_tokens": 149849874.0, "step": 3926 }, { "epoch": 0.4995547640249332, "ewc_loss": 0.0362926609814167, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013709653285332024, "grad_norm": 4.7149224281311035, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8571981191635132, "num_tokens": 149888076.0, "step": 3927 }, { "epoch": 0.49968197430352373, "ewc_loss": 0.03635963797569275, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013776631385553628, "grad_norm": 4.735969543457031, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8579471111297607, "num_tokens": 149923750.0, "step": 3928 }, { "epoch": 0.4998091845821142, "ewc_loss": 0.03631671145558357, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013733703235629946, "grad_norm": 4.787416458129883, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.845481276512146, "num_tokens": 149960657.0, "step": 3929 }, { "epoch": 0.49993639486070474, "ewc_loss": 0.03622562065720558, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013764682807959616, "grad_norm": 4.70649528503418, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8720781803131104, "num_tokens": 149995983.0, "step": 3930 }, { "epoch": 0.5000636051392953, "ewc_loss": 0.03616854548454285, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013707607286050916, "grad_norm": 4.667938232421875, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8358895778656006, "num_tokens": 150037508.0, "step": 3931 }, { "epoch": 0.5001908154178858, "ewc_loss": 0.03619812801480293, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013737191329710186, "grad_norm": 4.703036785125732, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8529900312423706, "num_tokens": 150077806.0, "step": 3932 }, { "epoch": 0.5003180256964763, "ewc_loss": 0.03623998910188675, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013779052824247628, "grad_norm": 4.7080769538879395, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8548837900161743, "num_tokens": 150117589.0, "step": 3933 }, { "epoch": 0.5004452359750667, "ewc_loss": 0.03621870279312134, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013757763372268528, "grad_norm": 4.689028263092041, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8505693674087524, "num_tokens": 150161579.0, "step": 3934 }, { "epoch": 0.5005724462536573, "ewc_loss": 0.036323100328445435, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013740094436798245, "grad_norm": 4.746654510498047, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8437544107437134, "num_tokens": 150208018.0, "step": 3935 }, { "epoch": 0.5006996565322478, "ewc_loss": 0.03635377064347267, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001377076405333355, "grad_norm": 4.684732437133789, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.857932448387146, "num_tokens": 150250164.0, "step": 3936 }, { "epoch": 0.5008268668108383, "ewc_loss": 0.03618112578988075, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001372018741676584, "grad_norm": 4.66621208190918, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8759868144989014, "num_tokens": 150289972.0, "step": 3937 }, { "epoch": 0.5009540770894289, "ewc_loss": 0.03622382506728172, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013762887101620436, "grad_norm": 4.711897850036621, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8549308776855469, "num_tokens": 150333656.0, "step": 3938 }, { "epoch": 0.5010812873680194, "ewc_loss": 0.03619327023625374, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013732333900406957, "grad_norm": 4.651029586791992, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8700851202011108, "num_tokens": 150374086.0, "step": 3939 }, { "epoch": 0.5012084976466098, "ewc_loss": 0.036221735179424286, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013760797446593642, "grad_norm": 4.776505947113037, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8603825569152832, "num_tokens": 150407472.0, "step": 3940 }, { "epoch": 0.5013357079252003, "ewc_loss": 0.03625181317329407, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013790874800179154, "grad_norm": 4.709995746612549, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8624441027641296, "num_tokens": 150444560.0, "step": 3941 }, { "epoch": 0.5014629182037909, "ewc_loss": 0.03616289049386978, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013701953866984695, "grad_norm": 4.75947904586792, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8611324429512024, "num_tokens": 150483621.0, "step": 3942 }, { "epoch": 0.5015901284823814, "ewc_loss": 0.03622123599052429, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013760296860709786, "grad_norm": 4.7164411544799805, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8577598929405212, "num_tokens": 150520414.0, "step": 3943 }, { "epoch": 0.5017173387609719, "ewc_loss": 0.036159712821245193, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001369877572869882, "grad_norm": 4.749359607696533, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.848595380783081, "num_tokens": 150556589.0, "step": 3944 }, { "epoch": 0.5018445490395624, "ewc_loss": 0.03622601181268692, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013765075709670782, "grad_norm": 4.724708080291748, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8550376892089844, "num_tokens": 150592393.0, "step": 3945 }, { "epoch": 0.5019717593181529, "ewc_loss": 0.036168310791254044, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001370737300021574, "grad_norm": 4.7254133224487305, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8436139822006226, "num_tokens": 150630457.0, "step": 3946 }, { "epoch": 0.5020989695967434, "ewc_loss": 0.03619934618473053, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013738406414631754, "grad_norm": 4.765044212341309, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8645629286766052, "num_tokens": 150659482.0, "step": 3947 }, { "epoch": 0.5022261798753339, "ewc_loss": 0.03622763231396675, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.00013766693882644176, "grad_norm": 4.716863632202148, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8569536209106445, "num_tokens": 150700920.0, "step": 3948 }, { "epoch": 0.5023533901539244, "ewc_loss": 0.03633158281445503, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013748573837801814, "grad_norm": 4.6861796379089355, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8677159547805786, "num_tokens": 150736393.0, "step": 3949 }, { "epoch": 0.502480600432515, "ewc_loss": 0.0363747701048851, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013791762467008084, "grad_norm": 4.692767143249512, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8541641235351562, "num_tokens": 150783848.0, "step": 3950 }, { "epoch": 0.5026078107111055, "ewc_loss": 0.03624657914042473, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001378564047627151, "grad_norm": 4.704619884490967, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8537458181381226, "num_tokens": 150824179.0, "step": 3951 }, { "epoch": 0.5027350209896959, "ewc_loss": 0.036285098642110825, "ewc_loss_diag": 2.2411346435546875e-05, "ewc_loss_parallel": 0.0001382416085107252, "grad_norm": 4.725375175476074, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8571735620498657, "num_tokens": 150862775.0, "step": 3952 }, { "epoch": 0.5028622312682864, "ewc_loss": 0.03642807900905609, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013845071953255683, "grad_norm": 4.737590789794922, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8562278151512146, "num_tokens": 150900005.0, "step": 3953 }, { "epoch": 0.502989441546877, "ewc_loss": 0.03637334331870079, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001379033492412418, "grad_norm": 4.7402663230896, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8537781834602356, "num_tokens": 150936148.0, "step": 3954 }, { "epoch": 0.5031166518254675, "ewc_loss": 0.03641389310359955, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013830883835908026, "grad_norm": 4.753256320953369, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8540964126586914, "num_tokens": 150971661.0, "step": 3955 }, { "epoch": 0.503243862104058, "ewc_loss": 0.03644724190235138, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013864232460036874, "grad_norm": 4.749711990356445, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8679684996604919, "num_tokens": 151009521.0, "step": 3956 }, { "epoch": 0.5033710723826486, "ewc_loss": 0.03640870749950409, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013825697533320636, "grad_norm": 4.722040176391602, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8536350727081299, "num_tokens": 151053641.0, "step": 3957 }, { "epoch": 0.5034982826612391, "ewc_loss": 0.0363747738301754, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013791766832582653, "grad_norm": 4.723589897155762, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8612865209579468, "num_tokens": 151094476.0, "step": 3958 }, { "epoch": 0.5036254929398295, "ewc_loss": 0.03640132397413254, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013818315346725285, "grad_norm": 4.8638811111450195, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8510064482688904, "num_tokens": 151126381.0, "step": 3959 }, { "epoch": 0.50375270321842, "ewc_loss": 0.03644270449876785, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013859698083251715, "grad_norm": 4.798681259155273, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8613724708557129, "num_tokens": 151156875.0, "step": 3960 }, { "epoch": 0.5038799134970106, "ewc_loss": 0.03634710609912872, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013764097820967436, "grad_norm": 4.710742473602295, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8529369235038757, "num_tokens": 151194394.0, "step": 3961 }, { "epoch": 0.5040071237756011, "ewc_loss": 0.036377910524606705, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013794902770314366, "grad_norm": 4.747313022613525, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8428407311439514, "num_tokens": 151234472.0, "step": 3962 }, { "epoch": 0.5041343340541916, "ewc_loss": 0.03638515621423721, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001380214816890657, "grad_norm": 4.796730995178223, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8528054356575012, "num_tokens": 151270426.0, "step": 3963 }, { "epoch": 0.5042615443327821, "ewc_loss": 0.036436066031455994, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013853059499524534, "grad_norm": 4.814531326293945, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.855277419090271, "num_tokens": 151305793.0, "step": 3964 }, { "epoch": 0.5043887546113726, "ewc_loss": 0.036404550075531006, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013821542961522937, "grad_norm": 4.76804256439209, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8448455333709717, "num_tokens": 151345116.0, "step": 3965 }, { "epoch": 0.5045159648899631, "ewc_loss": 0.03651633858680725, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001381125912303105, "grad_norm": 4.772818565368652, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8624992370605469, "num_tokens": 151381901.0, "step": 3966 }, { "epoch": 0.5046431751685536, "ewc_loss": 0.03669428080320358, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013867131201550364, "grad_norm": 4.744565963745117, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8587727546691895, "num_tokens": 151426884.0, "step": 3967 }, { "epoch": 0.5047703854471441, "ewc_loss": 0.03662188723683357, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013794738333672285, "grad_norm": 4.763599395751953, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8498462438583374, "num_tokens": 151467596.0, "step": 3968 }, { "epoch": 0.5048975957257347, "ewc_loss": 0.03664781153202057, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013820661115460098, "grad_norm": 4.728846549987793, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8765549063682556, "num_tokens": 151502131.0, "step": 3969 }, { "epoch": 0.5050248060043252, "ewc_loss": 0.03662869334220886, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001380154280923307, "grad_norm": 4.815982341766357, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8528238534927368, "num_tokens": 151538932.0, "step": 3970 }, { "epoch": 0.5051520162829156, "ewc_loss": 0.03643007203936577, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013847064110450447, "grad_norm": 4.723176956176758, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8556587100028992, "num_tokens": 151577839.0, "step": 3971 }, { "epoch": 0.5052792265615061, "ewc_loss": 0.03633894398808479, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013755937106907368, "grad_norm": 4.759636402130127, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8573863506317139, "num_tokens": 151618984.0, "step": 3972 }, { "epoch": 0.5054064368400967, "ewc_loss": 0.03637486696243286, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013791861420031637, "grad_norm": 4.71330451965332, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8573941588401794, "num_tokens": 151658392.0, "step": 3973 }, { "epoch": 0.5055336471186872, "ewc_loss": 0.0364832729101181, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013778195716440678, "grad_norm": 4.812311172485352, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.866234302520752, "num_tokens": 151689781.0, "step": 3974 }, { "epoch": 0.5056608573972777, "ewc_loss": 0.03657202422618866, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013866944937035441, "grad_norm": 4.777403831481934, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8421798944473267, "num_tokens": 151729612.0, "step": 3975 }, { "epoch": 0.5057880676758683, "ewc_loss": 0.03648102283477783, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013775945990346372, "grad_norm": 4.7379865646362305, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8442514538764954, "num_tokens": 151769270.0, "step": 3976 }, { "epoch": 0.5059152779544587, "ewc_loss": 0.03639105334877968, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.00013808046060148627, "grad_norm": 4.846035957336426, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8389742374420166, "num_tokens": 151803936.0, "step": 3977 }, { "epoch": 0.5060424882330492, "ewc_loss": 0.03642437607049942, "ewc_loss_diag": 2.2530555725097656e-05, "ewc_loss_parallel": 0.0001384136703563854, "grad_norm": 4.742918968200684, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.844798743724823, "num_tokens": 151840429.0, "step": 3978 }, { "epoch": 0.5061696985116397, "ewc_loss": 0.036462876945734024, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.000137577997520566, "grad_norm": 4.732621669769287, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8588749170303345, "num_tokens": 151883296.0, "step": 3979 }, { "epoch": 0.5062969087902303, "ewc_loss": 0.03650867938995361, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013803601905237883, "grad_norm": 4.742660045623779, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8720195293426514, "num_tokens": 151917658.0, "step": 3980 }, { "epoch": 0.5064241190688208, "ewc_loss": 0.03662018105387688, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013793031394015998, "grad_norm": 4.708963871002197, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8564325571060181, "num_tokens": 151958822.0, "step": 3981 }, { "epoch": 0.5065513293474113, "ewc_loss": 0.036515139043331146, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001381005859002471, "grad_norm": 4.69778299331665, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.852109432220459, "num_tokens": 151994964.0, "step": 3982 }, { "epoch": 0.5066785396260017, "ewc_loss": 0.03655156493186951, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001384648639941588, "grad_norm": 4.739372730255127, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8518856763839722, "num_tokens": 152037013.0, "step": 3983 }, { "epoch": 0.5068057499045923, "ewc_loss": 0.036558400839567184, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013853321433998644, "grad_norm": 4.722846984863281, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8526688814163208, "num_tokens": 152074821.0, "step": 3984 }, { "epoch": 0.5069329601831828, "ewc_loss": 0.03655732050538063, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013852243137080222, "grad_norm": 4.735976219177246, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8490302562713623, "num_tokens": 152116667.0, "step": 3985 }, { "epoch": 0.5070601704617733, "ewc_loss": 0.0365651398897171, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013860061881132424, "grad_norm": 4.7384796142578125, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8611478805541992, "num_tokens": 152152714.0, "step": 3986 }, { "epoch": 0.5071873807403638, "ewc_loss": 0.03666642680764198, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001383927883580327, "grad_norm": 4.754774570465088, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.842758297920227, "num_tokens": 152190230.0, "step": 3987 }, { "epoch": 0.5073145910189544, "ewc_loss": 0.036586944013834, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.0001388186647091061, "grad_norm": 4.762392997741699, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8563807606697083, "num_tokens": 152228887.0, "step": 3988 }, { "epoch": 0.5074418012975448, "ewc_loss": 0.03659024089574814, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013885163934901357, "grad_norm": 4.724636554718018, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8647779226303101, "num_tokens": 152271863.0, "step": 3989 }, { "epoch": 0.5075690115761353, "ewc_loss": 0.036676712334156036, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013849562674295157, "grad_norm": 4.770873069763184, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8517140746116638, "num_tokens": 152310765.0, "step": 3990 }, { "epoch": 0.5076962218547258, "ewc_loss": 0.036604057997465134, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013898979523219168, "grad_norm": 4.799642562866211, "learning_rate": 1e-06, "loss": 0.3871, "mean_token_accuracy": 0.8789708614349365, "num_tokens": 152342545.0, "step": 3991 }, { "epoch": 0.5078234321333164, "ewc_loss": 0.036559391766786575, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013854312419425696, "grad_norm": 4.773796081542969, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8397712707519531, "num_tokens": 152386223.0, "step": 3992 }, { "epoch": 0.5079506424119069, "ewc_loss": 0.0365922674536705, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013887189561501145, "grad_norm": 4.784119129180908, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8634753227233887, "num_tokens": 152421811.0, "step": 3993 }, { "epoch": 0.5080778526904974, "ewc_loss": 0.03654814139008522, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013843062333762646, "grad_norm": 4.807809352874756, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8445885181427002, "num_tokens": 152454581.0, "step": 3994 }, { "epoch": 0.5082050629690879, "ewc_loss": 0.03670182079076767, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013874673459213227, "grad_norm": 4.795438289642334, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8472909927368164, "num_tokens": 152490591.0, "step": 3995 }, { "epoch": 0.5083322732476784, "ewc_loss": 0.03656501695513725, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013859939645044506, "grad_norm": 4.712752342224121, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8787616491317749, "num_tokens": 152532290.0, "step": 3996 }, { "epoch": 0.5084594835262689, "ewc_loss": 0.036679502576589584, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001385235518682748, "grad_norm": 4.765818119049072, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8614595532417297, "num_tokens": 152571984.0, "step": 3997 }, { "epoch": 0.5085866938048594, "ewc_loss": 0.03674754872918129, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013920399942435324, "grad_norm": 4.777668476104736, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8568404912948608, "num_tokens": 152611082.0, "step": 3998 }, { "epoch": 0.50871390408345, "ewc_loss": 0.0367375873029232, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013910439156461507, "grad_norm": 4.736239433288574, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8649265766143799, "num_tokens": 152655303.0, "step": 3999 }, { "epoch": 0.5088411143620405, "ewc_loss": 0.036697473376989365, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001387032534694299, "grad_norm": 4.982906818389893, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8559027910232544, "num_tokens": 152688432.0, "step": 4000 }, { "epoch": 0.5089683246406309, "ewc_loss": 0.03679024428129196, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013963095261715353, "grad_norm": 4.779326915740967, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8516056537628174, "num_tokens": 152725637.0, "step": 4001 }, { "epoch": 0.5090955349192214, "ewc_loss": 0.03659011423587799, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013762967137154192, "grad_norm": 4.820675849914551, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8551650047302246, "num_tokens": 152763108.0, "step": 4002 }, { "epoch": 0.509222745197812, "ewc_loss": 0.03670215606689453, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013875009608455002, "grad_norm": 4.744604110717773, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8631201386451721, "num_tokens": 152800659.0, "step": 4003 }, { "epoch": 0.5093499554764025, "ewc_loss": 0.03650213032960892, "ewc_loss_diag": 2.2649765014648438e-05, "ewc_loss_parallel": 0.00013797053543385118, "grad_norm": 4.732471466064453, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8470644950866699, "num_tokens": 152842797.0, "step": 4004 }, { "epoch": 0.509477165754993, "ewc_loss": 0.03670913726091385, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013881988706998527, "grad_norm": 4.780416011810303, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8478726744651794, "num_tokens": 152888257.0, "step": 4005 }, { "epoch": 0.5096043760335836, "ewc_loss": 0.03669015318155289, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000138630042783916, "grad_norm": 4.749063968658447, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8555372953414917, "num_tokens": 152929089.0, "step": 4006 }, { "epoch": 0.5097315863121741, "ewc_loss": 0.03668471425771713, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013857567682862282, "grad_norm": 4.80265998840332, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8508582711219788, "num_tokens": 152972710.0, "step": 4007 }, { "epoch": 0.5098587965907645, "ewc_loss": 0.03670684993267059, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001387970260111615, "grad_norm": 4.785672187805176, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8564534783363342, "num_tokens": 153008867.0, "step": 4008 }, { "epoch": 0.509986006869355, "ewc_loss": 0.0367097333073616, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001388258533552289, "grad_norm": 4.766939163208008, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8512935042381287, "num_tokens": 153051497.0, "step": 4009 }, { "epoch": 0.5101132171479456, "ewc_loss": 0.03669336438179016, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013866214430890977, "grad_norm": 4.784177780151367, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8723517060279846, "num_tokens": 153089295.0, "step": 4010 }, { "epoch": 0.5102404274265361, "ewc_loss": 0.03672858700156212, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001390143734170124, "grad_norm": 4.772548198699951, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8603408932685852, "num_tokens": 153123957.0, "step": 4011 }, { "epoch": 0.5103676377051266, "ewc_loss": 0.03669857606291771, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.000138714283821173, "grad_norm": 4.743061065673828, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8541423082351685, "num_tokens": 153165248.0, "step": 4012 }, { "epoch": 0.5104948479837171, "ewc_loss": 0.036707520484924316, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013880373444408178, "grad_norm": 4.76358699798584, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8706899881362915, "num_tokens": 153206163.0, "step": 4013 }, { "epoch": 0.5106220582623076, "ewc_loss": 0.036758437752723694, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001393128914060071, "grad_norm": 4.7535080909729, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8476372957229614, "num_tokens": 153248666.0, "step": 4014 }, { "epoch": 0.5107492685408981, "ewc_loss": 0.036727387458086014, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013900238263886422, "grad_norm": 4.738433361053467, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.879297137260437, "num_tokens": 153289009.0, "step": 4015 }, { "epoch": 0.5108764788194886, "ewc_loss": 0.036739520728588104, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013912373105995357, "grad_norm": 4.8106560707092285, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8616491556167603, "num_tokens": 153321354.0, "step": 4016 }, { "epoch": 0.5110036890980791, "ewc_loss": 0.03678988292813301, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013962735829409212, "grad_norm": 4.845101356506348, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8572851419448853, "num_tokens": 153361061.0, "step": 4017 }, { "epoch": 0.5111308993766697, "ewc_loss": 0.036747731268405914, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001392058184137568, "grad_norm": 4.76431941986084, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8500730991363525, "num_tokens": 153397002.0, "step": 4018 }, { "epoch": 0.5112581096552602, "ewc_loss": 0.03673072159290314, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013903575018048286, "grad_norm": 4.778402805328369, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8424695730209351, "num_tokens": 153435264.0, "step": 4019 }, { "epoch": 0.5113853199338506, "ewc_loss": 0.03679560497403145, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013968456187285483, "grad_norm": 4.7795209884643555, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8546261787414551, "num_tokens": 153474751.0, "step": 4020 }, { "epoch": 0.5115125302124411, "ewc_loss": 0.0367903858423233, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001396323787048459, "grad_norm": 4.754087448120117, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8651243448257446, "num_tokens": 153510084.0, "step": 4021 }, { "epoch": 0.5116397404910317, "ewc_loss": 0.036789264529943466, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013962115917820483, "grad_norm": 4.765934944152832, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8431997299194336, "num_tokens": 153550387.0, "step": 4022 }, { "epoch": 0.5117669507696222, "ewc_loss": 0.036804258823394775, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001397710875608027, "grad_norm": 4.719327926635742, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8790671825408936, "num_tokens": 153592441.0, "step": 4023 }, { "epoch": 0.5118941610482127, "ewc_loss": 0.036814622581005096, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013987472630105913, "grad_norm": 4.768195629119873, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8637956976890564, "num_tokens": 153629202.0, "step": 4024 }, { "epoch": 0.5120213713268033, "ewc_loss": 0.03685193136334419, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014024782285559922, "grad_norm": 4.811554431915283, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8513597846031189, "num_tokens": 153669243.0, "step": 4025 }, { "epoch": 0.5121485816053937, "ewc_loss": 0.0368150994181633, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001398795284330845, "grad_norm": 4.768051624298096, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8516931533813477, "num_tokens": 153709206.0, "step": 4026 }, { "epoch": 0.5122757918839842, "ewc_loss": 0.036808889359235764, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001398174063069746, "grad_norm": 4.88304328918457, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8638362288475037, "num_tokens": 153741251.0, "step": 4027 }, { "epoch": 0.5124030021625747, "ewc_loss": 0.03685075044631958, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014023602125234902, "grad_norm": 4.789057731628418, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8425388932228088, "num_tokens": 153775559.0, "step": 4028 }, { "epoch": 0.5125302124411653, "ewc_loss": 0.03677782788872719, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001395067956764251, "grad_norm": 4.744369983673096, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8604913949966431, "num_tokens": 153819158.0, "step": 4029 }, { "epoch": 0.5126574227197558, "ewc_loss": 0.0367930643260479, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001396591542288661, "grad_norm": 4.7661051750183105, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8503700494766235, "num_tokens": 153856231.0, "step": 4030 }, { "epoch": 0.5127846329983463, "ewc_loss": 0.036797743290662766, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013970595318824053, "grad_norm": 4.7388129234313965, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8673985004425049, "num_tokens": 153897125.0, "step": 4031 }, { "epoch": 0.5129118432769367, "ewc_loss": 0.03678445518016815, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013957306509837508, "grad_norm": 4.762528419494629, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8632965087890625, "num_tokens": 153933552.0, "step": 4032 }, { "epoch": 0.5130390535555273, "ewc_loss": 0.036827486008405685, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001400033652316779, "grad_norm": 4.78463888168335, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8426501154899597, "num_tokens": 153974337.0, "step": 4033 }, { "epoch": 0.5131662638341178, "ewc_loss": 0.036810167133808136, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001398301828885451, "grad_norm": 4.708970069885254, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8516621589660645, "num_tokens": 154018911.0, "step": 4034 }, { "epoch": 0.5132934741127083, "ewc_loss": 0.03678599372506142, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013958844647277147, "grad_norm": 4.757206439971924, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8673802614212036, "num_tokens": 154056441.0, "step": 4035 }, { "epoch": 0.5134206843912988, "ewc_loss": 0.03687207028269768, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014044922136235982, "grad_norm": 4.7793989181518555, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8470679521560669, "num_tokens": 154096326.0, "step": 4036 }, { "epoch": 0.5135478946698894, "ewc_loss": 0.03683368116617203, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014006531273480505, "grad_norm": 4.744369983673096, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8609926104545593, "num_tokens": 154137223.0, "step": 4037 }, { "epoch": 0.5136751049484798, "ewc_loss": 0.036818768829107285, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013991619925945997, "grad_norm": 4.869330883026123, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8545124530792236, "num_tokens": 154168312.0, "step": 4038 }, { "epoch": 0.5138023152270703, "ewc_loss": 0.03687417507171631, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001404702488798648, "grad_norm": 4.742759704589844, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8693598508834839, "num_tokens": 154203125.0, "step": 4039 }, { "epoch": 0.5139295255056608, "ewc_loss": 0.03679397329688072, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00013966824917588383, "grad_norm": 4.747141361236572, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8504976630210876, "num_tokens": 154248437.0, "step": 4040 }, { "epoch": 0.5140567357842514, "ewc_loss": 0.03684980422258377, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014022656250745058, "grad_norm": 4.757082462310791, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8606570363044739, "num_tokens": 154286303.0, "step": 4041 }, { "epoch": 0.5141839460628419, "ewc_loss": 0.03685976564884186, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014032618491910398, "grad_norm": 4.782162666320801, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8469821214675903, "num_tokens": 154328638.0, "step": 4042 }, { "epoch": 0.5143111563414324, "ewc_loss": 0.03686243295669556, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014035285857971758, "grad_norm": 4.817820072174072, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8559027910232544, "num_tokens": 154362953.0, "step": 4043 }, { "epoch": 0.5144383666200228, "ewc_loss": 0.036843497306108475, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001401634799549356, "grad_norm": 4.788942337036133, "learning_rate": 1e-06, "loss": 0.5109, "mean_token_accuracy": 0.8447147607803345, "num_tokens": 154401289.0, "step": 4044 }, { "epoch": 0.5145655768986134, "ewc_loss": 0.03685666620731354, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001402951602358371, "grad_norm": 4.7747673988342285, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8610429763793945, "num_tokens": 154438779.0, "step": 4045 }, { "epoch": 0.5146927871772039, "ewc_loss": 0.03683659806847572, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.00014009448932483792, "grad_norm": 4.857934474945068, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8508669137954712, "num_tokens": 154467032.0, "step": 4046 }, { "epoch": 0.5148199974557944, "ewc_loss": 0.03689990192651749, "ewc_loss_diag": 2.276897430419922e-05, "ewc_loss_parallel": 0.0001407275558449328, "grad_norm": 4.736255645751953, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8657589554786682, "num_tokens": 154505222.0, "step": 4047 }, { "epoch": 0.514947207734385, "ewc_loss": 0.0369323194026947, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013983102689962834, "grad_norm": 4.800844192504883, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.843116819858551, "num_tokens": 154544953.0, "step": 4048 }, { "epoch": 0.5150744180129755, "ewc_loss": 0.03700722008943558, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014058001397643238, "grad_norm": 4.707317352294922, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8685880303382874, "num_tokens": 154586193.0, "step": 4049 }, { "epoch": 0.5152016282915659, "ewc_loss": 0.03694600611925125, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013996785855852067, "grad_norm": 4.761031627655029, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8598642945289612, "num_tokens": 154629531.0, "step": 4050 }, { "epoch": 0.5153288385701564, "ewc_loss": 0.03699737787246704, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014048159937374294, "grad_norm": 4.799764633178711, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8724111318588257, "num_tokens": 154659482.0, "step": 4051 }, { "epoch": 0.515456048848747, "ewc_loss": 0.03697977215051651, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014030553575139493, "grad_norm": 4.821352005004883, "learning_rate": 1e-06, "loss": 0.56, "mean_token_accuracy": 0.8236921429634094, "num_tokens": 154702645.0, "step": 4052 }, { "epoch": 0.5155832591273375, "ewc_loss": 0.03698080778121948, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014031586761120707, "grad_norm": 4.714584827423096, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8658367395401001, "num_tokens": 154743329.0, "step": 4053 }, { "epoch": 0.515710469405928, "ewc_loss": 0.03696312755346298, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014013909094501287, "grad_norm": 4.761740684509277, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8535617589950562, "num_tokens": 154783460.0, "step": 4054 }, { "epoch": 0.5158376796845185, "ewc_loss": 0.03698403388261795, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014034814375918359, "grad_norm": 4.710423946380615, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8504739999771118, "num_tokens": 154827209.0, "step": 4055 }, { "epoch": 0.5159648899631091, "ewc_loss": 0.03700852394104004, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000140593052492477, "grad_norm": 4.8164238929748535, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8320821523666382, "num_tokens": 154866577.0, "step": 4056 }, { "epoch": 0.5160921002416995, "ewc_loss": 0.0370451956987381, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014095976075623184, "grad_norm": 4.723560810089111, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8592574596405029, "num_tokens": 154912191.0, "step": 4057 }, { "epoch": 0.51621931052029, "ewc_loss": 0.03701341152191162, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014064193237572908, "grad_norm": 4.805136680603027, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.844302773475647, "num_tokens": 154953412.0, "step": 4058 }, { "epoch": 0.5163465207988805, "ewc_loss": 0.03701196610927582, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014062748232390732, "grad_norm": 4.71319580078125, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8676779866218567, "num_tokens": 154989913.0, "step": 4059 }, { "epoch": 0.5164737310774711, "ewc_loss": 0.03699008375406265, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014040865062270314, "grad_norm": 4.724660396575928, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8439778089523315, "num_tokens": 155038094.0, "step": 4060 }, { "epoch": 0.5166009413560616, "ewc_loss": 0.0370025560259819, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001405333896400407, "grad_norm": 4.799129962921143, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8472306728363037, "num_tokens": 155075900.0, "step": 4061 }, { "epoch": 0.5167281516346521, "ewc_loss": 0.03703228384256363, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000140830670716241, "grad_norm": 4.773253917694092, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8561161756515503, "num_tokens": 155117038.0, "step": 4062 }, { "epoch": 0.5168553619132426, "ewc_loss": 0.036979515105485916, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014030297461431473, "grad_norm": 4.815695762634277, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.871016263961792, "num_tokens": 155151442.0, "step": 4063 }, { "epoch": 0.5169825721918331, "ewc_loss": 0.03698061406612396, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014031394675839692, "grad_norm": 4.783894062042236, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8646831512451172, "num_tokens": 155192288.0, "step": 4064 }, { "epoch": 0.5171097824704236, "ewc_loss": 0.03694925457239151, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014000035298522562, "grad_norm": 4.77478551864624, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.859261691570282, "num_tokens": 155238931.0, "step": 4065 }, { "epoch": 0.5172369927490141, "ewc_loss": 0.036921821534633636, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001397260493831709, "grad_norm": 4.749321460723877, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8696845769882202, "num_tokens": 155276644.0, "step": 4066 }, { "epoch": 0.5173642030276047, "ewc_loss": 0.03694302588701248, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001399380707880482, "grad_norm": 4.907547473907471, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8688133955001831, "num_tokens": 155307638.0, "step": 4067 }, { "epoch": 0.5174914133061952, "ewc_loss": 0.03698960691690445, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014040389214642346, "grad_norm": 4.769410133361816, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8499770164489746, "num_tokens": 155343255.0, "step": 4068 }, { "epoch": 0.5176186235847856, "ewc_loss": 0.036862485110759735, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013913268048781902, "grad_norm": 4.866207599639893, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8586385250091553, "num_tokens": 155382283.0, "step": 4069 }, { "epoch": 0.5177458338633761, "ewc_loss": 0.036996133625507355, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014046912838239223, "grad_norm": 4.795691013336182, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8343284130096436, "num_tokens": 155427096.0, "step": 4070 }, { "epoch": 0.5178730441419667, "ewc_loss": 0.03691695258021355, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013967732957098633, "grad_norm": 4.978046894073486, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8449468612670898, "num_tokens": 155467472.0, "step": 4071 }, { "epoch": 0.5180002544205572, "ewc_loss": 0.03694681078195572, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013997590576764196, "grad_norm": 4.722102165222168, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8508185148239136, "num_tokens": 155508350.0, "step": 4072 }, { "epoch": 0.5181274646991477, "ewc_loss": 0.036770403385162354, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013821186439599842, "grad_norm": 4.741511821746826, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8796087503433228, "num_tokens": 155547215.0, "step": 4073 }, { "epoch": 0.5182546749777382, "ewc_loss": 0.0369161032140255, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013966883125249296, "grad_norm": 4.82210111618042, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8612785339355469, "num_tokens": 155586658.0, "step": 4074 }, { "epoch": 0.5183818852563287, "ewc_loss": 0.03689790144562721, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013948683044873178, "grad_norm": 4.855228900909424, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8596329689025879, "num_tokens": 155620944.0, "step": 4075 }, { "epoch": 0.5185090955349192, "ewc_loss": 0.03690551221370697, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013956293696537614, "grad_norm": 4.772337913513184, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8538970351219177, "num_tokens": 155662699.0, "step": 4076 }, { "epoch": 0.5186363058135097, "ewc_loss": 0.0368439145386219, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001389469689456746, "grad_norm": 4.75567102432251, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8444415330886841, "num_tokens": 155707783.0, "step": 4077 }, { "epoch": 0.5187635160921003, "ewc_loss": 0.036893125623464584, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013943907106295228, "grad_norm": 4.762418746948242, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8623449802398682, "num_tokens": 155745227.0, "step": 4078 }, { "epoch": 0.5188907263706908, "ewc_loss": 0.036921411752700806, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013972191663924605, "grad_norm": 4.7582526206970215, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8585425019264221, "num_tokens": 155786466.0, "step": 4079 }, { "epoch": 0.5190179366492813, "ewc_loss": 0.03690992668271065, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000139607087476179, "grad_norm": 4.773252487182617, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8593038320541382, "num_tokens": 155833145.0, "step": 4080 }, { "epoch": 0.5191451469278717, "ewc_loss": 0.0369473434984684, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013998124632053077, "grad_norm": 4.821469783782959, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8482046723365784, "num_tokens": 155869533.0, "step": 4081 }, { "epoch": 0.5192723572064623, "ewc_loss": 0.03692786395549774, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013978646893519908, "grad_norm": 4.749847412109375, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8557808995246887, "num_tokens": 155909329.0, "step": 4082 }, { "epoch": 0.5193995674850528, "ewc_loss": 0.03691071271896362, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013961496006231755, "grad_norm": 4.785958290100098, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8660963177680969, "num_tokens": 155947848.0, "step": 4083 }, { "epoch": 0.5195267777636433, "ewc_loss": 0.036992497742176056, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014043277769815177, "grad_norm": 4.81651496887207, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8618733882904053, "num_tokens": 155982693.0, "step": 4084 }, { "epoch": 0.5196539880422338, "ewc_loss": 0.036979176104068756, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014029955491423607, "grad_norm": 4.772865295410156, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8552719950675964, "num_tokens": 156023414.0, "step": 4085 }, { "epoch": 0.5197811983208244, "ewc_loss": 0.03692188858985901, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00013972670421935618, "grad_norm": 4.747504711151123, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8562769889831543, "num_tokens": 156063488.0, "step": 4086 }, { "epoch": 0.5199084085994148, "ewc_loss": 0.03698274493217468, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014033525076229125, "grad_norm": 4.767665863037109, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8564853668212891, "num_tokens": 156103081.0, "step": 4087 }, { "epoch": 0.5200356188780053, "ewc_loss": 0.03700011596083641, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001405089715262875, "grad_norm": 4.784299850463867, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8476903438568115, "num_tokens": 156139453.0, "step": 4088 }, { "epoch": 0.5201628291565958, "ewc_loss": 0.03702367842197418, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014074461068958044, "grad_norm": 4.78428316116333, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8509775996208191, "num_tokens": 156176564.0, "step": 4089 }, { "epoch": 0.5202900394351864, "ewc_loss": 0.037037692964076996, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014088473108131438, "grad_norm": 4.762190818786621, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8597984910011292, "num_tokens": 156221278.0, "step": 4090 }, { "epoch": 0.5204172497137769, "ewc_loss": 0.03703456372022629, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014085345901548862, "grad_norm": 4.825658798217773, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8615095615386963, "num_tokens": 156259124.0, "step": 4091 }, { "epoch": 0.5205444599923674, "ewc_loss": 0.03706269711256027, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014113480574451387, "grad_norm": 4.788310527801514, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8465273380279541, "num_tokens": 156297832.0, "step": 4092 }, { "epoch": 0.5206716702709578, "ewc_loss": 0.03703669086098671, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001408747339155525, "grad_norm": 4.859157085418701, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.862837553024292, "num_tokens": 156329745.0, "step": 4093 }, { "epoch": 0.5207988805495484, "ewc_loss": 0.037107862532138824, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014158642443362623, "grad_norm": 4.842330455780029, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8569495677947998, "num_tokens": 156360154.0, "step": 4094 }, { "epoch": 0.5209260908281389, "ewc_loss": 0.037065066397190094, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014115848171059042, "grad_norm": 4.730869770050049, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8593423366546631, "num_tokens": 156399360.0, "step": 4095 }, { "epoch": 0.5210533011067294, "ewc_loss": 0.03704331815242767, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014094098878558725, "grad_norm": 4.763666152954102, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8552855849266052, "num_tokens": 156438782.0, "step": 4096 }, { "epoch": 0.52118051138532, "ewc_loss": 0.037111155688762665, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014161935541778803, "grad_norm": 4.786314487457275, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.863664984703064, "num_tokens": 156470261.0, "step": 4097 }, { "epoch": 0.5213077216639105, "ewc_loss": 0.03711892291903496, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014169704809319228, "grad_norm": 4.847497940063477, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8398648500442505, "num_tokens": 156510493.0, "step": 4098 }, { "epoch": 0.5214349319425009, "ewc_loss": 0.037124816328287125, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014175596879795194, "grad_norm": 4.81246280670166, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8595892190933228, "num_tokens": 156545040.0, "step": 4099 }, { "epoch": 0.5215621422210914, "ewc_loss": 0.037092506885528564, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001414328726241365, "grad_norm": 4.7911834716796875, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8550269603729248, "num_tokens": 156583474.0, "step": 4100 }, { "epoch": 0.521689352499682, "ewc_loss": 0.037083886563777924, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014134669618215412, "grad_norm": 4.7674431800842285, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8569843769073486, "num_tokens": 156627137.0, "step": 4101 }, { "epoch": 0.5218165627782725, "ewc_loss": 0.03712030500173569, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014171084330882877, "grad_norm": 4.864400386810303, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.846122145652771, "num_tokens": 156664726.0, "step": 4102 }, { "epoch": 0.521943773056863, "ewc_loss": 0.037132758647203445, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014183539315126836, "grad_norm": 4.7378082275390625, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8741265535354614, "num_tokens": 156707014.0, "step": 4103 }, { "epoch": 0.5220709833354535, "ewc_loss": 0.03706483170390129, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014115613885223866, "grad_norm": 4.845447540283203, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.854350209236145, "num_tokens": 156747784.0, "step": 4104 }, { "epoch": 0.522198193614044, "ewc_loss": 0.03714675083756447, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014197532436810434, "grad_norm": 4.8254828453063965, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8543547987937927, "num_tokens": 156784760.0, "step": 4105 }, { "epoch": 0.5223254038926345, "ewc_loss": 0.03705982863903046, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001411060948157683, "grad_norm": 4.8265790939331055, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8543580174446106, "num_tokens": 156822899.0, "step": 4106 }, { "epoch": 0.522452614171225, "ewc_loss": 0.03708118945360184, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001413197023794055, "grad_norm": 4.764352798461914, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8633849024772644, "num_tokens": 156861308.0, "step": 4107 }, { "epoch": 0.5225798244498155, "ewc_loss": 0.037065304815769196, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014116083912085742, "grad_norm": 4.7701520919799805, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8630228638648987, "num_tokens": 156901981.0, "step": 4108 }, { "epoch": 0.5227070347284061, "ewc_loss": 0.0370759516954422, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001412673300364986, "grad_norm": 4.751570701599121, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8620270490646362, "num_tokens": 156942215.0, "step": 4109 }, { "epoch": 0.5228342450069966, "ewc_loss": 0.03705916926264763, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014109950279816985, "grad_norm": 4.804130554199219, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8642098903656006, "num_tokens": 156983820.0, "step": 4110 }, { "epoch": 0.5229614552855871, "ewc_loss": 0.03709038347005844, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001414116268279031, "grad_norm": 4.81367826461792, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8694489002227783, "num_tokens": 157016215.0, "step": 4111 }, { "epoch": 0.5230886655641775, "ewc_loss": 0.03708526864647865, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014136050594970584, "grad_norm": 4.855630874633789, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8567551374435425, "num_tokens": 157052071.0, "step": 4112 }, { "epoch": 0.5232158758427681, "ewc_loss": 0.03708040341734886, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.0001413118588970974, "grad_norm": 4.784180641174316, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8357071280479431, "num_tokens": 157097711.0, "step": 4113 }, { "epoch": 0.5233430861213586, "ewc_loss": 0.03704586625099182, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014096649829298258, "grad_norm": 4.826084613800049, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8550910353660583, "num_tokens": 157134801.0, "step": 4114 }, { "epoch": 0.5234702963999491, "ewc_loss": 0.03707274794578552, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014123527216725051, "grad_norm": 4.738880634307861, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8573037385940552, "num_tokens": 157171885.0, "step": 4115 }, { "epoch": 0.5235975066785397, "ewc_loss": 0.03704226016998291, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014093042409513146, "grad_norm": 4.823960304260254, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8557440042495728, "num_tokens": 157211915.0, "step": 4116 }, { "epoch": 0.5237247169571302, "ewc_loss": 0.037123408168554306, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.00014174188254401088, "grad_norm": 4.745199680328369, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8511807322502136, "num_tokens": 157260564.0, "step": 4117 }, { "epoch": 0.5238519272357206, "ewc_loss": 0.03704873472452164, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014099516556598246, "grad_norm": 4.8401594161987305, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8590306043624878, "num_tokens": 157294676.0, "step": 4118 }, { "epoch": 0.5239791375143111, "ewc_loss": 0.037133704870939255, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014184486644808203, "grad_norm": 4.7810750007629395, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8667672872543335, "num_tokens": 157334607.0, "step": 4119 }, { "epoch": 0.5241063477929017, "ewc_loss": 0.03705974295735359, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014110523625276983, "grad_norm": 4.812500476837158, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8582541942596436, "num_tokens": 157372021.0, "step": 4120 }, { "epoch": 0.5242335580714922, "ewc_loss": 0.0371321439743042, "ewc_loss_diag": 2.288818359375e-05, "ewc_loss_parallel": 0.000141829252243042, "grad_norm": 4.761328220367432, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8619673252105713, "num_tokens": 157410092.0, "step": 4121 }, { "epoch": 0.5243607683500827, "ewc_loss": 0.03706297278404236, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014113752695266157, "grad_norm": 4.851516246795654, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8541470766067505, "num_tokens": 157447979.0, "step": 4122 }, { "epoch": 0.5244879786286732, "ewc_loss": 0.03715462237596512, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001420540502294898, "grad_norm": 4.847750186920166, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8518331050872803, "num_tokens": 157481376.0, "step": 4123 }, { "epoch": 0.5246151889072637, "ewc_loss": 0.03711390122771263, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014164682943373919, "grad_norm": 4.836150646209717, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8629684448242188, "num_tokens": 157519555.0, "step": 4124 }, { "epoch": 0.5247423991858542, "ewc_loss": 0.03714307025074959, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014193850802257657, "grad_norm": 4.843811511993408, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8553215265274048, "num_tokens": 157551849.0, "step": 4125 }, { "epoch": 0.5248696094644447, "ewc_loss": 0.03713911771774292, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014189899957273155, "grad_norm": 4.813931465148926, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8569156527519226, "num_tokens": 157590823.0, "step": 4126 }, { "epoch": 0.5249968197430352, "ewc_loss": 0.03710220381617546, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014152986113913357, "grad_norm": 4.805883884429932, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8675948977470398, "num_tokens": 157633771.0, "step": 4127 }, { "epoch": 0.5251240300216258, "ewc_loss": 0.03713422641158104, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014185007603373379, "grad_norm": 4.8408637046813965, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8708334565162659, "num_tokens": 157667796.0, "step": 4128 }, { "epoch": 0.5252512403002163, "ewc_loss": 0.03710947930812836, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014160259161144495, "grad_norm": 4.7633442878723145, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8671698570251465, "num_tokens": 157709452.0, "step": 4129 }, { "epoch": 0.5253784505788067, "ewc_loss": 0.03711424022912979, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014165020547807217, "grad_norm": 4.88933801651001, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8485000729560852, "num_tokens": 157741407.0, "step": 4130 }, { "epoch": 0.5255056608573972, "ewc_loss": 0.03718845173716545, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014239232405088842, "grad_norm": 4.807248115539551, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8686712384223938, "num_tokens": 157777693.0, "step": 4131 }, { "epoch": 0.5256328711359878, "ewc_loss": 0.03709083050489426, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014141612336970866, "grad_norm": 4.801449298858643, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8546725511550903, "num_tokens": 157814729.0, "step": 4132 }, { "epoch": 0.5257600814145783, "ewc_loss": 0.03715880960226059, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014209590153768659, "grad_norm": 4.860842227935791, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8663632869720459, "num_tokens": 157853698.0, "step": 4133 }, { "epoch": 0.5258872916931688, "ewc_loss": 0.03715510666370392, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014205888146534562, "grad_norm": 4.833733081817627, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8623213768005371, "num_tokens": 157888580.0, "step": 4134 }, { "epoch": 0.5260145019717594, "ewc_loss": 0.03711941838264465, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014170201029628515, "grad_norm": 4.817951202392578, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8559472560882568, "num_tokens": 157928434.0, "step": 4135 }, { "epoch": 0.5261417122503498, "ewc_loss": 0.03716989606618881, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001422067725798115, "grad_norm": 4.856851577758789, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8529363870620728, "num_tokens": 157968936.0, "step": 4136 }, { "epoch": 0.5262689225289403, "ewc_loss": 0.03715486824512482, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014205648039933294, "grad_norm": 4.836719989776611, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8460750579833984, "num_tokens": 158005496.0, "step": 4137 }, { "epoch": 0.5263961328075308, "ewc_loss": 0.03712814301252365, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014178926357999444, "grad_norm": 4.811136722564697, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8646615743637085, "num_tokens": 158047569.0, "step": 4138 }, { "epoch": 0.5265233430861214, "ewc_loss": 0.03714081272482872, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014191593800205737, "grad_norm": 4.799936771392822, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8572965264320374, "num_tokens": 158085733.0, "step": 4139 }, { "epoch": 0.5266505533647119, "ewc_loss": 0.037148673087358475, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014199453289620578, "grad_norm": 4.891378879547119, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8470771312713623, "num_tokens": 158123906.0, "step": 4140 }, { "epoch": 0.5267777636433024, "ewc_loss": 0.03717359900474548, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014224379265215248, "grad_norm": 4.765705585479736, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8634465932846069, "num_tokens": 158163663.0, "step": 4141 }, { "epoch": 0.5269049739218928, "ewc_loss": 0.0371306873857975, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001418146857758984, "grad_norm": 4.819005966186523, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8454074859619141, "num_tokens": 158205878.0, "step": 4142 }, { "epoch": 0.5270321842004834, "ewc_loss": 0.03720363229513168, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014254414418246597, "grad_norm": 4.845638751983643, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.861269474029541, "num_tokens": 158241608.0, "step": 4143 }, { "epoch": 0.5271593944790739, "ewc_loss": 0.037205785512924194, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014256566646508873, "grad_norm": 4.889281272888184, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8690202832221985, "num_tokens": 158274259.0, "step": 4144 }, { "epoch": 0.5272866047576644, "ewc_loss": 0.03719460219144821, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014245384954847395, "grad_norm": 4.860910415649414, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8640763759613037, "num_tokens": 158304793.0, "step": 4145 }, { "epoch": 0.527413815036255, "ewc_loss": 0.03718770667910576, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001423848734702915, "grad_norm": 4.7748236656188965, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.870976448059082, "num_tokens": 158349420.0, "step": 4146 }, { "epoch": 0.5275410253148455, "ewc_loss": 0.037139978259801865, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001419075852027163, "grad_norm": 4.762653350830078, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8559511303901672, "num_tokens": 158395834.0, "step": 4147 }, { "epoch": 0.5276682355934359, "ewc_loss": 0.037161894142627716, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014212676614988595, "grad_norm": 4.844429016113281, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8595863580703735, "num_tokens": 158435568.0, "step": 4148 }, { "epoch": 0.5277954458720264, "ewc_loss": 0.03718917816877365, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014239957090467215, "grad_norm": 4.780271530151367, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8679149150848389, "num_tokens": 158473584.0, "step": 4149 }, { "epoch": 0.527922656150617, "ewc_loss": 0.037131424993276596, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014182204904500395, "grad_norm": 4.808446884155273, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8589444160461426, "num_tokens": 158513435.0, "step": 4150 }, { "epoch": 0.5280498664292075, "ewc_loss": 0.037223972380161285, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014274753630161285, "grad_norm": 4.799288749694824, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.858237087726593, "num_tokens": 158549478.0, "step": 4151 }, { "epoch": 0.528177076707798, "ewc_loss": 0.037415362894535065, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014222002937458456, "grad_norm": 4.72803258895874, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8623437285423279, "num_tokens": 158598742.0, "step": 4152 }, { "epoch": 0.5283042869863885, "ewc_loss": 0.037154898047447205, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014205678598955274, "grad_norm": 4.831809997558594, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8458131551742554, "num_tokens": 158636806.0, "step": 4153 }, { "epoch": 0.528431497264979, "ewc_loss": 0.03721727430820465, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014268053928390145, "grad_norm": 4.780179500579834, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8502709865570068, "num_tokens": 158676043.0, "step": 4154 }, { "epoch": 0.5285587075435695, "ewc_loss": 0.037169042974710464, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001421982451574877, "grad_norm": 4.829319953918457, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8429343700408936, "num_tokens": 158711772.0, "step": 4155 }, { "epoch": 0.52868591782216, "ewc_loss": 0.03722443804144859, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014275219291448593, "grad_norm": 4.812543869018555, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.856137216091156, "num_tokens": 158749608.0, "step": 4156 }, { "epoch": 0.5288131281007505, "ewc_loss": 0.03720433637499809, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014255117275752127, "grad_norm": 4.821478366851807, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.857879638671875, "num_tokens": 158790502.0, "step": 4157 }, { "epoch": 0.5289403383793411, "ewc_loss": 0.037186600267887115, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014237382856663316, "grad_norm": 4.786139488220215, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.8339555263519287, "num_tokens": 158828415.0, "step": 4158 }, { "epoch": 0.5290675486579316, "ewc_loss": 0.03718209266662598, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001423287612851709, "grad_norm": 4.811103820800781, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8480352163314819, "num_tokens": 158866972.0, "step": 4159 }, { "epoch": 0.5291947589365221, "ewc_loss": 0.03723059967160225, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014281382027547807, "grad_norm": 4.8501691818237305, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8551127910614014, "num_tokens": 158902991.0, "step": 4160 }, { "epoch": 0.5293219692151125, "ewc_loss": 0.03723982721567154, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014290609396994114, "grad_norm": 4.779658317565918, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8650031089782715, "num_tokens": 158945963.0, "step": 4161 }, { "epoch": 0.5294491794937031, "ewc_loss": 0.03722192347049713, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001427270472049713, "grad_norm": 4.828943252563477, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8560131788253784, "num_tokens": 158985407.0, "step": 4162 }, { "epoch": 0.5295763897722936, "ewc_loss": 0.037275172770023346, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001432595308870077, "grad_norm": 4.826220989227295, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8496155738830566, "num_tokens": 159024387.0, "step": 4163 }, { "epoch": 0.5297036000508841, "ewc_loss": 0.037224020808935165, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014274803106673062, "grad_norm": 4.83933687210083, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8493212461471558, "num_tokens": 159061430.0, "step": 4164 }, { "epoch": 0.5298308103294747, "ewc_loss": 0.03723976016044617, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014290539547801018, "grad_norm": 4.816638469696045, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8584138751029968, "num_tokens": 159099350.0, "step": 4165 }, { "epoch": 0.5299580206080652, "ewc_loss": 0.037230513989925385, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014281293260864913, "grad_norm": 4.9195709228515625, "learning_rate": 1e-06, "loss": 0.5507, "mean_token_accuracy": 0.8322299718856812, "num_tokens": 159136276.0, "step": 4166 }, { "epoch": 0.5300852308866556, "ewc_loss": 0.037287842482328415, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001433862344129011, "grad_norm": 4.779546737670898, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8790209293365479, "num_tokens": 159177102.0, "step": 4167 }, { "epoch": 0.5302124411652461, "ewc_loss": 0.037182122468948364, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014232905232347548, "grad_norm": 4.866485595703125, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.868460476398468, "num_tokens": 159213582.0, "step": 4168 }, { "epoch": 0.5303396514438367, "ewc_loss": 0.0372648723423481, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014315653243102133, "grad_norm": 4.801455497741699, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8565058708190918, "num_tokens": 159254283.0, "step": 4169 }, { "epoch": 0.5304668617224272, "ewc_loss": 0.037188827991485596, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014239607844501734, "grad_norm": 4.880422115325928, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8595833778381348, "num_tokens": 159288557.0, "step": 4170 }, { "epoch": 0.5305940720010177, "ewc_loss": 0.03751015663146973, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001431679556844756, "grad_norm": 4.808403491973877, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.867290735244751, "num_tokens": 159325047.0, "step": 4171 }, { "epoch": 0.5307212822796082, "ewc_loss": 0.03718726336956024, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014238043513614684, "grad_norm": 4.846354007720947, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8611044883728027, "num_tokens": 159366291.0, "step": 4172 }, { "epoch": 0.5308484925581987, "ewc_loss": 0.03723233565688133, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014283116615843028, "grad_norm": 4.8571648597717285, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8513076901435852, "num_tokens": 159401634.0, "step": 4173 }, { "epoch": 0.5309757028367892, "ewc_loss": 0.03722510486841202, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014275887224357575, "grad_norm": 4.8993353843688965, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8494539260864258, "num_tokens": 159435316.0, "step": 4174 }, { "epoch": 0.5311029131153797, "ewc_loss": 0.03725467249751091, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001430545380571857, "grad_norm": 4.829139232635498, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.850723385810852, "num_tokens": 159472192.0, "step": 4175 }, { "epoch": 0.5312301233939702, "ewc_loss": 0.03721672296524048, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001426750241080299, "grad_norm": 4.789408206939697, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8641276359558105, "num_tokens": 159515342.0, "step": 4176 }, { "epoch": 0.5313573336725608, "ewc_loss": 0.03720969706773758, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014260476746130735, "grad_norm": 4.797847270965576, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8619828224182129, "num_tokens": 159553932.0, "step": 4177 }, { "epoch": 0.5314845439511513, "ewc_loss": 0.03724656254053116, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014297345478553325, "grad_norm": 4.850188255310059, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8586918711662292, "num_tokens": 159584711.0, "step": 4178 }, { "epoch": 0.5316117542297417, "ewc_loss": 0.037277039140462875, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014327821554616094, "grad_norm": 4.832753658294678, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.844996452331543, "num_tokens": 159618784.0, "step": 4179 }, { "epoch": 0.5317389645083322, "ewc_loss": 0.03724341094493866, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014294190623331815, "grad_norm": 4.742212295532227, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8548815846443176, "num_tokens": 159661288.0, "step": 4180 }, { "epoch": 0.5318661747869228, "ewc_loss": 0.037274450063705444, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014325232768896967, "grad_norm": 4.8515400886535645, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.868348240852356, "num_tokens": 159701286.0, "step": 4181 }, { "epoch": 0.5319933850655133, "ewc_loss": 0.03756031021475792, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001436695019947365, "grad_norm": 4.754396915435791, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8452508449554443, "num_tokens": 159746631.0, "step": 4182 }, { "epoch": 0.5321205953441038, "ewc_loss": 0.03728542476892471, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014336204912979156, "grad_norm": 4.810272216796875, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8599152565002441, "num_tokens": 159787753.0, "step": 4183 }, { "epoch": 0.5322478056226944, "ewc_loss": 0.03731152042746544, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001436230231774971, "grad_norm": 4.829493999481201, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8495216965675354, "num_tokens": 159823092.0, "step": 4184 }, { "epoch": 0.5323750159012848, "ewc_loss": 0.03727944195270538, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.0001433022553101182, "grad_norm": 4.846498966217041, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8484891653060913, "num_tokens": 159858480.0, "step": 4185 }, { "epoch": 0.5325022261798753, "ewc_loss": 0.03728925436735153, "ewc_loss_diag": 2.300739288330078e-05, "ewc_loss_parallel": 0.00014340036432258785, "grad_norm": 4.801652431488037, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8476557731628418, "num_tokens": 159896220.0, "step": 4186 }, { "epoch": 0.5326294364584658, "ewc_loss": 0.03753028064966202, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014336919412016869, "grad_norm": 4.82567024230957, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8560119271278381, "num_tokens": 159930995.0, "step": 4187 }, { "epoch": 0.5327566467370564, "ewc_loss": 0.0375327467918396, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014339388872031122, "grad_norm": 4.785222053527832, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8528439998626709, "num_tokens": 159968013.0, "step": 4188 }, { "epoch": 0.5328838570156469, "ewc_loss": 0.03753858432173729, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001434522564522922, "grad_norm": 4.794534206390381, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8508545756340027, "num_tokens": 160004085.0, "step": 4189 }, { "epoch": 0.5330110672942374, "ewc_loss": 0.03767155483365059, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014356125029735267, "grad_norm": 4.768792629241943, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8677873015403748, "num_tokens": 160040606.0, "step": 4190 }, { "epoch": 0.5331382775728278, "ewc_loss": 0.03770142048597336, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014385988470166922, "grad_norm": 4.777247905731201, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8682324886322021, "num_tokens": 160080792.0, "step": 4191 }, { "epoch": 0.5332654878514184, "ewc_loss": 0.0375843346118927, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014390976866707206, "grad_norm": 4.793750286102295, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8453961610794067, "num_tokens": 160117055.0, "step": 4192 }, { "epoch": 0.5333926981300089, "ewc_loss": 0.03757660835981369, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014383249799720943, "grad_norm": 4.7770161628723145, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.862471342086792, "num_tokens": 160153223.0, "step": 4193 }, { "epoch": 0.5335199084085994, "ewc_loss": 0.03757273033261299, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014379371714312583, "grad_norm": 4.845839977264404, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.83938068151474, "num_tokens": 160190920.0, "step": 4194 }, { "epoch": 0.53364711868719, "ewc_loss": 0.037706129252910614, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014390698925126344, "grad_norm": 4.824007034301758, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8555223941802979, "num_tokens": 160221898.0, "step": 4195 }, { "epoch": 0.5337743289657805, "ewc_loss": 0.03767312318086624, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014357693726196885, "grad_norm": 4.770323276519775, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8595213294029236, "num_tokens": 160267102.0, "step": 4196 }, { "epoch": 0.5339015392443709, "ewc_loss": 0.03755354881286621, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001436018937965855, "grad_norm": 4.856774806976318, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8618038296699524, "num_tokens": 160304422.0, "step": 4197 }, { "epoch": 0.5340287495229614, "ewc_loss": 0.03755566477775574, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014362305228132755, "grad_norm": 4.79689884185791, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.854306697845459, "num_tokens": 160339006.0, "step": 4198 }, { "epoch": 0.534155959801552, "ewc_loss": 0.03750230371952057, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014308944810181856, "grad_norm": 4.7497782707214355, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8774651288986206, "num_tokens": 160377504.0, "step": 4199 }, { "epoch": 0.5342831700801425, "ewc_loss": 0.03755181282758713, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001435845479136333, "grad_norm": 4.857125759124756, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8364566564559937, "num_tokens": 160413315.0, "step": 4200 }, { "epoch": 0.534410380358733, "ewc_loss": 0.037717558443546295, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014402129454538226, "grad_norm": 4.772193431854248, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8711273074150085, "num_tokens": 160455034.0, "step": 4201 }, { "epoch": 0.5345375906373235, "ewc_loss": 0.037641361355781555, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001432593126082793, "grad_norm": 4.840671062469482, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8524813652038574, "num_tokens": 160490505.0, "step": 4202 }, { "epoch": 0.534664800915914, "ewc_loss": 0.03769020736217499, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014374777674674988, "grad_norm": 4.75522518157959, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8670756816864014, "num_tokens": 160528712.0, "step": 4203 }, { "epoch": 0.5347920111945045, "ewc_loss": 0.0375560000538826, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014362638466991484, "grad_norm": 4.862348556518555, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8499943017959595, "num_tokens": 160567945.0, "step": 4204 }, { "epoch": 0.534919221473095, "ewc_loss": 0.03762049227952957, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014427131100092083, "grad_norm": 4.928509712219238, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8637782335281372, "num_tokens": 160596767.0, "step": 4205 }, { "epoch": 0.5350464317516855, "ewc_loss": 0.037587765604257584, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001439440529793501, "grad_norm": 4.7856245040893555, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8545663356781006, "num_tokens": 160636912.0, "step": 4206 }, { "epoch": 0.5351736420302761, "ewc_loss": 0.03752487525343895, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014331514830701053, "grad_norm": 4.899823188781738, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8510894775390625, "num_tokens": 160674588.0, "step": 4207 }, { "epoch": 0.5353008523088666, "ewc_loss": 0.037590231746435165, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014396871847566217, "grad_norm": 4.755279064178467, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.846869945526123, "num_tokens": 160713441.0, "step": 4208 }, { "epoch": 0.5354280625874571, "ewc_loss": 0.03762257844209671, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014307147648651153, "grad_norm": 4.8143205642700195, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8665185570716858, "num_tokens": 160752716.0, "step": 4209 }, { "epoch": 0.5355552728660475, "ewc_loss": 0.03758338838815689, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001439002953702584, "grad_norm": 4.740025997161865, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.86788010597229, "num_tokens": 160798771.0, "step": 4210 }, { "epoch": 0.5356824831446381, "ewc_loss": 0.03752744197845459, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014334084698930383, "grad_norm": 4.797338008880615, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8747584819793701, "num_tokens": 160843085.0, "step": 4211 }, { "epoch": 0.5358096934232286, "ewc_loss": 0.037567779421806335, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001437441969756037, "grad_norm": 4.817962169647217, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8494168519973755, "num_tokens": 160883073.0, "step": 4212 }, { "epoch": 0.5359369037018191, "ewc_loss": 0.03755379468202591, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014360433851834387, "grad_norm": 4.775386333465576, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8636845350265503, "num_tokens": 160923734.0, "step": 4213 }, { "epoch": 0.5360641139804097, "ewc_loss": 0.037654221057891846, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014338790788315237, "grad_norm": 4.804384708404541, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8545322418212891, "num_tokens": 160963460.0, "step": 4214 }, { "epoch": 0.5361913242590002, "ewc_loss": 0.03766157105565071, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014346140960697085, "grad_norm": 4.8056182861328125, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8559263348579407, "num_tokens": 161003691.0, "step": 4215 }, { "epoch": 0.5363185345375906, "ewc_loss": 0.03764898329973221, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014333552098833025, "grad_norm": 4.780961036682129, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8584147691726685, "num_tokens": 161042056.0, "step": 4216 }, { "epoch": 0.5364457448161811, "ewc_loss": 0.03769170492887497, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014376275066751987, "grad_norm": 4.896347999572754, "learning_rate": 1e-06, "loss": 0.3939, "mean_token_accuracy": 0.8811805248260498, "num_tokens": 161078670.0, "step": 4217 }, { "epoch": 0.5365729550947717, "ewc_loss": 0.037691060453653336, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001437563041690737, "grad_norm": 4.842111110687256, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8718413710594177, "num_tokens": 161113992.0, "step": 4218 }, { "epoch": 0.5367001653733622, "ewc_loss": 0.03750759735703468, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014314237341750413, "grad_norm": 4.772572040557861, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8636974692344666, "num_tokens": 161153763.0, "step": 4219 }, { "epoch": 0.5368273756519527, "ewc_loss": 0.03754422813653946, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014350867422763258, "grad_norm": 4.900649070739746, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8416074514389038, "num_tokens": 161184186.0, "step": 4220 }, { "epoch": 0.5369545859305432, "ewc_loss": 0.03758327290415764, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014389913121704012, "grad_norm": 4.8740057945251465, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8546187877655029, "num_tokens": 161219298.0, "step": 4221 }, { "epoch": 0.5370817962091337, "ewc_loss": 0.037478744983673096, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014285386714618653, "grad_norm": 4.817536354064941, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8730681538581848, "num_tokens": 161255252.0, "step": 4222 }, { "epoch": 0.5372090064877242, "ewc_loss": 0.03754676133394241, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014353400911204517, "grad_norm": 4.833532810211182, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8620761632919312, "num_tokens": 161295890.0, "step": 4223 }, { "epoch": 0.5373362167663147, "ewc_loss": 0.03764181584119797, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014326386735774577, "grad_norm": 4.8569207191467285, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8545040488243103, "num_tokens": 161330808.0, "step": 4224 }, { "epoch": 0.5374634270449052, "ewc_loss": 0.03752807527780533, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014334717707242817, "grad_norm": 4.7789177894592285, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8653669357299805, "num_tokens": 161370515.0, "step": 4225 }, { "epoch": 0.5375906373234958, "ewc_loss": 0.03750734403729439, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001431398413842544, "grad_norm": 4.859296798706055, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8706901669502258, "num_tokens": 161405257.0, "step": 4226 }, { "epoch": 0.5377178476020863, "ewc_loss": 0.03757362440228462, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014380265201907605, "grad_norm": 4.81137752532959, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8583977222442627, "num_tokens": 161443526.0, "step": 4227 }, { "epoch": 0.5378450578806767, "ewc_loss": 0.037652697414159775, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001433726865798235, "grad_norm": 4.855733871459961, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8510522842407227, "num_tokens": 161481262.0, "step": 4228 }, { "epoch": 0.5379722681592672, "ewc_loss": 0.03768683224916458, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014371404540725052, "grad_norm": 4.80841064453125, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8440852761268616, "num_tokens": 161517756.0, "step": 4229 }, { "epoch": 0.5380994784378578, "ewc_loss": 0.03767269477248192, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001435726589988917, "grad_norm": 4.832139492034912, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8666597008705139, "num_tokens": 161552680.0, "step": 4230 }, { "epoch": 0.5382266887164483, "ewc_loss": 0.03769238665699959, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014376957551576197, "grad_norm": 4.869150161743164, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8552793264389038, "num_tokens": 161588607.0, "step": 4231 }, { "epoch": 0.5383538989950388, "ewc_loss": 0.03771049156785011, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001439506304450333, "grad_norm": 4.854040622711182, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8400742411613464, "num_tokens": 161626427.0, "step": 4232 }, { "epoch": 0.5384811092736294, "ewc_loss": 0.03755098581314087, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014357628242578357, "grad_norm": 4.753925800323486, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8639525175094604, "num_tokens": 161665182.0, "step": 4233 }, { "epoch": 0.5386083195522198, "ewc_loss": 0.0375526137650013, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014359253691509366, "grad_norm": 4.804342746734619, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8577249050140381, "num_tokens": 161704685.0, "step": 4234 }, { "epoch": 0.5387355298308103, "ewc_loss": 0.03771393001079559, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014398501662071794, "grad_norm": 4.775317192077637, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8629224300384521, "num_tokens": 161742960.0, "step": 4235 }, { "epoch": 0.5388627401094008, "ewc_loss": 0.03769274801015854, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014377319894265383, "grad_norm": 4.797508239746094, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8574668169021606, "num_tokens": 161778199.0, "step": 4236 }, { "epoch": 0.5389899503879914, "ewc_loss": 0.03773828595876694, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014422854292206466, "grad_norm": 4.805866241455078, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.854375958442688, "num_tokens": 161811235.0, "step": 4237 }, { "epoch": 0.5391171606665819, "ewc_loss": 0.03761734440922737, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001442398497601971, "grad_norm": 4.827797889709473, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8583112955093384, "num_tokens": 161847669.0, "step": 4238 }, { "epoch": 0.5392443709451724, "ewc_loss": 0.03761594742536545, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014422586536966264, "grad_norm": 4.820291042327881, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8441764116287231, "num_tokens": 161888470.0, "step": 4239 }, { "epoch": 0.5393715812237628, "ewc_loss": 0.03762853890657425, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014435179764404893, "grad_norm": 4.878747940063477, "learning_rate": 1e-06, "loss": 0.5198, "mean_token_accuracy": 0.8383390307426453, "num_tokens": 161924563.0, "step": 4240 }, { "epoch": 0.5394987915023534, "ewc_loss": 0.03765532374382019, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014461964019574225, "grad_norm": 4.84285306930542, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8440591096878052, "num_tokens": 161959062.0, "step": 4241 }, { "epoch": 0.5396260017809439, "ewc_loss": 0.037604231387376785, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014410870790015906, "grad_norm": 4.803248405456543, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8619215488433838, "num_tokens": 161994755.0, "step": 4242 }, { "epoch": 0.5397532120595344, "ewc_loss": 0.037616461515426636, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014423100219573826, "grad_norm": 4.785545349121094, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8671392202377319, "num_tokens": 162032812.0, "step": 4243 }, { "epoch": 0.5398804223381249, "ewc_loss": 0.03765647113323212, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014463112165685743, "grad_norm": 4.829450607299805, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.861137330532074, "num_tokens": 162067130.0, "step": 4244 }, { "epoch": 0.5400076326167155, "ewc_loss": 0.037796635180711746, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014481204561889172, "grad_norm": 4.767309665679932, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.859818696975708, "num_tokens": 162107452.0, "step": 4245 }, { "epoch": 0.5401348428953059, "ewc_loss": 0.03772718086838722, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014411751180887222, "grad_norm": 4.822772979736328, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8585819005966187, "num_tokens": 162144220.0, "step": 4246 }, { "epoch": 0.5402620531738964, "ewc_loss": 0.037820473313331604, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014505044964607805, "grad_norm": 4.805227756500244, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8667486906051636, "num_tokens": 162181798.0, "step": 4247 }, { "epoch": 0.540389263452487, "ewc_loss": 0.037781972438097, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014466543507296592, "grad_norm": 4.826526165008545, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8519850969314575, "num_tokens": 162222501.0, "step": 4248 }, { "epoch": 0.5405164737310775, "ewc_loss": 0.037782832980155945, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014467403525486588, "grad_norm": 4.779776573181152, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.854202389717102, "num_tokens": 162265533.0, "step": 4249 }, { "epoch": 0.540643684009668, "ewc_loss": 0.03779596462845802, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014480535173788667, "grad_norm": 4.7819905281066895, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8740496635437012, "num_tokens": 162304984.0, "step": 4250 }, { "epoch": 0.5407708942882585, "ewc_loss": 0.03792750835418701, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014490008470602334, "grad_norm": 4.841423988342285, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8486772775650024, "num_tokens": 162342358.0, "step": 4251 }, { "epoch": 0.540898104566849, "ewc_loss": 0.03793967142701149, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014502172416541725, "grad_norm": 4.905249118804932, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8587669134140015, "num_tokens": 162377343.0, "step": 4252 }, { "epoch": 0.5410253148454395, "ewc_loss": 0.03794941306114197, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001451191201340407, "grad_norm": 4.863510608673096, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8586054444313049, "num_tokens": 162416416.0, "step": 4253 }, { "epoch": 0.54115252512403, "ewc_loss": 0.037881456315517426, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014443957479670644, "grad_norm": 8.868457794189453, "learning_rate": 1e-06, "loss": 0.3763, "mean_token_accuracy": 0.88413405418396, "num_tokens": 162447108.0, "step": 4254 }, { "epoch": 0.5412797354026205, "ewc_loss": 0.042080096900463104, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00018642596842255443, "grad_norm": 5.565081596374512, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.860084056854248, "num_tokens": 162489659.0, "step": 4255 }, { "epoch": 0.5414069456812111, "ewc_loss": 0.03692547604441643, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00013487976684700698, "grad_norm": 4.465850353240967, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.848576545715332, "num_tokens": 162525816.0, "step": 4256 }, { "epoch": 0.5415341559598016, "ewc_loss": 0.0385109968483448, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00015073498070705682, "grad_norm": 5.219140529632568, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8542091846466064, "num_tokens": 162564154.0, "step": 4257 }, { "epoch": 0.5416613662383921, "ewc_loss": 0.038557667285203934, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00015120167518034577, "grad_norm": 4.75931453704834, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8514514565467834, "num_tokens": 162602728.0, "step": 4258 }, { "epoch": 0.5417885765169825, "ewc_loss": 0.0378548763692379, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014539447147399187, "grad_norm": 4.959789752960205, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8621755838394165, "num_tokens": 162636659.0, "step": 4259 }, { "epoch": 0.5419157867955731, "ewc_loss": 0.03837433457374573, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014936833758838475, "grad_norm": 4.85373067855835, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8748370409011841, "num_tokens": 162676639.0, "step": 4260 }, { "epoch": 0.5420429970741636, "ewc_loss": 0.037899844348430634, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001458441693102941, "grad_norm": 4.899358749389648, "learning_rate": 1e-06, "loss": 0.5178, "mean_token_accuracy": 0.8406644463539124, "num_tokens": 162716763.0, "step": 4261 }, { "epoch": 0.5421702073527541, "ewc_loss": 0.03807135298848152, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014755924348719418, "grad_norm": 4.911975383758545, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8482738733291626, "num_tokens": 162753943.0, "step": 4262 }, { "epoch": 0.5422974176313446, "ewc_loss": 0.03806178271770477, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014624284813180566, "grad_norm": 4.902732849121094, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.853081464767456, "num_tokens": 162792938.0, "step": 4263 }, { "epoch": 0.5424246279099352, "ewc_loss": 0.03791830316185951, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001460287458030507, "grad_norm": 4.807133674621582, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8555917143821716, "num_tokens": 162838938.0, "step": 4264 }, { "epoch": 0.5425518381885256, "ewc_loss": 0.037767134606838226, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014573775115422904, "grad_norm": 4.912024974822998, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8643218278884888, "num_tokens": 162875026.0, "step": 4265 }, { "epoch": 0.5426790484671161, "ewc_loss": 0.03780736029148102, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.00014614002429880202, "grad_norm": 4.930081367492676, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8611690998077393, "num_tokens": 162910724.0, "step": 4266 }, { "epoch": 0.5428062587457066, "ewc_loss": 0.03769475966691971, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.000145013997098431, "grad_norm": 4.945616722106934, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8605327010154724, "num_tokens": 162941602.0, "step": 4267 }, { "epoch": 0.5429334690242972, "ewc_loss": 0.03773118555545807, "ewc_loss_diag": 2.3245811462402344e-05, "ewc_loss_parallel": 0.0001453782751923427, "grad_norm": 4.911875247955322, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8456593751907349, "num_tokens": 162980760.0, "step": 4268 }, { "epoch": 0.5430606793028877, "ewc_loss": 0.03790547698736191, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001446797832613811, "grad_norm": 4.889961242675781, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8688223361968994, "num_tokens": 163016155.0, "step": 4269 }, { "epoch": 0.5431878895814782, "ewc_loss": 0.03775736689567566, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014441937673836946, "grad_norm": 4.925795555114746, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8502990007400513, "num_tokens": 163052519.0, "step": 4270 }, { "epoch": 0.5433150998600687, "ewc_loss": 0.03776007518172264, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014444645785260946, "grad_norm": 4.825988292694092, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8674840927124023, "num_tokens": 163091806.0, "step": 4271 }, { "epoch": 0.5434423101386592, "ewc_loss": 0.03770575672388077, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014390326396096498, "grad_norm": 4.8272552490234375, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8739652037620544, "num_tokens": 163128830.0, "step": 4272 }, { "epoch": 0.5435695204172497, "ewc_loss": 0.0377698615193367, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014454431948252022, "grad_norm": 4.850965976715088, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8534512519836426, "num_tokens": 163169567.0, "step": 4273 }, { "epoch": 0.5436967306958402, "ewc_loss": 0.03778613731265068, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001447070826543495, "grad_norm": 4.950628757476807, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8511114120483398, "num_tokens": 163202913.0, "step": 4274 }, { "epoch": 0.5438239409744308, "ewc_loss": 0.03780275583267212, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014487326552625746, "grad_norm": 4.8850908279418945, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8657690286636353, "num_tokens": 163236741.0, "step": 4275 }, { "epoch": 0.5439511512530213, "ewc_loss": 0.037743400782346725, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001442797074560076, "grad_norm": 4.811025619506836, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8529266119003296, "num_tokens": 163281402.0, "step": 4276 }, { "epoch": 0.5440783615316117, "ewc_loss": 0.037777286022901535, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014461856335401535, "grad_norm": 4.856110572814941, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8676687479019165, "num_tokens": 163325288.0, "step": 4277 }, { "epoch": 0.5442055718102022, "ewc_loss": 0.037775471806526184, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014460041711572558, "grad_norm": 4.879305362701416, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8483341336250305, "num_tokens": 163367471.0, "step": 4278 }, { "epoch": 0.5443327820887928, "ewc_loss": 0.037780407816171646, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014464979176409543, "grad_norm": 4.915008068084717, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8591102361679077, "num_tokens": 163400077.0, "step": 4279 }, { "epoch": 0.5444599923673833, "ewc_loss": 0.03774881362915039, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014433385513257235, "grad_norm": 4.871799468994141, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8611623048782349, "num_tokens": 163433739.0, "step": 4280 }, { "epoch": 0.5445872026459738, "ewc_loss": 0.037719037383794785, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014403606473933905, "grad_norm": 4.875784873962402, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8587347865104675, "num_tokens": 163467447.0, "step": 4281 }, { "epoch": 0.5447144129245644, "ewc_loss": 0.03777772933244705, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014462300168816, "grad_norm": 4.882082939147949, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8548309803009033, "num_tokens": 163506633.0, "step": 4282 }, { "epoch": 0.5448416232031548, "ewc_loss": 0.037717852741479874, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014402421948034316, "grad_norm": 4.863644599914551, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8429727554321289, "num_tokens": 163542773.0, "step": 4283 }, { "epoch": 0.5449688334817453, "ewc_loss": 0.0377538725733757, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014438443758990616, "grad_norm": 4.838399887084961, "learning_rate": 1e-06, "loss": 0.5627, "mean_token_accuracy": 0.8255538940429688, "num_tokens": 163585247.0, "step": 4284 }, { "epoch": 0.5450960437603358, "ewc_loss": 0.03774967044591904, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014434241165872663, "grad_norm": 4.856611728668213, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8466628789901733, "num_tokens": 163622802.0, "step": 4285 }, { "epoch": 0.5452232540389264, "ewc_loss": 0.03776180371642113, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014446374552790076, "grad_norm": 4.821518421173096, "learning_rate": 1e-06, "loss": 0.3772, "mean_token_accuracy": 0.8817806243896484, "num_tokens": 163655980.0, "step": 4286 }, { "epoch": 0.5453504643175169, "ewc_loss": 0.03774959594011307, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014434165495913476, "grad_norm": 4.804616451263428, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8636146783828735, "num_tokens": 163691494.0, "step": 4287 }, { "epoch": 0.5454776745961074, "ewc_loss": 0.037763502448797226, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014448072761297226, "grad_norm": 4.883764266967773, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8700686097145081, "num_tokens": 163725169.0, "step": 4288 }, { "epoch": 0.5456048848746978, "ewc_loss": 0.03781341761350632, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014497985830530524, "grad_norm": 4.780093193054199, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8618425130844116, "num_tokens": 163766050.0, "step": 4289 }, { "epoch": 0.5457320951532884, "ewc_loss": 0.037799712270498276, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001448428229195997, "grad_norm": 4.925709247589111, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8523560762405396, "num_tokens": 163799559.0, "step": 4290 }, { "epoch": 0.5458593054318789, "ewc_loss": 0.0379265695810318, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001461113861296326, "grad_norm": 4.876254558563232, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8519258499145508, "num_tokens": 163837052.0, "step": 4291 }, { "epoch": 0.5459865157104694, "ewc_loss": 0.037929780781269073, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014492282934952527, "grad_norm": 4.846974849700928, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8548847436904907, "num_tokens": 163872494.0, "step": 4292 }, { "epoch": 0.5461137259890599, "ewc_loss": 0.03795871138572693, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014521213597618043, "grad_norm": 4.872910499572754, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8499679565429688, "num_tokens": 163913499.0, "step": 4293 }, { "epoch": 0.5462409362676505, "ewc_loss": 0.03797931596636772, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014541814744006842, "grad_norm": 4.805333137512207, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8392106890678406, "num_tokens": 163957965.0, "step": 4294 }, { "epoch": 0.5463681465462409, "ewc_loss": 0.03796542063355446, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014527920575346798, "grad_norm": 4.848544597625732, "learning_rate": 1e-06, "loss": 0.5899, "mean_token_accuracy": 0.8182314038276672, "num_tokens": 164005661.0, "step": 4295 }, { "epoch": 0.5464953568248314, "ewc_loss": 0.037894442677497864, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001457901526009664, "grad_norm": 4.866607666015625, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8742367029190063, "num_tokens": 164044717.0, "step": 4296 }, { "epoch": 0.5466225671034219, "ewc_loss": 0.037979982793331146, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014542484132107347, "grad_norm": 4.974095821380615, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8581920862197876, "num_tokens": 164080022.0, "step": 4297 }, { "epoch": 0.5467497773820125, "ewc_loss": 0.037883300334215164, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014567871403414756, "grad_norm": 4.908206939697266, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8640516996383667, "num_tokens": 164112749.0, "step": 4298 }, { "epoch": 0.546876987660603, "ewc_loss": 0.03776971995830536, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014454290794674307, "grad_norm": 4.918274879455566, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.853472113609314, "num_tokens": 164152517.0, "step": 4299 }, { "epoch": 0.5470041979391935, "ewc_loss": 0.037830691784620285, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014515263319481164, "grad_norm": 4.838146686553955, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8647842407226562, "num_tokens": 164195245.0, "step": 4300 }, { "epoch": 0.5471314082177839, "ewc_loss": 0.03776884824037552, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014453417679760605, "grad_norm": 4.872016429901123, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8526525497436523, "num_tokens": 164230643.0, "step": 4301 }, { "epoch": 0.5472586184963745, "ewc_loss": 0.03781299293041229, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001449756236979738, "grad_norm": 4.821394920349121, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8539426922798157, "num_tokens": 164271089.0, "step": 4302 }, { "epoch": 0.547385828774965, "ewc_loss": 0.03792266175150871, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014485162682831287, "grad_norm": 4.8971076011657715, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8569953441619873, "num_tokens": 164310618.0, "step": 4303 }, { "epoch": 0.5475130390535555, "ewc_loss": 0.03784908354282379, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014533655485138297, "grad_norm": 4.865179538726807, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8454480171203613, "num_tokens": 164348630.0, "step": 4304 }, { "epoch": 0.5476402493321461, "ewc_loss": 0.03777971863746643, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001446429087081924, "grad_norm": 4.880457878112793, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8647473454475403, "num_tokens": 164383735.0, "step": 4305 }, { "epoch": 0.5477674596107366, "ewc_loss": 0.03782224655151367, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014506818843074143, "grad_norm": 4.792205333709717, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8583285212516785, "num_tokens": 164426115.0, "step": 4306 }, { "epoch": 0.5478946698893271, "ewc_loss": 0.03793242201209068, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014494922652374953, "grad_norm": 4.924535751342773, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8505975008010864, "num_tokens": 164462119.0, "step": 4307 }, { "epoch": 0.5480218801679175, "ewc_loss": 0.03789333999156952, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014577909314539284, "grad_norm": 4.859611988067627, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8558601140975952, "num_tokens": 164503030.0, "step": 4308 }, { "epoch": 0.5481490904465081, "ewc_loss": 0.03781154006719589, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001449611154384911, "grad_norm": 4.88559103012085, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8599474430084229, "num_tokens": 164541091.0, "step": 4309 }, { "epoch": 0.5482763007250986, "ewc_loss": 0.0379667803645134, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014529279724229127, "grad_norm": 4.855574607849121, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8515095710754395, "num_tokens": 164577678.0, "step": 4310 }, { "epoch": 0.5484035110036891, "ewc_loss": 0.03793436288833618, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014496860967483371, "grad_norm": 4.862796783447266, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8550333380699158, "num_tokens": 164612779.0, "step": 4311 }, { "epoch": 0.5485307212822796, "ewc_loss": 0.03782549127936363, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014510062464978546, "grad_norm": 4.860805511474609, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.862831175327301, "num_tokens": 164649849.0, "step": 4312 }, { "epoch": 0.5486579315608702, "ewc_loss": 0.03811653330922127, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001455696183256805, "grad_norm": 4.873824119567871, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8548192381858826, "num_tokens": 164690538.0, "step": 4313 }, { "epoch": 0.5487851418394606, "ewc_loss": 0.03784184157848358, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014526411541737616, "grad_norm": 4.812870502471924, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.858659029006958, "num_tokens": 164729338.0, "step": 4314 }, { "epoch": 0.5489123521180511, "ewc_loss": 0.03798458352684975, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014547083992511034, "grad_norm": 4.9085307121276855, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8692507147789001, "num_tokens": 164766648.0, "step": 4315 }, { "epoch": 0.5490395623966416, "ewc_loss": 0.03788125514984131, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001456582685932517, "grad_norm": 4.8528852462768555, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8571887016296387, "num_tokens": 164804254.0, "step": 4316 }, { "epoch": 0.5491667726752322, "ewc_loss": 0.03797570988535881, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014538210234604776, "grad_norm": 4.847177028656006, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8711121082305908, "num_tokens": 164839943.0, "step": 4317 }, { "epoch": 0.5492939829538227, "ewc_loss": 0.03797837719321251, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014540876145474613, "grad_norm": 4.894242286682129, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.850970983505249, "num_tokens": 164887057.0, "step": 4318 }, { "epoch": 0.5494211932324132, "ewc_loss": 0.03798063099384308, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.00014543128781951964, "grad_norm": 4.876917839050293, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.857626736164093, "num_tokens": 164928515.0, "step": 4319 }, { "epoch": 0.5495484035110036, "ewc_loss": 0.037953391671180725, "ewc_loss_diag": 2.3484230041503906e-05, "ewc_loss_parallel": 0.0001451589196221903, "grad_norm": 4.862734317779541, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8570959568023682, "num_tokens": 164963818.0, "step": 4320 }, { "epoch": 0.5496756137895942, "ewc_loss": 0.0380837619304657, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014524193829856813, "grad_norm": 4.859584808349609, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8718053102493286, "num_tokens": 165001091.0, "step": 4321 }, { "epoch": 0.5498028240681847, "ewc_loss": 0.038078054785728455, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001451848220312968, "grad_norm": 4.874178886413574, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8408864736557007, "num_tokens": 165046009.0, "step": 4322 }, { "epoch": 0.5499300343467752, "ewc_loss": 0.038127701729536057, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014568131882697344, "grad_norm": 4.885893821716309, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8551273345947266, "num_tokens": 165090729.0, "step": 4323 }, { "epoch": 0.5500572446253658, "ewc_loss": 0.038123637437820435, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014564068987965584, "grad_norm": 4.845225811004639, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8757659196853638, "num_tokens": 165129771.0, "step": 4324 }, { "epoch": 0.5501844549039563, "ewc_loss": 0.03787481039762497, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014559380360879004, "grad_norm": 4.909811496734619, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8725321292877197, "num_tokens": 165164310.0, "step": 4325 }, { "epoch": 0.5503116651825467, "ewc_loss": 0.0381350964307785, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014575524255633354, "grad_norm": 4.87363862991333, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8555577397346497, "num_tokens": 165202842.0, "step": 4326 }, { "epoch": 0.5504388754611372, "ewc_loss": 0.03786161541938782, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001454618468414992, "grad_norm": 4.942422866821289, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8630438446998596, "num_tokens": 165235998.0, "step": 4327 }, { "epoch": 0.5505660857397278, "ewc_loss": 0.03790820762515068, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001459277846151963, "grad_norm": 4.8522844314575195, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8597536683082581, "num_tokens": 165271779.0, "step": 4328 }, { "epoch": 0.5506932960183183, "ewc_loss": 0.03783489018678665, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014519460091833025, "grad_norm": 4.87410831451416, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8520619869232178, "num_tokens": 165306136.0, "step": 4329 }, { "epoch": 0.5508205062969088, "ewc_loss": 0.03790471330285072, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014589284546673298, "grad_norm": 4.865996837615967, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8725773096084595, "num_tokens": 165342401.0, "step": 4330 }, { "epoch": 0.5509477165754993, "ewc_loss": 0.03786778450012207, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014552353241015226, "grad_norm": 4.812932968139648, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8603717088699341, "num_tokens": 165380120.0, "step": 4331 }, { "epoch": 0.5510749268540898, "ewc_loss": 0.03787907958030701, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014563648437615484, "grad_norm": 4.866876125335693, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8450631499290466, "num_tokens": 165418210.0, "step": 4332 }, { "epoch": 0.5512021371326803, "ewc_loss": 0.03793411701917648, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.00014618688146583736, "grad_norm": 4.822134494781494, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8383468985557556, "num_tokens": 165460952.0, "step": 4333 }, { "epoch": 0.5513293474112708, "ewc_loss": 0.037930965423583984, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001461553474655375, "grad_norm": 4.834823131561279, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8706603050231934, "num_tokens": 165500370.0, "step": 4334 }, { "epoch": 0.5514565576898613, "ewc_loss": 0.03793064132332802, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001461521169403568, "grad_norm": 4.8870673179626465, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8578227758407593, "num_tokens": 165534089.0, "step": 4335 }, { "epoch": 0.5515837679684519, "ewc_loss": 0.03798912465572357, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001467369293095544, "grad_norm": 4.860739231109619, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.851338803768158, "num_tokens": 165573111.0, "step": 4336 }, { "epoch": 0.5517109782470424, "ewc_loss": 0.03818061947822571, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014621048467233777, "grad_norm": 5.00808572769165, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8477993607521057, "num_tokens": 165611942.0, "step": 4337 }, { "epoch": 0.5518381885256328, "ewc_loss": 0.03825446963310242, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001469489943701774, "grad_norm": 4.874910831451416, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8655740022659302, "num_tokens": 165651703.0, "step": 4338 }, { "epoch": 0.5519653988042234, "ewc_loss": 0.03811182081699371, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014552248467225581, "grad_norm": 4.82806921005249, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8555592894554138, "num_tokens": 165691257.0, "step": 4339 }, { "epoch": 0.5520926090828139, "ewc_loss": 0.037874236702919006, "ewc_loss_diag": 2.3365020751953125e-05, "ewc_loss_parallel": 0.0001455880847061053, "grad_norm": 4.834413051605225, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.8739321231842041, "num_tokens": 165727947.0, "step": 4340 }, { "epoch": 0.5522198193614044, "ewc_loss": 0.038184113800525665, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014624545292463154, "grad_norm": 4.85515832901001, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8665637969970703, "num_tokens": 165763400.0, "step": 4341 }, { "epoch": 0.5523470296399949, "ewc_loss": 0.038164641708135605, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001460507046431303, "grad_norm": 4.79025411605835, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8593016266822815, "num_tokens": 165809586.0, "step": 4342 }, { "epoch": 0.5524742399185855, "ewc_loss": 0.038166120648384094, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014606550394091755, "grad_norm": 4.923439979553223, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8605309724807739, "num_tokens": 165839565.0, "step": 4343 }, { "epoch": 0.5526014501971759, "ewc_loss": 0.038220714777708054, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001466114481445402, "grad_norm": 4.86007022857666, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8618394732475281, "num_tokens": 165873539.0, "step": 4344 }, { "epoch": 0.5527286604757664, "ewc_loss": 0.038187213242053986, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001462764194002375, "grad_norm": 4.815494060516357, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8666805028915405, "num_tokens": 165920987.0, "step": 4345 }, { "epoch": 0.5528558707543569, "ewc_loss": 0.03820006549358368, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001464049710193649, "grad_norm": 4.842047691345215, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8483903408050537, "num_tokens": 165965661.0, "step": 4346 }, { "epoch": 0.5529830810329475, "ewc_loss": 0.038227710872888565, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014668139920104295, "grad_norm": 4.913003921508789, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8520283699035645, "num_tokens": 165998239.0, "step": 4347 }, { "epoch": 0.553110291311538, "ewc_loss": 0.03820499777793884, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014645425835624337, "grad_norm": 4.849627494812012, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8557753562927246, "num_tokens": 166036489.0, "step": 4348 }, { "epoch": 0.5532375015901285, "ewc_loss": 0.038201868534088135, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001464229862904176, "grad_norm": 4.827001571655273, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8545613288879395, "num_tokens": 166078774.0, "step": 4349 }, { "epoch": 0.5533647118687189, "ewc_loss": 0.03817696124315262, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014617391570936888, "grad_norm": 4.877076148986816, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.865300178527832, "num_tokens": 166119555.0, "step": 4350 }, { "epoch": 0.5534919221473095, "ewc_loss": 0.03822437301278114, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001466480316594243, "grad_norm": 4.819378852844238, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8573029637336731, "num_tokens": 166159788.0, "step": 4351 }, { "epoch": 0.5536191324259, "ewc_loss": 0.038188040256500244, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001462846848880872, "grad_norm": 4.83588981628418, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8535475730895996, "num_tokens": 166200230.0, "step": 4352 }, { "epoch": 0.5537463427044905, "ewc_loss": 0.038241367787122726, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001468179834773764, "grad_norm": 4.9294538497924805, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8631399869918823, "num_tokens": 166237864.0, "step": 4353 }, { "epoch": 0.553873552983081, "ewc_loss": 0.0382457971572876, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014686226495541632, "grad_norm": 5.313510417938232, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8614044189453125, "num_tokens": 166278347.0, "step": 4354 }, { "epoch": 0.5540007632616716, "ewc_loss": 0.03831912577152252, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014759555051568896, "grad_norm": 4.838008403778076, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8489773869514465, "num_tokens": 166315744.0, "step": 4355 }, { "epoch": 0.554127973540262, "ewc_loss": 0.037970103323459625, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001441053464077413, "grad_norm": 4.9070210456848145, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8592460751533508, "num_tokens": 166352705.0, "step": 4356 }, { "epoch": 0.5542551838188525, "ewc_loss": 0.03821573778986931, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014656166604254395, "grad_norm": 4.806313991546631, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8633534908294678, "num_tokens": 166393956.0, "step": 4357 }, { "epoch": 0.554382394097443, "ewc_loss": 0.03807281702756882, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014513246424030513, "grad_norm": 4.915229797363281, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8354633450508118, "num_tokens": 166430172.0, "step": 4358 }, { "epoch": 0.5545096043760336, "ewc_loss": 0.03820498287677765, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014645414194092155, "grad_norm": 4.842397212982178, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8585835695266724, "num_tokens": 166468732.0, "step": 4359 }, { "epoch": 0.5546368146546241, "ewc_loss": 0.03811659663915634, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014557024405803531, "grad_norm": 4.8133063316345215, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8536800146102905, "num_tokens": 166511383.0, "step": 4360 }, { "epoch": 0.5547640249332146, "ewc_loss": 0.038224443793296814, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014664873015135527, "grad_norm": 4.876194000244141, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8625049591064453, "num_tokens": 166547674.0, "step": 4361 }, { "epoch": 0.5548912352118052, "ewc_loss": 0.038202520459890366, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014642950554843992, "grad_norm": 4.865484237670898, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.851123571395874, "num_tokens": 166581763.0, "step": 4362 }, { "epoch": 0.5550184454903956, "ewc_loss": 0.038249172270298004, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014689599629491568, "grad_norm": 4.975170135498047, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8548346757888794, "num_tokens": 166618604.0, "step": 4363 }, { "epoch": 0.5551456557689861, "ewc_loss": 0.03827134147286415, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014711770927533507, "grad_norm": 4.8329176902771, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8674397468566895, "num_tokens": 166654812.0, "step": 4364 }, { "epoch": 0.5552728660475766, "ewc_loss": 0.0381661020219326, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014606531476601958, "grad_norm": 4.947791576385498, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8457676768302917, "num_tokens": 166693206.0, "step": 4365 }, { "epoch": 0.5554000763261672, "ewc_loss": 0.03829796239733696, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.00014738392201252282, "grad_norm": 4.853014945983887, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8563424348831177, "num_tokens": 166734531.0, "step": 4366 }, { "epoch": 0.5555272866047577, "ewc_loss": 0.0382002592086792, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001464068773202598, "grad_norm": 4.903721809387207, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.85572749376297, "num_tokens": 166771362.0, "step": 4367 }, { "epoch": 0.5556544968833482, "ewc_loss": 0.038511015474796295, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00014707306399941444, "grad_norm": 4.960705280303955, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.849076509475708, "num_tokens": 166802784.0, "step": 4368 }, { "epoch": 0.5557817071619386, "ewc_loss": 0.03828200325369835, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001472243166062981, "grad_norm": 4.937286376953125, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8539355993270874, "num_tokens": 166835590.0, "step": 4369 }, { "epoch": 0.5559089174405292, "ewc_loss": 0.03851357847452164, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00014709867537021637, "grad_norm": 4.913819313049316, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8533605337142944, "num_tokens": 166872145.0, "step": 4370 }, { "epoch": 0.5560361277191197, "ewc_loss": 0.0382700189948082, "ewc_loss_diag": 2.3603439331054688e-05, "ewc_loss_parallel": 0.0001471044815843925, "grad_norm": 4.862455368041992, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8537602424621582, "num_tokens": 166912855.0, "step": 4371 }, { "epoch": 0.5561633379977102, "ewc_loss": 0.03850511461496353, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00014701405598316342, "grad_norm": 4.878048419952393, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.842465877532959, "num_tokens": 166949482.0, "step": 4372 }, { "epoch": 0.5562905482763008, "ewc_loss": 0.038534000515937805, "ewc_loss_diag": 2.384185791015625e-05, "ewc_loss_parallel": 0.00014730289694853127, "grad_norm": 4.896154403686523, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8582383394241333, "num_tokens": 166986727.0, "step": 4373 }, { "epoch": 0.5564177585548913, "ewc_loss": 0.038667045533657074, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014741263294126838, "grad_norm": 4.852113723754883, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.849809467792511, "num_tokens": 167025869.0, "step": 4374 }, { "epoch": 0.5565449688334817, "ewc_loss": 0.038668662309646606, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014742881467100233, "grad_norm": 4.89434814453125, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.867919921875, "num_tokens": 167059825.0, "step": 4375 }, { "epoch": 0.5566721791120722, "ewc_loss": 0.03869854658842087, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014772765280213207, "grad_norm": 4.892895221710205, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8630040884017944, "num_tokens": 167097375.0, "step": 4376 }, { "epoch": 0.5567993893906628, "ewc_loss": 0.03867437317967415, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014748591638635844, "grad_norm": 4.85764741897583, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8487090468406677, "num_tokens": 167136555.0, "step": 4377 }, { "epoch": 0.5569265996692533, "ewc_loss": 0.03866906464099884, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014743284555152059, "grad_norm": 4.81313943862915, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8550055027008057, "num_tokens": 167179870.0, "step": 4378 }, { "epoch": 0.5570538099478438, "ewc_loss": 0.038676753640174866, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014750972331967205, "grad_norm": 4.927981853485107, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8565858602523804, "num_tokens": 167215242.0, "step": 4379 }, { "epoch": 0.5571810202264343, "ewc_loss": 0.038695622235536575, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014769840345252305, "grad_norm": 4.8775787353515625, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8649889230728149, "num_tokens": 167250690.0, "step": 4380 }, { "epoch": 0.5573082305050248, "ewc_loss": 0.03867437690496445, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014748597459401935, "grad_norm": 4.926598072052002, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8444640636444092, "num_tokens": 167283677.0, "step": 4381 }, { "epoch": 0.5574354407836153, "ewc_loss": 0.03868916630744934, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014763386570848525, "grad_norm": 4.806347370147705, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8574936389923096, "num_tokens": 167321355.0, "step": 4382 }, { "epoch": 0.5575626510622058, "ewc_loss": 0.038678690791130066, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014752910647075623, "grad_norm": 4.877411842346191, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8506119251251221, "num_tokens": 167363103.0, "step": 4383 }, { "epoch": 0.5576898613407963, "ewc_loss": 0.038710977882146835, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.000147851969813928, "grad_norm": 4.8516716957092285, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8706392645835876, "num_tokens": 167399732.0, "step": 4384 }, { "epoch": 0.5578170716193869, "ewc_loss": 0.03868604451417923, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001476026518503204, "grad_norm": 4.923428535461426, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8567979335784912, "num_tokens": 167434938.0, "step": 4385 }, { "epoch": 0.5579442818979774, "ewc_loss": 0.038725730031728745, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014799948257859796, "grad_norm": 4.882199287414551, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8605152368545532, "num_tokens": 167472293.0, "step": 4386 }, { "epoch": 0.5580714921765678, "ewc_loss": 0.03867894038558006, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014753158029634506, "grad_norm": 4.822286605834961, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8621124625205994, "num_tokens": 167511015.0, "step": 4387 }, { "epoch": 0.5581987024551583, "ewc_loss": 0.038666777312755585, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014740995538886636, "grad_norm": 4.869424819946289, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8577278852462769, "num_tokens": 167549777.0, "step": 4388 }, { "epoch": 0.5583259127337489, "ewc_loss": 0.03844928368926048, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00014767643006052822, "grad_norm": 4.838375568389893, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8637212514877319, "num_tokens": 167592342.0, "step": 4389 }, { "epoch": 0.5584531230123394, "ewc_loss": 0.038664933294057846, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014739151811227202, "grad_norm": 4.879661560058594, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8612644672393799, "num_tokens": 167630829.0, "step": 4390 }, { "epoch": 0.5585803332909299, "ewc_loss": 0.038487207144498825, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.0001480556675232947, "grad_norm": 4.881041526794434, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8459025621414185, "num_tokens": 167675938.0, "step": 4391 }, { "epoch": 0.5587075435695205, "ewc_loss": 0.03843923285603523, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00014757593453396112, "grad_norm": 4.901080131530762, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8626603484153748, "num_tokens": 167716550.0, "step": 4392 }, { "epoch": 0.5588347538481109, "ewc_loss": 0.038440316915512085, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00014758676115889102, "grad_norm": 4.922133922576904, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.844592273235321, "num_tokens": 167754401.0, "step": 4393 }, { "epoch": 0.5589619641267014, "ewc_loss": 0.038662947714328766, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014737166929990053, "grad_norm": 4.870718955993652, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8620739579200745, "num_tokens": 167792124.0, "step": 4394 }, { "epoch": 0.5590891744052919, "ewc_loss": 0.03844280540943146, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00014761164493393153, "grad_norm": 4.890503883361816, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8765623569488525, "num_tokens": 167829226.0, "step": 4395 }, { "epoch": 0.5592163846838825, "ewc_loss": 0.03865428268909454, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014728499809280038, "grad_norm": 4.925933837890625, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8687630891799927, "num_tokens": 167863439.0, "step": 4396 }, { "epoch": 0.559343594962473, "ewc_loss": 0.038642384111881256, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001471660507377237, "grad_norm": 4.840940952301025, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.858961284160614, "num_tokens": 167907019.0, "step": 4397 }, { "epoch": 0.5594708052410635, "ewc_loss": 0.03861597552895546, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014690194802824408, "grad_norm": 4.9055280685424805, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.850801944732666, "num_tokens": 167942539.0, "step": 4398 }, { "epoch": 0.5595980155196539, "ewc_loss": 0.03864440321922302, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014718621969223022, "grad_norm": 4.868756294250488, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8578110933303833, "num_tokens": 167982848.0, "step": 4399 }, { "epoch": 0.5597252257982445, "ewc_loss": 0.03865974768996239, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014733966963831335, "grad_norm": 4.9452738761901855, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8667293190956116, "num_tokens": 168017500.0, "step": 4400 }, { "epoch": 0.559852436076835, "ewc_loss": 0.03869815170764923, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001477237237850204, "grad_norm": 4.886845111846924, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.842732846736908, "num_tokens": 168052790.0, "step": 4401 }, { "epoch": 0.5599796463554255, "ewc_loss": 0.03863140195608139, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014705622743349522, "grad_norm": 4.992982864379883, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8655421733856201, "num_tokens": 168082541.0, "step": 4402 }, { "epoch": 0.560106856634016, "ewc_loss": 0.038730841130018234, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014805060345679522, "grad_norm": 4.859502792358398, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8637548685073853, "num_tokens": 168128218.0, "step": 4403 }, { "epoch": 0.5602340669126066, "ewc_loss": 0.03831871598958969, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.000146370759466663, "grad_norm": 4.895216464996338, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8599845170974731, "num_tokens": 168165496.0, "step": 4404 }, { "epoch": 0.560361277191197, "ewc_loss": 0.038470134139060974, "ewc_loss_diag": 2.372264862060547e-05, "ewc_loss_parallel": 0.00014788491535000503, "grad_norm": 4.957357883453369, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8455729484558105, "num_tokens": 168204675.0, "step": 4405 }, { "epoch": 0.5604884874697875, "ewc_loss": 0.03869619220495224, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014770409325137734, "grad_norm": 4.901304244995117, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8528115749359131, "num_tokens": 168244832.0, "step": 4406 }, { "epoch": 0.560615697748378, "ewc_loss": 0.03863245248794556, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014706670481245965, "grad_norm": 4.913203239440918, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8535650968551636, "num_tokens": 168282440.0, "step": 4407 }, { "epoch": 0.5607429080269686, "ewc_loss": 0.03871318697929382, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014787404506932944, "grad_norm": 4.928069591522217, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8629957437515259, "num_tokens": 168325303.0, "step": 4408 }, { "epoch": 0.5608701183055591, "ewc_loss": 0.03865569829940796, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014729915710631758, "grad_norm": 4.961533546447754, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8646645545959473, "num_tokens": 168360628.0, "step": 4409 }, { "epoch": 0.5609973285841496, "ewc_loss": 0.03868478536605835, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001475900353398174, "grad_norm": 4.923798561096191, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8457574248313904, "num_tokens": 168398601.0, "step": 4410 }, { "epoch": 0.5611245388627402, "ewc_loss": 0.03864389285445213, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014718111196998507, "grad_norm": 4.917394161224365, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8439826965332031, "num_tokens": 168436893.0, "step": 4411 }, { "epoch": 0.5612517491413306, "ewc_loss": 0.03864826261997223, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014722482592333108, "grad_norm": 4.869184494018555, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8531546592712402, "num_tokens": 168478794.0, "step": 4412 }, { "epoch": 0.5613789594199211, "ewc_loss": 0.03865228593349457, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014726503286510706, "grad_norm": 4.943758964538574, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8559394478797913, "num_tokens": 168520184.0, "step": 4413 }, { "epoch": 0.5615061696985116, "ewc_loss": 0.038796328008174896, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014748475223314017, "grad_norm": 4.905118942260742, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8534170389175415, "num_tokens": 168554305.0, "step": 4414 }, { "epoch": 0.5616333799771022, "ewc_loss": 0.03878118470311165, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014733332500327379, "grad_norm": 4.872563362121582, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8422124981880188, "num_tokens": 168597752.0, "step": 4415 }, { "epoch": 0.5617605902556927, "ewc_loss": 0.03868613764643669, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014760355406906456, "grad_norm": 4.9390387535095215, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8637421131134033, "num_tokens": 168637764.0, "step": 4416 }, { "epoch": 0.5618878005342832, "ewc_loss": 0.03870581090450287, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001478003105148673, "grad_norm": 4.96867561340332, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8378312587738037, "num_tokens": 168672260.0, "step": 4417 }, { "epoch": 0.5620150108128736, "ewc_loss": 0.03867724537849426, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014751465641893446, "grad_norm": 4.843392372131348, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8662562370300293, "num_tokens": 168711478.0, "step": 4418 }, { "epoch": 0.5621422210914642, "ewc_loss": 0.03866790980100632, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014742129133082926, "grad_norm": 4.911942481994629, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8613684773445129, "num_tokens": 168751773.0, "step": 4419 }, { "epoch": 0.5622694313700547, "ewc_loss": 0.03870680555701256, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014781023492105305, "grad_norm": 4.9046502113342285, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8687291741371155, "num_tokens": 168786364.0, "step": 4420 }, { "epoch": 0.5623966416486452, "ewc_loss": 0.038710881024599075, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014785100938752294, "grad_norm": 4.954206943511963, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8550305366516113, "num_tokens": 168826819.0, "step": 4421 }, { "epoch": 0.5625238519272358, "ewc_loss": 0.038846418261528015, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014798564370721579, "grad_norm": 4.868085861206055, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8558979034423828, "num_tokens": 168869405.0, "step": 4422 }, { "epoch": 0.5626510622058263, "ewc_loss": 0.03879052773118019, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014742676285095513, "grad_norm": 4.975735187530518, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8632320165634155, "num_tokens": 168904323.0, "step": 4423 }, { "epoch": 0.5627782724844167, "ewc_loss": 0.03874616697430611, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014820386422798038, "grad_norm": 4.848519325256348, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8482104539871216, "num_tokens": 168948654.0, "step": 4424 }, { "epoch": 0.5629054827630072, "ewc_loss": 0.03867807984352112, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014752299466636032, "grad_norm": 4.940830707550049, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8649102449417114, "num_tokens": 168984333.0, "step": 4425 }, { "epoch": 0.5630326930415978, "ewc_loss": 0.03875460475683212, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014828825078438967, "grad_norm": 4.925497055053711, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8584953546524048, "num_tokens": 169019843.0, "step": 4426 }, { "epoch": 0.5631599033201883, "ewc_loss": 0.0386713407933712, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014745559019502252, "grad_norm": 4.866507530212402, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.868653416633606, "num_tokens": 169058177.0, "step": 4427 }, { "epoch": 0.5632871135987788, "ewc_loss": 0.038859348744153976, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001481149811297655, "grad_norm": 4.894361972808838, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8631066083908081, "num_tokens": 169102268.0, "step": 4428 }, { "epoch": 0.5634143238773693, "ewc_loss": 0.03886110335588455, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014813253073953092, "grad_norm": 4.929997444152832, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8520349264144897, "num_tokens": 169141709.0, "step": 4429 }, { "epoch": 0.5635415341559598, "ewc_loss": 0.03886108845472336, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014813235611654818, "grad_norm": 4.985859394073486, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8540281653404236, "num_tokens": 169177107.0, "step": 4430 }, { "epoch": 0.5636687444345503, "ewc_loss": 0.03870178759098053, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014776005991734564, "grad_norm": 4.892072677612305, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8547555208206177, "num_tokens": 169211531.0, "step": 4431 }, { "epoch": 0.5637959547131408, "ewc_loss": 0.03873741626739502, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014811634900979698, "grad_norm": 4.9645209312438965, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8505910634994507, "num_tokens": 169247507.0, "step": 4432 }, { "epoch": 0.5639231649917313, "ewc_loss": 0.03875584155321121, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014830060536041856, "grad_norm": 4.918548107147217, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.871471643447876, "num_tokens": 169281007.0, "step": 4433 }, { "epoch": 0.5640503752703219, "ewc_loss": 0.0388193316757679, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001477147889090702, "grad_norm": 4.888651371002197, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8595301508903503, "num_tokens": 169322295.0, "step": 4434 }, { "epoch": 0.5641775855489124, "ewc_loss": 0.03874132037162781, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014815539179835469, "grad_norm": 4.934831142425537, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8545199632644653, "num_tokens": 169358184.0, "step": 4435 }, { "epoch": 0.5643047958275028, "ewc_loss": 0.03872981294989586, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014804031525272876, "grad_norm": 4.919287204742432, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8629269599914551, "num_tokens": 169391518.0, "step": 4436 }, { "epoch": 0.5644320061060933, "ewc_loss": 0.03872409462928772, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014798311167396605, "grad_norm": 4.976283073425293, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8583117723464966, "num_tokens": 169426743.0, "step": 4437 }, { "epoch": 0.5645592163846839, "ewc_loss": 0.0387871190905571, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014861336967442185, "grad_norm": 4.85906982421875, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8774893879890442, "num_tokens": 169466452.0, "step": 4438 }, { "epoch": 0.5646864266632744, "ewc_loss": 0.038823433220386505, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014775579620618373, "grad_norm": 4.93435525894165, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8622763752937317, "num_tokens": 169500295.0, "step": 4439 }, { "epoch": 0.5648136369418649, "ewc_loss": 0.038902007043361664, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014854155597276986, "grad_norm": 4.905701160430908, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8580594658851624, "num_tokens": 169541829.0, "step": 4440 }, { "epoch": 0.5649408472204555, "ewc_loss": 0.03884805738925934, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001480020728195086, "grad_norm": 4.963521480560303, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8429226875305176, "num_tokens": 169581547.0, "step": 4441 }, { "epoch": 0.5650680574990459, "ewc_loss": 0.03889687731862068, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014849024591967463, "grad_norm": 4.866971015930176, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8624019622802734, "num_tokens": 169622107.0, "step": 4442 }, { "epoch": 0.5651952677776364, "ewc_loss": 0.038851648569107056, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014803795784246176, "grad_norm": 4.98649787902832, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8560527563095093, "num_tokens": 169657258.0, "step": 4443 }, { "epoch": 0.5653224780562269, "ewc_loss": 0.03891368582844734, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001486583350924775, "grad_norm": 4.943700790405273, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8478983640670776, "num_tokens": 169691568.0, "step": 4444 }, { "epoch": 0.5654496883348175, "ewc_loss": 0.03888614475727081, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001483829109929502, "grad_norm": 4.946897506713867, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8648073077201843, "num_tokens": 169732429.0, "step": 4445 }, { "epoch": 0.565576898613408, "ewc_loss": 0.038852617144584656, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014804763486608863, "grad_norm": 4.897632598876953, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8514134287834167, "num_tokens": 169769224.0, "step": 4446 }, { "epoch": 0.5657041088919985, "ewc_loss": 0.03871224820613861, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001478646881878376, "grad_norm": 4.85027551651001, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8572679758071899, "num_tokens": 169812830.0, "step": 4447 }, { "epoch": 0.5658313191705889, "ewc_loss": 0.038851525634527206, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014803673548158258, "grad_norm": 4.949244976043701, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8557398319244385, "num_tokens": 169843823.0, "step": 4448 }, { "epoch": 0.5659585294491795, "ewc_loss": 0.03890814259648323, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014860290684737265, "grad_norm": 4.918753623962402, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.86054527759552, "num_tokens": 169882847.0, "step": 4449 }, { "epoch": 0.56608573972777, "ewc_loss": 0.038864750415086746, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001481689978390932, "grad_norm": 4.895961761474609, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8394409418106079, "num_tokens": 169921264.0, "step": 4450 }, { "epoch": 0.5662129500063605, "ewc_loss": 0.038885295391082764, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014837445633020252, "grad_norm": 4.956449031829834, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8651657104492188, "num_tokens": 169949681.0, "step": 4451 }, { "epoch": 0.566340160284951, "ewc_loss": 0.03892659395933151, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014878741058055311, "grad_norm": 4.881172180175781, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8718445301055908, "num_tokens": 169986254.0, "step": 4452 }, { "epoch": 0.5664673705635416, "ewc_loss": 0.038902297616004944, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001485444518039003, "grad_norm": 4.936580657958984, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8530105352401733, "num_tokens": 170022745.0, "step": 4453 }, { "epoch": 0.566594580842132, "ewc_loss": 0.03894934803247452, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001490149588789791, "grad_norm": 4.91960334777832, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8690924048423767, "num_tokens": 170057038.0, "step": 4454 }, { "epoch": 0.5667217911207225, "ewc_loss": 0.03893255814909935, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014884707343298942, "grad_norm": 4.863102912902832, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8765933513641357, "num_tokens": 170098460.0, "step": 4455 }, { "epoch": 0.566849001399313, "ewc_loss": 0.03893426060676575, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014886407006997615, "grad_norm": 4.921976089477539, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8462117910385132, "num_tokens": 170137689.0, "step": 4456 }, { "epoch": 0.5669762116779036, "ewc_loss": 0.03902202844619751, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014974178338889033, "grad_norm": 4.926156520843506, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8585571050643921, "num_tokens": 170176716.0, "step": 4457 }, { "epoch": 0.5671034219564941, "ewc_loss": 0.03892836347222328, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001488051057094708, "grad_norm": 4.9548516273498535, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8531053066253662, "num_tokens": 170212621.0, "step": 4458 }, { "epoch": 0.5672306322350846, "ewc_loss": 0.0389644093811512, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014916557120159268, "grad_norm": 4.940370559692383, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8478270769119263, "num_tokens": 170254493.0, "step": 4459 }, { "epoch": 0.5673578425136752, "ewc_loss": 0.038897138088941574, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014849286526441574, "grad_norm": 4.880731582641602, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8790901899337769, "num_tokens": 170290559.0, "step": 4460 }, { "epoch": 0.5674850527922656, "ewc_loss": 0.03892286494374275, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014875012857373804, "grad_norm": 4.89514684677124, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8532954454421997, "num_tokens": 170335079.0, "step": 4461 }, { "epoch": 0.5676122630708561, "ewc_loss": 0.03892969712615013, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014881844981573522, "grad_norm": 4.924657821655273, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8615718483924866, "num_tokens": 170370665.0, "step": 4462 }, { "epoch": 0.5677394733494466, "ewc_loss": 0.03892715275287628, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001487929985160008, "grad_norm": 4.920764446258545, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8567665815353394, "num_tokens": 170412585.0, "step": 4463 }, { "epoch": 0.5678666836280372, "ewc_loss": 0.038887880742549896, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014840027142781764, "grad_norm": 4.877147197723389, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8643417954444885, "num_tokens": 170450806.0, "step": 4464 }, { "epoch": 0.5679938939066277, "ewc_loss": 0.038903795182704926, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014855945482850075, "grad_norm": 4.99290657043457, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8588305115699768, "num_tokens": 170483265.0, "step": 4465 }, { "epoch": 0.5681211041852182, "ewc_loss": 0.038947366178035736, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001489951682742685, "grad_norm": 4.901970386505127, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8748607635498047, "num_tokens": 170520910.0, "step": 4466 }, { "epoch": 0.5682483144638086, "ewc_loss": 0.03886384516954422, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014815994654782116, "grad_norm": 4.961491584777832, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8410060405731201, "num_tokens": 170556516.0, "step": 4467 }, { "epoch": 0.5683755247423992, "ewc_loss": 0.0388035848736763, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014877803914714605, "grad_norm": 4.962192058563232, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8563588857650757, "num_tokens": 170595571.0, "step": 4468 }, { "epoch": 0.5685027350209897, "ewc_loss": 0.0388772152364254, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001482936495449394, "grad_norm": 4.86656379699707, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8769352436065674, "num_tokens": 170634117.0, "step": 4469 }, { "epoch": 0.5686299452995802, "ewc_loss": 0.0388835184276104, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014835665933787823, "grad_norm": 4.916118621826172, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8522807359695435, "num_tokens": 170677624.0, "step": 4470 }, { "epoch": 0.5687571555781707, "ewc_loss": 0.03888389840722084, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014836047193966806, "grad_norm": 4.916683673858643, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8439769744873047, "num_tokens": 170716732.0, "step": 4471 }, { "epoch": 0.5688843658567613, "ewc_loss": 0.038934893906116486, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001488704001531005, "grad_norm": 5.036066055297852, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8516900539398193, "num_tokens": 170758521.0, "step": 4472 }, { "epoch": 0.5690115761353517, "ewc_loss": 0.03892405331134796, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014876200293656439, "grad_norm": 4.876409530639648, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8584462404251099, "num_tokens": 170796799.0, "step": 4473 }, { "epoch": 0.5691387864139422, "ewc_loss": 0.03868343308568001, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014757651661057025, "grad_norm": 4.935649871826172, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8448340892791748, "num_tokens": 170838569.0, "step": 4474 }, { "epoch": 0.5692659966925327, "ewc_loss": 0.03879881650209427, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001487303525209427, "grad_norm": 4.893296718597412, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8597154021263123, "num_tokens": 170878206.0, "step": 4475 }, { "epoch": 0.5693932069711233, "ewc_loss": 0.038707323372364044, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014781541540287435, "grad_norm": 4.975915431976318, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8447268605232239, "num_tokens": 170915506.0, "step": 4476 }, { "epoch": 0.5695204172497138, "ewc_loss": 0.03879113122820854, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014865348930470645, "grad_norm": 4.898942470550537, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.854706883430481, "num_tokens": 170952303.0, "step": 4477 }, { "epoch": 0.5696476275283043, "ewc_loss": 0.03882049024105072, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014772640133742243, "grad_norm": 4.921087265014648, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8512542247772217, "num_tokens": 170993757.0, "step": 4478 }, { "epoch": 0.5697748378068948, "ewc_loss": 0.03876076638698578, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014834984904155135, "grad_norm": 4.900558948516846, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.855160117149353, "num_tokens": 171030240.0, "step": 4479 }, { "epoch": 0.5699020480854853, "ewc_loss": 0.03873901814222336, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001481323706684634, "grad_norm": 4.938559055328369, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8349708318710327, "num_tokens": 171069654.0, "step": 4480 }, { "epoch": 0.5700292583640758, "ewc_loss": 0.038760870695114136, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001483508967794478, "grad_norm": 4.917033672332764, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.841580331325531, "num_tokens": 171107341.0, "step": 4481 }, { "epoch": 0.5701564686426663, "ewc_loss": 0.03877920284867287, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014853420725557953, "grad_norm": 4.955379486083984, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8476244211196899, "num_tokens": 171144063.0, "step": 4482 }, { "epoch": 0.5702836789212569, "ewc_loss": 0.038770418614149094, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001484463718952611, "grad_norm": 4.927919387817383, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.850898027420044, "num_tokens": 171180448.0, "step": 4483 }, { "epoch": 0.5704108891998474, "ewc_loss": 0.038782309740781784, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.0001485652755945921, "grad_norm": 4.975062370300293, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8615390062332153, "num_tokens": 171214220.0, "step": 4484 }, { "epoch": 0.5705380994784378, "ewc_loss": 0.038833700120449066, "ewc_loss_diag": 2.396106719970703e-05, "ewc_loss_parallel": 0.00014907917648088187, "grad_norm": 4.8741230964660645, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8648720979690552, "num_tokens": 171252798.0, "step": 4485 }, { "epoch": 0.5706653097570283, "ewc_loss": 0.038890015333890915, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014842164819128811, "grad_norm": 4.856742858886719, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8625461459159851, "num_tokens": 171295434.0, "step": 4486 }, { "epoch": 0.5707925200356189, "ewc_loss": 0.038966868072748184, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014919017849024385, "grad_norm": 4.935698509216309, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8593067526817322, "num_tokens": 171334129.0, "step": 4487 }, { "epoch": 0.5709197303142094, "ewc_loss": 0.03898250311613083, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014934652426745743, "grad_norm": 4.943176746368408, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8542566299438477, "num_tokens": 171370239.0, "step": 4488 }, { "epoch": 0.5710469405927999, "ewc_loss": 0.03892776370048523, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014879911032039672, "grad_norm": 4.931146621704102, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8723419308662415, "num_tokens": 171402411.0, "step": 4489 }, { "epoch": 0.5711741508713905, "ewc_loss": 0.038975462317466736, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014927609299775213, "grad_norm": 4.925990581512451, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8676803112030029, "num_tokens": 171438736.0, "step": 4490 }, { "epoch": 0.5713013611499809, "ewc_loss": 0.038984037935733795, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014936187653802335, "grad_norm": 4.904196739196777, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8625925183296204, "num_tokens": 171477949.0, "step": 4491 }, { "epoch": 0.5714285714285714, "ewc_loss": 0.03898271545767784, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014934864884708077, "grad_norm": 4.951473712921143, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8534985780715942, "num_tokens": 171520953.0, "step": 4492 }, { "epoch": 0.5715557817071619, "ewc_loss": 0.038978882133960724, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014931030455045402, "grad_norm": 4.894787311553955, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.862808346748352, "num_tokens": 171559415.0, "step": 4493 }, { "epoch": 0.5716829919857525, "ewc_loss": 0.038929011672735214, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014881159586366266, "grad_norm": 4.963574409484863, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8654179573059082, "num_tokens": 171593118.0, "step": 4494 }, { "epoch": 0.571810202264343, "ewc_loss": 0.03898340091109276, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014935550279915333, "grad_norm": 4.917938709259033, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8447881937026978, "num_tokens": 171634514.0, "step": 4495 }, { "epoch": 0.5719374125429335, "ewc_loss": 0.038944412022829056, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014896559878252447, "grad_norm": 4.945949554443359, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8609318733215332, "num_tokens": 171675239.0, "step": 4496 }, { "epoch": 0.5720646228215239, "ewc_loss": 0.03895687311887741, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014909023593645543, "grad_norm": 4.9762043952941895, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8647812604904175, "num_tokens": 171713154.0, "step": 4497 }, { "epoch": 0.5721918331001145, "ewc_loss": 0.03894619271159172, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014898342487867922, "grad_norm": 4.9623260498046875, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8510237336158752, "num_tokens": 171750046.0, "step": 4498 }, { "epoch": 0.572319043378705, "ewc_loss": 0.03893997520208359, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014892122999299318, "grad_norm": 4.9763407707214355, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8474392890930176, "num_tokens": 171782708.0, "step": 4499 }, { "epoch": 0.5724462536572955, "ewc_loss": 0.03893708437681198, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014889234444126487, "grad_norm": 4.931222438812256, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8537415266036987, "num_tokens": 171819421.0, "step": 4500 }, { "epoch": 0.572573463935886, "ewc_loss": 0.03893114626407623, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001488329580752179, "grad_norm": 4.956608772277832, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8570376634597778, "num_tokens": 171860993.0, "step": 4501 }, { "epoch": 0.5727006742144766, "ewc_loss": 0.03892884403467178, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001488099223934114, "grad_norm": 4.917727470397949, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8574104905128479, "num_tokens": 171896894.0, "step": 4502 }, { "epoch": 0.572827884493067, "ewc_loss": 0.038910407572984695, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014862556417938322, "grad_norm": 4.941155910491943, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.855613112449646, "num_tokens": 171938694.0, "step": 4503 }, { "epoch": 0.5729550947716575, "ewc_loss": 0.038937050849199295, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001488919951952994, "grad_norm": 4.934780120849609, "learning_rate": 1e-06, "loss": 0.3808, "mean_token_accuracy": 0.8825422525405884, "num_tokens": 171972606.0, "step": 4504 }, { "epoch": 0.573082305050248, "ewc_loss": 0.03891054540872574, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014862691750749946, "grad_norm": 4.956310749053955, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8601861000061035, "num_tokens": 172010861.0, "step": 4505 }, { "epoch": 0.5732095153288386, "ewc_loss": 0.038945864886045456, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014898012159392238, "grad_norm": 4.995337009429932, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8428843021392822, "num_tokens": 172049867.0, "step": 4506 }, { "epoch": 0.5733367256074291, "ewc_loss": 0.03891857713460922, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001487072731833905, "grad_norm": 4.914806842803955, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8578250408172607, "num_tokens": 172092176.0, "step": 4507 }, { "epoch": 0.5734639358860196, "ewc_loss": 0.038885679095983505, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014837828348390758, "grad_norm": 4.903439521789551, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8574179410934448, "num_tokens": 172132789.0, "step": 4508 }, { "epoch": 0.5735911461646102, "ewc_loss": 0.0388965904712677, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001484874082962051, "grad_norm": 4.948207378387451, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8722425103187561, "num_tokens": 172170536.0, "step": 4509 }, { "epoch": 0.5737183564432006, "ewc_loss": 0.0389169380068779, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014869087317492813, "grad_norm": 5.015363693237305, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8570055961608887, "num_tokens": 172206828.0, "step": 4510 }, { "epoch": 0.5738455667217911, "ewc_loss": 0.03893442079424858, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014886569988448173, "grad_norm": 4.953667163848877, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8501148223876953, "num_tokens": 172242311.0, "step": 4511 }, { "epoch": 0.5739727770003816, "ewc_loss": 0.038878172636032104, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001483031956013292, "grad_norm": 4.963485240936279, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8530930280685425, "num_tokens": 172275135.0, "step": 4512 }, { "epoch": 0.5740999872789722, "ewc_loss": 0.039175331592559814, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00014883338008075953, "grad_norm": 5.040414333343506, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8656995892524719, "num_tokens": 172308682.0, "step": 4513 }, { "epoch": 0.5742271975575627, "ewc_loss": 0.038905978202819824, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014858125359751284, "grad_norm": 4.888605117797852, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8533903360366821, "num_tokens": 172345834.0, "step": 4514 }, { "epoch": 0.5743544078361532, "ewc_loss": 0.0388980507850647, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001485019747633487, "grad_norm": 5.015568256378174, "learning_rate": 1e-06, "loss": 0.5435, "mean_token_accuracy": 0.8346529006958008, "num_tokens": 172387976.0, "step": 4515 }, { "epoch": 0.5744816181147436, "ewc_loss": 0.03900869935750961, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014960847329348326, "grad_norm": 4.933987617492676, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8578264713287354, "num_tokens": 172427447.0, "step": 4516 }, { "epoch": 0.5746088283933342, "ewc_loss": 0.038874443620443344, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014826592814642936, "grad_norm": 4.909396171569824, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8494223356246948, "num_tokens": 172468686.0, "step": 4517 }, { "epoch": 0.5747360386719247, "ewc_loss": 0.038988277316093445, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014940423716325313, "grad_norm": 5.010379314422607, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8541065454483032, "num_tokens": 172508190.0, "step": 4518 }, { "epoch": 0.5748632489505152, "ewc_loss": 0.03920549526810646, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00014913504128344357, "grad_norm": 4.92799711227417, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8725792169570923, "num_tokens": 172552295.0, "step": 4519 }, { "epoch": 0.5749904592291057, "ewc_loss": 0.03918440267443657, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00014892409672029316, "grad_norm": 4.947686195373535, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8627824783325195, "num_tokens": 172591065.0, "step": 4520 }, { "epoch": 0.5751176695076963, "ewc_loss": 0.039245739579200745, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00014953748905099928, "grad_norm": 4.96889591217041, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8603731989860535, "num_tokens": 172633741.0, "step": 4521 }, { "epoch": 0.5752448797862867, "ewc_loss": 0.039182890206575394, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001489089772803709, "grad_norm": 4.977097988128662, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8660834431648254, "num_tokens": 172669081.0, "step": 4522 }, { "epoch": 0.5753720900648772, "ewc_loss": 0.03918787091970444, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001489587884861976, "grad_norm": 5.031106472015381, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8356478214263916, "num_tokens": 172704956.0, "step": 4523 }, { "epoch": 0.5754993003434677, "ewc_loss": 0.039218779653310776, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001492678711656481, "grad_norm": 4.9589362144470215, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8609955906867981, "num_tokens": 172742270.0, "step": 4524 }, { "epoch": 0.5756265106220583, "ewc_loss": 0.039182402193546295, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00014890411694068462, "grad_norm": 4.980565547943115, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8450404405593872, "num_tokens": 172778065.0, "step": 4525 }, { "epoch": 0.5757537209006488, "ewc_loss": 0.03921616077423096, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001492417068220675, "grad_norm": 4.901060104370117, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8663175106048584, "num_tokens": 172815825.0, "step": 4526 }, { "epoch": 0.5758809311792393, "ewc_loss": 0.039249271154403687, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00014957277744542807, "grad_norm": 5.012596607208252, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8568021655082703, "num_tokens": 172849560.0, "step": 4527 }, { "epoch": 0.5760081414578297, "ewc_loss": 0.03927534446120262, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00014983353321440518, "grad_norm": 4.933047294616699, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8590598106384277, "num_tokens": 172885203.0, "step": 4528 }, { "epoch": 0.5761353517364203, "ewc_loss": 0.039226233959198, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00014934243517927825, "grad_norm": 4.965261459350586, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8580163717269897, "num_tokens": 172922341.0, "step": 4529 }, { "epoch": 0.5762625620150108, "ewc_loss": 0.03927759453654289, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00014985603047534823, "grad_norm": 5.01072883605957, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8636681437492371, "num_tokens": 172953363.0, "step": 4530 }, { "epoch": 0.5763897722936013, "ewc_loss": 0.039244238287210464, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00014952245692256838, "grad_norm": 5.009913921356201, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8633617162704468, "num_tokens": 172986253.0, "step": 4531 }, { "epoch": 0.5765169825721919, "ewc_loss": 0.03925874084234238, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001496674958616495, "grad_norm": 4.9311909675598145, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8688638806343079, "num_tokens": 173023466.0, "step": 4532 }, { "epoch": 0.5766441928507824, "ewc_loss": 0.038977086544036865, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.000149292332935147, "grad_norm": 4.971943378448486, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8520649671554565, "num_tokens": 173063329.0, "step": 4533 }, { "epoch": 0.5767714031293728, "ewc_loss": 0.03904777020215988, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014999917766544968, "grad_norm": 4.918249130249023, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8640384674072266, "num_tokens": 173100676.0, "step": 4534 }, { "epoch": 0.5768986134079633, "ewc_loss": 0.039014093577861786, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001496624172432348, "grad_norm": 4.937448978424072, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8681302666664124, "num_tokens": 173135088.0, "step": 4535 }, { "epoch": 0.5770258236865539, "ewc_loss": 0.039304640144109726, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015012649237178266, "grad_norm": 4.926408290863037, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8495655059814453, "num_tokens": 173175810.0, "step": 4536 }, { "epoch": 0.5771530339651444, "ewc_loss": 0.03902075067162514, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014972897770348936, "grad_norm": 4.880778789520264, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.859083354473114, "num_tokens": 173220826.0, "step": 4537 }, { "epoch": 0.5772802442437349, "ewc_loss": 0.03901892900466919, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014971077325753868, "grad_norm": 4.91519832611084, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8686633706092834, "num_tokens": 173258411.0, "step": 4538 }, { "epoch": 0.5774074545223254, "ewc_loss": 0.03910398483276367, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00015056133270263672, "grad_norm": 4.98632287979126, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.841164231300354, "num_tokens": 173297298.0, "step": 4539 }, { "epoch": 0.5775346648009159, "ewc_loss": 0.039315108209848404, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001502311642980203, "grad_norm": 5.002426624298096, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8403608798980713, "num_tokens": 173330839.0, "step": 4540 }, { "epoch": 0.5776618750795064, "ewc_loss": 0.039076052606105804, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00015028202324174345, "grad_norm": 4.963982582092285, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8502801656723022, "num_tokens": 173370219.0, "step": 4541 }, { "epoch": 0.5777890853580969, "ewc_loss": 0.03927788510918617, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001498589408583939, "grad_norm": 4.925682544708252, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8587135672569275, "num_tokens": 173410425.0, "step": 4542 }, { "epoch": 0.5779162956366874, "ewc_loss": 0.03902435302734375, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014976503734942526, "grad_norm": 4.959614276885986, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8578912019729614, "num_tokens": 173447109.0, "step": 4543 }, { "epoch": 0.578043505915278, "ewc_loss": 0.039070066064596176, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00015022215666249394, "grad_norm": 4.899072170257568, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8614906668663025, "num_tokens": 173490064.0, "step": 4544 }, { "epoch": 0.5781707161938685, "ewc_loss": 0.03925567865371704, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001496368640800938, "grad_norm": 5.000917434692383, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8612328767776489, "num_tokens": 173523791.0, "step": 4545 }, { "epoch": 0.5782979264724589, "ewc_loss": 0.03909885510802269, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001505100226495415, "grad_norm": 4.905595302581787, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8667310476303101, "num_tokens": 173564069.0, "step": 4546 }, { "epoch": 0.5784251367510495, "ewc_loss": 0.03900277614593506, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014954923244658858, "grad_norm": 4.996242523193359, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8566911816596985, "num_tokens": 173601893.0, "step": 4547 }, { "epoch": 0.57855234702964, "ewc_loss": 0.03910582885146141, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001505797845311463, "grad_norm": 4.9928083419799805, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8671797513961792, "num_tokens": 173642299.0, "step": 4548 }, { "epoch": 0.5786795573082305, "ewc_loss": 0.039054952561855316, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001500710059190169, "grad_norm": 5.002745151519775, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8597018718719482, "num_tokens": 173675824.0, "step": 4549 }, { "epoch": 0.578806767586821, "ewc_loss": 0.0391869843006134, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.0001501706283306703, "grad_norm": 4.963308811187744, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8594123125076294, "num_tokens": 173712192.0, "step": 4550 }, { "epoch": 0.5789339778654116, "ewc_loss": 0.03911540284752846, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00014945481962058693, "grad_norm": 4.911345481872559, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8653281927108765, "num_tokens": 173746949.0, "step": 4551 }, { "epoch": 0.579061188144002, "ewc_loss": 0.03915417939424515, "ewc_loss_diag": 2.4199485778808594e-05, "ewc_loss_parallel": 0.00014984255540184677, "grad_norm": 4.929591655731201, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.860648512840271, "num_tokens": 173784527.0, "step": 4552 }, { "epoch": 0.5791883984225925, "ewc_loss": 0.03905393183231354, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00015006081957835704, "grad_norm": 4.939894676208496, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8643723726272583, "num_tokens": 173820648.0, "step": 4553 }, { "epoch": 0.579315608701183, "ewc_loss": 0.03907616436481476, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00015028314373921603, "grad_norm": 4.937132835388184, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8521654605865479, "num_tokens": 173860427.0, "step": 4554 }, { "epoch": 0.5794428189797736, "ewc_loss": 0.03902386501431465, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014976014790590852, "grad_norm": 4.890854358673096, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8601847290992737, "num_tokens": 173899876.0, "step": 4555 }, { "epoch": 0.5795700292583641, "ewc_loss": 0.03907352313399315, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001502567029092461, "grad_norm": 4.920680522918701, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8670877814292908, "num_tokens": 173937354.0, "step": 4556 }, { "epoch": 0.5796972395369546, "ewc_loss": 0.039089299738407135, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00015041447477415204, "grad_norm": 4.951129913330078, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8465336561203003, "num_tokens": 173976039.0, "step": 4557 }, { "epoch": 0.5798244498155452, "ewc_loss": 0.039355047047138214, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001506305707152933, "grad_norm": 5.011601448059082, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8371615409851074, "num_tokens": 174006359.0, "step": 4558 }, { "epoch": 0.5799516600941356, "ewc_loss": 0.039381243288517, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001508925051894039, "grad_norm": 4.978517055511475, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8494349122047424, "num_tokens": 174044662.0, "step": 4559 }, { "epoch": 0.5800788703727261, "ewc_loss": 0.03906751424074173, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.0001501966326031834, "grad_norm": 4.992866516113281, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8537296056747437, "num_tokens": 174077133.0, "step": 4560 }, { "epoch": 0.5802060806513166, "ewc_loss": 0.039057593792676926, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00015009741764515638, "grad_norm": 4.886632919311523, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.852916955947876, "num_tokens": 174121873.0, "step": 4561 }, { "epoch": 0.5803332909299072, "ewc_loss": 0.03904358670115471, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00014995734090916812, "grad_norm": 4.936977863311768, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8596861362457275, "num_tokens": 174158873.0, "step": 4562 }, { "epoch": 0.5804605012084977, "ewc_loss": 0.03910620138049126, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00015058349526952952, "grad_norm": 5.0072102546691895, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8655033707618713, "num_tokens": 174189222.0, "step": 4563 }, { "epoch": 0.5805877114870882, "ewc_loss": 0.0390981025993824, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00015050251386128366, "grad_norm": 4.963441848754883, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8499833941459656, "num_tokens": 174229018.0, "step": 4564 }, { "epoch": 0.5807149217656786, "ewc_loss": 0.039102789014577866, "ewc_loss_diag": 2.4080276489257812e-05, "ewc_loss_parallel": 0.00015054938558023423, "grad_norm": 4.914938449859619, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8591964244842529, "num_tokens": 174269215.0, "step": 4565 }, { "epoch": 0.5808421320442692, "ewc_loss": 0.03935200721025467, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.000150600157212466, "grad_norm": 4.9824090003967285, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8659747838973999, "num_tokens": 174303926.0, "step": 4566 }, { "epoch": 0.5809693423228597, "ewc_loss": 0.03939483314752579, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015102839097380638, "grad_norm": 4.893941879272461, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8591985702514648, "num_tokens": 174347491.0, "step": 4567 }, { "epoch": 0.5810965526014502, "ewc_loss": 0.03936544805765152, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015073457325343043, "grad_norm": 4.944470405578613, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.859518826007843, "num_tokens": 174384246.0, "step": 4568 }, { "epoch": 0.5812237628800407, "ewc_loss": 0.039394404739141464, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015102411271072924, "grad_norm": 4.9572272300720215, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8569802045822144, "num_tokens": 174417081.0, "step": 4569 }, { "epoch": 0.5813509731586313, "ewc_loss": 0.039413727819919586, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015121734759304672, "grad_norm": 4.988147258758545, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.84715735912323, "num_tokens": 174457364.0, "step": 4570 }, { "epoch": 0.5814781834372217, "ewc_loss": 0.03938180208206177, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015089807857293636, "grad_norm": 4.923793792724609, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.858616828918457, "num_tokens": 174494304.0, "step": 4571 }, { "epoch": 0.5816053937158122, "ewc_loss": 0.039388518780469894, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015096527931746095, "grad_norm": 5.026928424835205, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.858817458152771, "num_tokens": 174528047.0, "step": 4572 }, { "epoch": 0.5817326039944027, "ewc_loss": 0.03943778574466705, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015145791985560209, "grad_norm": 4.93914270401001, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8629887104034424, "num_tokens": 174565990.0, "step": 4573 }, { "epoch": 0.5818598142729933, "ewc_loss": 0.03934624791145325, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001505425461800769, "grad_norm": 4.989226341247559, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8443442583084106, "num_tokens": 174599039.0, "step": 4574 }, { "epoch": 0.5819870245515838, "ewc_loss": 0.03941548243165016, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015123491175472736, "grad_norm": 4.988114833831787, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8515386581420898, "num_tokens": 174634786.0, "step": 4575 }, { "epoch": 0.5821142348301743, "ewc_loss": 0.03937837481498718, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015086382336448878, "grad_norm": 4.91763162612915, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8490248322486877, "num_tokens": 174677875.0, "step": 4576 }, { "epoch": 0.5822414451087647, "ewc_loss": 0.03932715952396393, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015035168325994164, "grad_norm": 4.900827884674072, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8523571491241455, "num_tokens": 174715421.0, "step": 4577 }, { "epoch": 0.5823686553873553, "ewc_loss": 0.03937971591949463, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001508772256784141, "grad_norm": 5.056396007537842, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8551493287086487, "num_tokens": 174746555.0, "step": 4578 }, { "epoch": 0.5824958656659458, "ewc_loss": 0.03938845917582512, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001509646826889366, "grad_norm": 4.925686359405518, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8460388779640198, "num_tokens": 174786470.0, "step": 4579 }, { "epoch": 0.5826230759445363, "ewc_loss": 0.03930322080850601, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.000150112304254435, "grad_norm": 4.9429030418396, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8554564714431763, "num_tokens": 174827131.0, "step": 4580 }, { "epoch": 0.5827502862231269, "ewc_loss": 0.03939295932650566, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001510096772108227, "grad_norm": 5.0122599601745605, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.850691556930542, "num_tokens": 174861391.0, "step": 4581 }, { "epoch": 0.5828774965017174, "ewc_loss": 0.03936135768890381, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001506936678197235, "grad_norm": 4.9247517585754395, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8619194030761719, "num_tokens": 174897437.0, "step": 4582 }, { "epoch": 0.5830047067803078, "ewc_loss": 0.03936714679002762, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015075152623467147, "grad_norm": 5.029562473297119, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8481490612030029, "num_tokens": 174932504.0, "step": 4583 }, { "epoch": 0.5831319170588983, "ewc_loss": 0.03939404711127281, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001510205474914983, "grad_norm": 4.929802894592285, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8515578508377075, "num_tokens": 174974746.0, "step": 4584 }, { "epoch": 0.5832591273374889, "ewc_loss": 0.039331529289484024, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001503953681094572, "grad_norm": 4.93522310256958, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8518255949020386, "num_tokens": 175016155.0, "step": 4585 }, { "epoch": 0.5833863376160794, "ewc_loss": 0.039383456110954285, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.000150914624100551, "grad_norm": 4.987682819366455, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.842797040939331, "num_tokens": 175060595.0, "step": 4586 }, { "epoch": 0.5835135478946699, "ewc_loss": 0.0394209660589695, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015128974337130785, "grad_norm": 4.999546051025391, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.856834352016449, "num_tokens": 175101278.0, "step": 4587 }, { "epoch": 0.5836407581732604, "ewc_loss": 0.03935689106583595, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001506489934399724, "grad_norm": 4.945617198944092, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8507368564605713, "num_tokens": 175139479.0, "step": 4588 }, { "epoch": 0.5837679684518509, "ewc_loss": 0.03934422880411148, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015052237722557038, "grad_norm": 4.999802112579346, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.840705156326294, "num_tokens": 175182376.0, "step": 4589 }, { "epoch": 0.5838951787304414, "ewc_loss": 0.03933854401111603, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015046550834085792, "grad_norm": 4.914518356323242, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8502193689346313, "num_tokens": 175225449.0, "step": 4590 }, { "epoch": 0.5840223890090319, "ewc_loss": 0.03933755308389664, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015045562759041786, "grad_norm": 5.054150104522705, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8547424674034119, "num_tokens": 175255959.0, "step": 4591 }, { "epoch": 0.5841495992876224, "ewc_loss": 0.03941582143306732, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015123828779906034, "grad_norm": 5.012700080871582, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8452587127685547, "num_tokens": 175292142.0, "step": 4592 }, { "epoch": 0.584276809566213, "ewc_loss": 0.03932693600654602, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015034944226499647, "grad_norm": 4.947413444519043, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.850493848323822, "num_tokens": 175336630.0, "step": 4593 }, { "epoch": 0.5844040198448035, "ewc_loss": 0.03935531899333, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015063326281961054, "grad_norm": 4.974514007568359, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8693599700927734, "num_tokens": 175372697.0, "step": 4594 }, { "epoch": 0.5845312301233939, "ewc_loss": 0.039408911019563675, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015116919530555606, "grad_norm": 5.003561973571777, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8458611965179443, "num_tokens": 175412395.0, "step": 4595 }, { "epoch": 0.5846584404019844, "ewc_loss": 0.03937337547540665, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015081383753567934, "grad_norm": 5.086012363433838, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.856410801410675, "num_tokens": 175447295.0, "step": 4596 }, { "epoch": 0.584785650680575, "ewc_loss": 0.039396822452545166, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015104831254575402, "grad_norm": 4.968334674835205, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8659610748291016, "num_tokens": 175480229.0, "step": 4597 }, { "epoch": 0.5849128609591655, "ewc_loss": 0.03932206705212593, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015030073700472713, "grad_norm": 4.990019798278809, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8571751713752747, "num_tokens": 175523081.0, "step": 4598 }, { "epoch": 0.585040071237756, "ewc_loss": 0.039388567209243774, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015096574497874826, "grad_norm": 4.915313720703125, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8605538606643677, "num_tokens": 175565036.0, "step": 4599 }, { "epoch": 0.5851672815163466, "ewc_loss": 0.039344336837530136, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015052345406729728, "grad_norm": 4.95815372467041, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.857133150100708, "num_tokens": 175606733.0, "step": 4600 }, { "epoch": 0.585294491794937, "ewc_loss": 0.03941265866160393, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015120665193535388, "grad_norm": 5.019432544708252, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8720992803573608, "num_tokens": 175637663.0, "step": 4601 }, { "epoch": 0.5854217020735275, "ewc_loss": 0.03937975689768791, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015087764768395573, "grad_norm": 4.927692413330078, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8592501282691956, "num_tokens": 175677381.0, "step": 4602 }, { "epoch": 0.585548912352118, "ewc_loss": 0.03939678892493248, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015104797785170376, "grad_norm": 4.992656230926514, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8674246668815613, "num_tokens": 175713049.0, "step": 4603 }, { "epoch": 0.5856761226307086, "ewc_loss": 0.03946007043123245, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015168078243732452, "grad_norm": 4.937145233154297, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8472676873207092, "num_tokens": 175753681.0, "step": 4604 }, { "epoch": 0.5858033329092991, "ewc_loss": 0.0394010916352272, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015109099331311882, "grad_norm": 4.984018325805664, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.844491720199585, "num_tokens": 175789725.0, "step": 4605 }, { "epoch": 0.5859305431878896, "ewc_loss": 0.03943963348865509, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001514764007879421, "grad_norm": 5.00624418258667, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8556963205337524, "num_tokens": 175823203.0, "step": 4606 }, { "epoch": 0.5860577534664801, "ewc_loss": 0.03942076861858368, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001512877643108368, "grad_norm": 5.051839351654053, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8511816263198853, "num_tokens": 175860506.0, "step": 4607 }, { "epoch": 0.5861849637450706, "ewc_loss": 0.03942263871431351, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015130646352190524, "grad_norm": 4.952804088592529, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8735665082931519, "num_tokens": 175895080.0, "step": 4608 }, { "epoch": 0.5863121740236611, "ewc_loss": 0.039370715618133545, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015078723663464189, "grad_norm": 4.934603691101074, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.851188063621521, "num_tokens": 175937113.0, "step": 4609 }, { "epoch": 0.5864393843022516, "ewc_loss": 0.039421167224645615, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015129173698369414, "grad_norm": 4.976806640625, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8700739741325378, "num_tokens": 175966193.0, "step": 4610 }, { "epoch": 0.5865665945808421, "ewc_loss": 0.0394417829811573, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015149790851864964, "grad_norm": 4.94757604598999, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8506519794464111, "num_tokens": 176003771.0, "step": 4611 }, { "epoch": 0.5866938048594327, "ewc_loss": 0.03944726288318634, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015155271103139967, "grad_norm": 4.942444324493408, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8625588417053223, "num_tokens": 176050069.0, "step": 4612 }, { "epoch": 0.5868210151380232, "ewc_loss": 0.039451077580451965, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.000151590837049298, "grad_norm": 4.971691608428955, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8708630800247192, "num_tokens": 176088763.0, "step": 4613 }, { "epoch": 0.5869482254166136, "ewc_loss": 0.03948499262332916, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015192999853752553, "grad_norm": 4.98314905166626, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8583202362060547, "num_tokens": 176127853.0, "step": 4614 }, { "epoch": 0.5870754356952042, "ewc_loss": 0.039415519684553146, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015123527555260807, "grad_norm": 4.931757926940918, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8430187702178955, "num_tokens": 176173775.0, "step": 4615 }, { "epoch": 0.5872026459737947, "ewc_loss": 0.039567530155181885, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.0001515346666565165, "grad_norm": 5.012664794921875, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8617836833000183, "num_tokens": 176210878.0, "step": 4616 }, { "epoch": 0.5873298562523852, "ewc_loss": 0.03955526277422905, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.0001514120085630566, "grad_norm": 4.969846248626709, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8512950539588928, "num_tokens": 176255094.0, "step": 4617 }, { "epoch": 0.5874570665309757, "ewc_loss": 0.0395166352391243, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015102571342140436, "grad_norm": 4.97510290145874, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8545652627944946, "num_tokens": 176293586.0, "step": 4618 }, { "epoch": 0.5875842768095663, "ewc_loss": 0.0395243838429451, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015110321692191064, "grad_norm": 5.007269382476807, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.859475314617157, "num_tokens": 176329203.0, "step": 4619 }, { "epoch": 0.5877114870881567, "ewc_loss": 0.03955620527267456, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015142142365220934, "grad_norm": 4.944385051727295, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8737891912460327, "num_tokens": 176367984.0, "step": 4620 }, { "epoch": 0.5878386973667472, "ewc_loss": 0.03939250111579895, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.000151005107909441, "grad_norm": 4.9357171058654785, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8703329563140869, "num_tokens": 176408949.0, "step": 4621 }, { "epoch": 0.5879659076453377, "ewc_loss": 0.03941208869218826, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015120096213649958, "grad_norm": 4.957143306732178, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8729552030563354, "num_tokens": 176451073.0, "step": 4622 }, { "epoch": 0.5880931179239283, "ewc_loss": 0.03944622725248337, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001515423646196723, "grad_norm": 4.9784932136535645, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8642575740814209, "num_tokens": 176490623.0, "step": 4623 }, { "epoch": 0.5882203282025188, "ewc_loss": 0.039432547986507416, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015140557661652565, "grad_norm": 4.994543552398682, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8452513217926025, "num_tokens": 176525771.0, "step": 4624 }, { "epoch": 0.5883475384811093, "ewc_loss": 0.03943707421422005, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015145081852097064, "grad_norm": 4.923340320587158, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8760364055633545, "num_tokens": 176565417.0, "step": 4625 }, { "epoch": 0.5884747487596997, "ewc_loss": 0.03941899538040161, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015127004007808864, "grad_norm": 5.041009902954102, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.854535698890686, "num_tokens": 176599005.0, "step": 4626 }, { "epoch": 0.5886019590382903, "ewc_loss": 0.039491426199674606, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015199434710666537, "grad_norm": 4.970550537109375, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8600654602050781, "num_tokens": 176637947.0, "step": 4627 }, { "epoch": 0.5887291693168808, "ewc_loss": 0.039399851113557816, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015107858052942902, "grad_norm": 4.964269161224365, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8472369909286499, "num_tokens": 176675241.0, "step": 4628 }, { "epoch": 0.5888563795954713, "ewc_loss": 0.039447396993637085, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001515540643595159, "grad_norm": 4.942944049835205, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8509253263473511, "num_tokens": 176715375.0, "step": 4629 }, { "epoch": 0.5889835898740619, "ewc_loss": 0.03944616764783859, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001515417534392327, "grad_norm": 5.0010151863098145, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8571588397026062, "num_tokens": 176750977.0, "step": 4630 }, { "epoch": 0.5891108001526524, "ewc_loss": 0.039471015334129333, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001517902419436723, "grad_norm": 4.98761510848999, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8611093759536743, "num_tokens": 176788968.0, "step": 4631 }, { "epoch": 0.5892380104312428, "ewc_loss": 0.03945070505142212, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015158712631091475, "grad_norm": 5.008257865905762, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8535228967666626, "num_tokens": 176824399.0, "step": 4632 }, { "epoch": 0.5893652207098333, "ewc_loss": 0.03947944939136505, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015187458484433591, "grad_norm": 4.947934627532959, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8528700470924377, "num_tokens": 176862053.0, "step": 4633 }, { "epoch": 0.5894924309884239, "ewc_loss": 0.03944157436490059, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015149582759477198, "grad_norm": 4.962957859039307, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8720911741256714, "num_tokens": 176896943.0, "step": 4634 }, { "epoch": 0.5896196412670144, "ewc_loss": 0.03949245065450668, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015200460620690137, "grad_norm": 4.931856155395508, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.858769953250885, "num_tokens": 176936263.0, "step": 4635 }, { "epoch": 0.5897468515456049, "ewc_loss": 0.039519891142845154, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015227901167236269, "grad_norm": 5.001369476318359, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8454498648643494, "num_tokens": 176976632.0, "step": 4636 }, { "epoch": 0.5898740618241954, "ewc_loss": 0.03952258080244064, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015230587450787425, "grad_norm": 5.019636154174805, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8632727861404419, "num_tokens": 177007036.0, "step": 4637 }, { "epoch": 0.5900012721027859, "ewc_loss": 0.039507705718278885, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015215712483040988, "grad_norm": 4.9986572265625, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8480761647224426, "num_tokens": 177042189.0, "step": 4638 }, { "epoch": 0.5901284823813764, "ewc_loss": 0.039532117545604706, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001524012623121962, "grad_norm": 4.967754364013672, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8534098863601685, "num_tokens": 177082428.0, "step": 4639 }, { "epoch": 0.5902556926599669, "ewc_loss": 0.03948243334889412, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015190440171863884, "grad_norm": 5.022948741912842, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8591595888137817, "num_tokens": 177124447.0, "step": 4640 }, { "epoch": 0.5903829029385574, "ewc_loss": 0.03953080624341965, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015238815103657544, "grad_norm": 4.947991847991943, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8730503916740417, "num_tokens": 177160516.0, "step": 4641 }, { "epoch": 0.590510113217148, "ewc_loss": 0.039468396455049515, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.000151764033944346, "grad_norm": 4.966236591339111, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8659793138504028, "num_tokens": 177198100.0, "step": 4642 }, { "epoch": 0.5906373234957385, "ewc_loss": 0.039497192949056625, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015205200179480016, "grad_norm": 4.9204182624816895, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8508254885673523, "num_tokens": 177243170.0, "step": 4643 }, { "epoch": 0.5907645337743289, "ewc_loss": 0.03946622833609581, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015174235159065574, "grad_norm": 4.961847305297852, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8553929328918457, "num_tokens": 177282976.0, "step": 4644 }, { "epoch": 0.5908917440529194, "ewc_loss": 0.03962026163935661, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015206199896056205, "grad_norm": 4.958345413208008, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8405824899673462, "num_tokens": 177323725.0, "step": 4645 }, { "epoch": 0.59101895433151, "ewc_loss": 0.039622947573661804, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015208886179607362, "grad_norm": 5.04042911529541, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8584444522857666, "num_tokens": 177352178.0, "step": 4646 }, { "epoch": 0.5911461646101005, "ewc_loss": 0.03964012861251831, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015226064715534449, "grad_norm": 4.934388160705566, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.855053186416626, "num_tokens": 177392322.0, "step": 4647 }, { "epoch": 0.591273374888691, "ewc_loss": 0.039620235562324524, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015206175157800317, "grad_norm": 4.9926629066467285, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8559170365333557, "num_tokens": 177432013.0, "step": 4648 }, { "epoch": 0.5914005851672816, "ewc_loss": 0.039648693054914474, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015234629972837865, "grad_norm": 4.901077747344971, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8703352212905884, "num_tokens": 177473542.0, "step": 4649 }, { "epoch": 0.591527795445872, "ewc_loss": 0.0395941287279129, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015180067566689104, "grad_norm": 4.945376873016357, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8542121648788452, "num_tokens": 177516590.0, "step": 4650 }, { "epoch": 0.5916550057244625, "ewc_loss": 0.03966525197029114, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015251190052367747, "grad_norm": 4.964940547943115, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8712103366851807, "num_tokens": 177553750.0, "step": 4651 }, { "epoch": 0.591782216003053, "ewc_loss": 0.03960646316409111, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015192400314845145, "grad_norm": 4.980882167816162, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8515114784240723, "num_tokens": 177586093.0, "step": 4652 }, { "epoch": 0.5919094262816436, "ewc_loss": 0.03968873992562294, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015274678298737854, "grad_norm": 4.945939540863037, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8531070351600647, "num_tokens": 177626800.0, "step": 4653 }, { "epoch": 0.5920366365602341, "ewc_loss": 0.039520371705293655, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015228379925247282, "grad_norm": 4.935774326324463, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8556918501853943, "num_tokens": 177662336.0, "step": 4654 }, { "epoch": 0.5921638468388246, "ewc_loss": 0.039540763944387436, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015248772979248315, "grad_norm": 4.947246074676514, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8654558062553406, "num_tokens": 177706922.0, "step": 4655 }, { "epoch": 0.592291057117415, "ewc_loss": 0.03955497965216637, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015262988745234907, "grad_norm": 4.981407642364502, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8556704521179199, "num_tokens": 177743922.0, "step": 4656 }, { "epoch": 0.5924182673960056, "ewc_loss": 0.03955955058336258, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.00015267559501808137, "grad_norm": 4.95617151260376, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8654651045799255, "num_tokens": 177777069.0, "step": 4657 }, { "epoch": 0.5925454776745961, "ewc_loss": 0.03957673907279968, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001528474676888436, "grad_norm": 4.977949142456055, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8667817115783691, "num_tokens": 177810959.0, "step": 4658 }, { "epoch": 0.5926726879531866, "ewc_loss": 0.039578065276145935, "ewc_loss_diag": 2.4318695068359375e-05, "ewc_loss_parallel": 0.0001528607535874471, "grad_norm": 4.987697124481201, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.847474217414856, "num_tokens": 177846366.0, "step": 4659 }, { "epoch": 0.5927998982317771, "ewc_loss": 0.03969722241163254, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015283162065315992, "grad_norm": 4.960544586181641, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8702023029327393, "num_tokens": 177881450.0, "step": 4660 }, { "epoch": 0.5929271085103677, "ewc_loss": 0.03968757390975952, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015273512690328062, "grad_norm": 4.941257476806641, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8588055968284607, "num_tokens": 177922931.0, "step": 4661 }, { "epoch": 0.5930543187889582, "ewc_loss": 0.039691463112831116, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015277402417268604, "grad_norm": 4.985846519470215, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8404390215873718, "num_tokens": 177962722.0, "step": 4662 }, { "epoch": 0.5931815290675486, "ewc_loss": 0.03969065472483635, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015276591875590384, "grad_norm": 4.950570583343506, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8559170961380005, "num_tokens": 177999956.0, "step": 4663 }, { "epoch": 0.5933087393461391, "ewc_loss": 0.039678994566202164, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015264931425917894, "grad_norm": 4.980165004730225, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8479424715042114, "num_tokens": 178032450.0, "step": 4664 }, { "epoch": 0.5934359496247297, "ewc_loss": 0.039743099361658096, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015329035522881895, "grad_norm": 5.0261101722717285, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8576326966285706, "num_tokens": 178075028.0, "step": 4665 }, { "epoch": 0.5935631599033202, "ewc_loss": 0.03968644514679909, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.0001527238346170634, "grad_norm": 4.973084449768066, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8525975346565247, "num_tokens": 178108267.0, "step": 4666 }, { "epoch": 0.5936903701819107, "ewc_loss": 0.03969060257077217, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015276539488695562, "grad_norm": 4.932058811187744, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8587250113487244, "num_tokens": 178148091.0, "step": 4667 }, { "epoch": 0.5938175804605013, "ewc_loss": 0.03969690576195717, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015282841923180968, "grad_norm": 4.995835304260254, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8707325458526611, "num_tokens": 178185626.0, "step": 4668 }, { "epoch": 0.5939447907390917, "ewc_loss": 0.0397125780582428, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.0001529851433588192, "grad_norm": 4.938417911529541, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8653805255889893, "num_tokens": 178224165.0, "step": 4669 }, { "epoch": 0.5940720010176822, "ewc_loss": 0.03969837725162506, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.000152843160321936, "grad_norm": 5.045113563537598, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8569192886352539, "num_tokens": 178264118.0, "step": 4670 }, { "epoch": 0.5941992112962727, "ewc_loss": 0.03974984213709831, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.0001533577888039872, "grad_norm": 5.103566646575928, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8686931133270264, "num_tokens": 178295634.0, "step": 4671 }, { "epoch": 0.5943264215748633, "ewc_loss": 0.03967813029885292, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015264068497344851, "grad_norm": 5.023046493530273, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8648584485054016, "num_tokens": 178327076.0, "step": 4672 }, { "epoch": 0.5944536318534538, "ewc_loss": 0.03964722529053688, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015233161684591323, "grad_norm": 5.025757789611816, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8492367267608643, "num_tokens": 178357469.0, "step": 4673 }, { "epoch": 0.5945808421320443, "ewc_loss": 0.03967393934726715, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.0001525987609056756, "grad_norm": 4.930333614349365, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.863782525062561, "num_tokens": 178393830.0, "step": 4674 }, { "epoch": 0.5947080524106347, "ewc_loss": 0.039636239409446716, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.0001522217644378543, "grad_norm": 5.0056939125061035, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8574178218841553, "num_tokens": 178427560.0, "step": 4675 }, { "epoch": 0.5948352626892253, "ewc_loss": 0.03973368555307388, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015319623344112188, "grad_norm": 5.01712703704834, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8589774370193481, "num_tokens": 178464666.0, "step": 4676 }, { "epoch": 0.5949624729678158, "ewc_loss": 0.03968215361237526, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015268090646713972, "grad_norm": 4.903411388397217, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8479467630386353, "num_tokens": 178508771.0, "step": 4677 }, { "epoch": 0.5950896832464063, "ewc_loss": 0.03978065028786659, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015244516544044018, "grad_norm": 4.93514347076416, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8621349334716797, "num_tokens": 178548516.0, "step": 4678 }, { "epoch": 0.5952168935249968, "ewc_loss": 0.0398503839969635, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015314252232201397, "grad_norm": 4.94650936126709, "learning_rate": 1e-06, "loss": 0.581, "mean_token_accuracy": 0.8227707147598267, "num_tokens": 178591276.0, "step": 4679 }, { "epoch": 0.5953441038035874, "ewc_loss": 0.03983994573354721, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015303811233025044, "grad_norm": 4.931326389312744, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8566421270370483, "num_tokens": 178632580.0, "step": 4680 }, { "epoch": 0.5954713140821778, "ewc_loss": 0.039764221757650375, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015350159083027393, "grad_norm": 4.989251136779785, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8534327745437622, "num_tokens": 178672858.0, "step": 4681 }, { "epoch": 0.5955985243607683, "ewc_loss": 0.039760708808898926, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015346647705882788, "grad_norm": 4.968990325927734, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8588405251502991, "num_tokens": 178706435.0, "step": 4682 }, { "epoch": 0.5957257346393589, "ewc_loss": 0.03977353870868683, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015359476674348116, "grad_norm": 4.9352216720581055, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8548674583435059, "num_tokens": 178746309.0, "step": 4683 }, { "epoch": 0.5958529449179494, "ewc_loss": 0.039851970970630646, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015315839846152812, "grad_norm": 5.0211286544799805, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8390871286392212, "num_tokens": 178787639.0, "step": 4684 }, { "epoch": 0.5959801551965399, "ewc_loss": 0.03988436609506607, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015348235319834203, "grad_norm": 4.925358295440674, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8627345561981201, "num_tokens": 178828271.0, "step": 4685 }, { "epoch": 0.5961073654751304, "ewc_loss": 0.039852168411016464, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015316034841816872, "grad_norm": 5.017462253570557, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8785185813903809, "num_tokens": 178862862.0, "step": 4686 }, { "epoch": 0.5962345757537209, "ewc_loss": 0.039858296513557434, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015322165563702583, "grad_norm": 4.928738594055176, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8695967793464661, "num_tokens": 178897969.0, "step": 4687 }, { "epoch": 0.5963617860323114, "ewc_loss": 0.03979703411459923, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001526090200059116, "grad_norm": 4.932317733764648, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8617451190948486, "num_tokens": 178942002.0, "step": 4688 }, { "epoch": 0.5964889963109019, "ewc_loss": 0.039702609181404114, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015288546273950487, "grad_norm": 4.933472156524658, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8673921227455139, "num_tokens": 178983473.0, "step": 4689 }, { "epoch": 0.5966162065894924, "ewc_loss": 0.039714571088552475, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015300509403459728, "grad_norm": 5.010976791381836, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.84662264585495, "num_tokens": 179022370.0, "step": 4690 }, { "epoch": 0.596743416868083, "ewc_loss": 0.03986068442463875, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015324550622608513, "grad_norm": 4.889267921447754, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8733471035957336, "num_tokens": 179064841.0, "step": 4691 }, { "epoch": 0.5968706271466735, "ewc_loss": 0.039733827114105225, "ewc_loss_diag": 2.4437904357910156e-05, "ewc_loss_parallel": 0.00015319765952881426, "grad_norm": 5.039359092712402, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.82999587059021, "num_tokens": 179099399.0, "step": 4692 }, { "epoch": 0.5969978374252639, "ewc_loss": 0.03993404656648636, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.000153979126480408, "grad_norm": 5.022493839263916, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8491021394729614, "num_tokens": 179133969.0, "step": 4693 }, { "epoch": 0.5971250477038544, "ewc_loss": 0.03986058384180069, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015324453124776483, "grad_norm": 5.024522304534912, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8554975986480713, "num_tokens": 179165456.0, "step": 4694 }, { "epoch": 0.597252257982445, "ewc_loss": 0.039860136806964874, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001532400492578745, "grad_norm": 4.991833686828613, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8630213737487793, "num_tokens": 179198209.0, "step": 4695 }, { "epoch": 0.5973794682610355, "ewc_loss": 0.03993428498506546, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001539815275464207, "grad_norm": 5.035123348236084, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8419444561004639, "num_tokens": 179233872.0, "step": 4696 }, { "epoch": 0.597506678539626, "ewc_loss": 0.03994818776845932, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015412052744068205, "grad_norm": 4.943023681640625, "learning_rate": 1e-06, "loss": 0.3803, "mean_token_accuracy": 0.8784395456314087, "num_tokens": 179272721.0, "step": 4697 }, { "epoch": 0.5976338888182166, "ewc_loss": 0.03988267481327057, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015346542932093143, "grad_norm": 5.060099124908447, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8566519618034363, "num_tokens": 179305781.0, "step": 4698 }, { "epoch": 0.597761099096807, "ewc_loss": 0.0399538092315197, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015417675604112446, "grad_norm": 4.913306713104248, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8619484901428223, "num_tokens": 179350049.0, "step": 4699 }, { "epoch": 0.5978883093753975, "ewc_loss": 0.03992098569869995, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015384853759314865, "grad_norm": 5.024300575256348, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8576476573944092, "num_tokens": 179388063.0, "step": 4700 }, { "epoch": 0.598015519653988, "ewc_loss": 0.039987243711948395, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015451111539732665, "grad_norm": 4.963467597961426, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8646550178527832, "num_tokens": 179427607.0, "step": 4701 }, { "epoch": 0.5981427299325786, "ewc_loss": 0.039923034608364105, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001538690266897902, "grad_norm": 5.000840187072754, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8410018682479858, "num_tokens": 179471456.0, "step": 4702 }, { "epoch": 0.5982699402111691, "ewc_loss": 0.04010814428329468, "ewc_loss_diag": 2.467632293701172e-05, "ewc_loss_parallel": 0.0001544994011055678, "grad_norm": 4.981735706329346, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8691533803939819, "num_tokens": 179514318.0, "step": 4703 }, { "epoch": 0.5983971504897596, "ewc_loss": 0.03992963954806328, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015393507783301175, "grad_norm": 4.984240531921387, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8707016110420227, "num_tokens": 179553307.0, "step": 4704 }, { "epoch": 0.59852436076835, "ewc_loss": 0.03995245695114136, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015416322275996208, "grad_norm": 4.949743747711182, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8503393530845642, "num_tokens": 179597288.0, "step": 4705 }, { "epoch": 0.5986515710469406, "ewc_loss": 0.03987091779708862, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015334786439780146, "grad_norm": 5.0887770652771, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8577989339828491, "num_tokens": 179629583.0, "step": 4706 }, { "epoch": 0.5987787813255311, "ewc_loss": 0.039993345737457275, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015457211702596396, "grad_norm": 4.987598896026611, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8529028296470642, "num_tokens": 179672104.0, "step": 4707 }, { "epoch": 0.5989059916041216, "ewc_loss": 0.039824455976486206, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015288325084839016, "grad_norm": 5.001536846160889, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8489103317260742, "num_tokens": 179712951.0, "step": 4708 }, { "epoch": 0.5990332018827121, "ewc_loss": 0.03991994634270668, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001538381475256756, "grad_norm": 4.973945617675781, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8566542267799377, "num_tokens": 179754626.0, "step": 4709 }, { "epoch": 0.5991604121613027, "ewc_loss": 0.03987245634198189, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015336323122028261, "grad_norm": 5.004374027252197, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.864138126373291, "num_tokens": 179790068.0, "step": 4710 }, { "epoch": 0.5992876224398932, "ewc_loss": 0.039895132184028625, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015359000826720148, "grad_norm": 5.042603015899658, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8375288248062134, "num_tokens": 179823936.0, "step": 4711 }, { "epoch": 0.5994148327184836, "ewc_loss": 0.03998619690537453, "ewc_loss_diag": 2.467632293701172e-05, "ewc_loss_parallel": 0.00015327993605751544, "grad_norm": 5.00054407119751, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8592259287834167, "num_tokens": 179860920.0, "step": 4712 }, { "epoch": 0.5995420429970741, "ewc_loss": 0.039848361164331436, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015312228060793132, "grad_norm": 4.966391563415527, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8485145568847656, "num_tokens": 179899848.0, "step": 4713 }, { "epoch": 0.5996692532756647, "ewc_loss": 0.03986632078886032, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015330188034567982, "grad_norm": 5.023308277130127, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8749757409095764, "num_tokens": 179934462.0, "step": 4714 }, { "epoch": 0.5997964635542552, "ewc_loss": 0.03988893702626228, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001535280462121591, "grad_norm": 4.98105525970459, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8577741384506226, "num_tokens": 179973955.0, "step": 4715 }, { "epoch": 0.5999236738328457, "ewc_loss": 0.03985781967639923, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015321685350500047, "grad_norm": 4.982020854949951, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8596755266189575, "num_tokens": 180015256.0, "step": 4716 }, { "epoch": 0.6000508841114363, "ewc_loss": 0.03989555314183235, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015359421377070248, "grad_norm": 4.928786754608154, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8651880025863647, "num_tokens": 180054145.0, "step": 4717 }, { "epoch": 0.6001780943900267, "ewc_loss": 0.03986959159374237, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015333460760302842, "grad_norm": 4.9423651695251465, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.862293004989624, "num_tokens": 180093411.0, "step": 4718 }, { "epoch": 0.6003053046686172, "ewc_loss": 0.04003900662064552, "ewc_loss_diag": 2.467632293701172e-05, "ewc_loss_parallel": 0.0001538080396130681, "grad_norm": 4.94621467590332, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8680745363235474, "num_tokens": 180135799.0, "step": 4719 }, { "epoch": 0.6004325149472077, "ewc_loss": 0.04005297273397446, "ewc_loss_diag": 2.467632293701172e-05, "ewc_loss_parallel": 0.0001539476797915995, "grad_norm": 5.004204750061035, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.851678192615509, "num_tokens": 180172623.0, "step": 4720 }, { "epoch": 0.6005597252257983, "ewc_loss": 0.03997329622507095, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015437163528986275, "grad_norm": 4.953678131103516, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8544620275497437, "num_tokens": 180213833.0, "step": 4721 }, { "epoch": 0.6006869355043888, "ewc_loss": 0.03992114216089249, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001538501091999933, "grad_norm": 4.993626594543457, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8652758598327637, "num_tokens": 180254982.0, "step": 4722 }, { "epoch": 0.6008141457829793, "ewc_loss": 0.0399656668305397, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015429535415023565, "grad_norm": 5.02911901473999, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8669261336326599, "num_tokens": 180288544.0, "step": 4723 }, { "epoch": 0.6009413560615697, "ewc_loss": 0.03990158811211586, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001536545460112393, "grad_norm": 4.955279350280762, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8556009531021118, "num_tokens": 180330233.0, "step": 4724 }, { "epoch": 0.6010685663401603, "ewc_loss": 0.039899639785289764, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001536350609967485, "grad_norm": 4.99000883102417, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8609747886657715, "num_tokens": 180361516.0, "step": 4725 }, { "epoch": 0.6011957766187508, "ewc_loss": 0.039893556386232376, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015357424854300916, "grad_norm": 4.935732841491699, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8568394780158997, "num_tokens": 180404380.0, "step": 4726 }, { "epoch": 0.6013229868973413, "ewc_loss": 0.03985537961125374, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015319246449507773, "grad_norm": 5.012378215789795, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8512459397315979, "num_tokens": 180436916.0, "step": 4727 }, { "epoch": 0.6014501971759318, "ewc_loss": 0.039946869015693665, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015410735795740038, "grad_norm": 4.970016956329346, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8406577110290527, "num_tokens": 180476895.0, "step": 4728 }, { "epoch": 0.6015774074545224, "ewc_loss": 0.03989554941654205, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015359418466687202, "grad_norm": 5.030043601989746, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8575762510299683, "num_tokens": 180515248.0, "step": 4729 }, { "epoch": 0.6017046177331128, "ewc_loss": 0.03994818031787872, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001541204546811059, "grad_norm": 4.928635597229004, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8649448156356812, "num_tokens": 180554575.0, "step": 4730 }, { "epoch": 0.6018318280117033, "ewc_loss": 0.03987840190529823, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015342269034590572, "grad_norm": 5.008027076721191, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8486288785934448, "num_tokens": 180594485.0, "step": 4731 }, { "epoch": 0.6019590382902938, "ewc_loss": 0.03998241201043129, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015446278848685324, "grad_norm": 4.969797611236572, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8438671231269836, "num_tokens": 180634746.0, "step": 4732 }, { "epoch": 0.6020862485688844, "ewc_loss": 0.03990120440721512, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015365073340944946, "grad_norm": 5.019967555999756, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8483445644378662, "num_tokens": 180674946.0, "step": 4733 }, { "epoch": 0.6022134588474749, "ewc_loss": 0.039948005229234695, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015411872300319374, "grad_norm": 4.923429012298584, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8582852482795715, "num_tokens": 180715538.0, "step": 4734 }, { "epoch": 0.6023406691260654, "ewc_loss": 0.0399165004491806, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001538036740384996, "grad_norm": 4.993311882019043, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.85760098695755, "num_tokens": 180755298.0, "step": 4735 }, { "epoch": 0.6024678794046558, "ewc_loss": 0.039983510971069336, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015447377518285066, "grad_norm": 4.996688365936279, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8517857789993286, "num_tokens": 180797104.0, "step": 4736 }, { "epoch": 0.6025950896832464, "ewc_loss": 0.040037382394075394, "ewc_loss_diag": 2.467632293701172e-05, "ewc_loss_parallel": 0.00015379178512375802, "grad_norm": 4.929314136505127, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8686264157295227, "num_tokens": 180839845.0, "step": 4737 }, { "epoch": 0.6027222999618369, "ewc_loss": 0.03993716090917587, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015401028213091195, "grad_norm": 4.973998546600342, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8718993663787842, "num_tokens": 180878849.0, "step": 4738 }, { "epoch": 0.6028495102404274, "ewc_loss": 0.03996099904179573, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015424867160618305, "grad_norm": 5.010690689086914, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8571637868881226, "num_tokens": 180918427.0, "step": 4739 }, { "epoch": 0.602976720519018, "ewc_loss": 0.039944954216480255, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015408819308504462, "grad_norm": 4.994258880615234, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8506606221199036, "num_tokens": 180956206.0, "step": 4740 }, { "epoch": 0.6031039307976085, "ewc_loss": 0.039902739226818085, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015366605657618493, "grad_norm": 4.96987771987915, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.864258885383606, "num_tokens": 180996945.0, "step": 4741 }, { "epoch": 0.6032311410761989, "ewc_loss": 0.03990337997674942, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015367248852271587, "grad_norm": 5.0130791664123535, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8625693321228027, "num_tokens": 181031763.0, "step": 4742 }, { "epoch": 0.6033583513547894, "ewc_loss": 0.039944082498550415, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015407952014356852, "grad_norm": 5.000494003295898, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8654650449752808, "num_tokens": 181070661.0, "step": 4743 }, { "epoch": 0.60348556163338, "ewc_loss": 0.03993643820285797, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015400304982904345, "grad_norm": 4.997524738311768, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8455615043640137, "num_tokens": 181112769.0, "step": 4744 }, { "epoch": 0.6036127719119705, "ewc_loss": 0.03990772366523743, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015371592598967254, "grad_norm": 4.98988676071167, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8593897819519043, "num_tokens": 181151155.0, "step": 4745 }, { "epoch": 0.603739982190561, "ewc_loss": 0.03990988805890083, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015373756468761712, "grad_norm": 4.997664928436279, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.858099102973938, "num_tokens": 181188689.0, "step": 4746 }, { "epoch": 0.6038671924691515, "ewc_loss": 0.03989163786172867, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015355504001490772, "grad_norm": 5.004951000213623, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8626853823661804, "num_tokens": 181230031.0, "step": 4747 }, { "epoch": 0.603994402747742, "ewc_loss": 0.03987889364361763, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001534276088932529, "grad_norm": 4.994147777557373, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8576879501342773, "num_tokens": 181271464.0, "step": 4748 }, { "epoch": 0.6041216130263325, "ewc_loss": 0.03991517797112465, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.000153790446347557, "grad_norm": 5.0147809982299805, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.852803111076355, "num_tokens": 181312607.0, "step": 4749 }, { "epoch": 0.604248823304923, "ewc_loss": 0.03992420807480812, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015388075553346425, "grad_norm": 5.003880023956299, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8667173385620117, "num_tokens": 181348628.0, "step": 4750 }, { "epoch": 0.6043760335835135, "ewc_loss": 0.03988320752978325, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015347075532190502, "grad_norm": 5.014347076416016, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8507922887802124, "num_tokens": 181387975.0, "step": 4751 }, { "epoch": 0.6045032438621041, "ewc_loss": 0.03992664813995361, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015390517364721745, "grad_norm": 5.019142150878906, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8714138269424438, "num_tokens": 181424163.0, "step": 4752 }, { "epoch": 0.6046304541406946, "ewc_loss": 0.03993320092558861, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015397067181766033, "grad_norm": 5.0514984130859375, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8483785390853882, "num_tokens": 181457002.0, "step": 4753 }, { "epoch": 0.604757664419285, "ewc_loss": 0.039962995797395706, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015426862228196114, "grad_norm": 4.995506763458252, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8533118963241577, "num_tokens": 181494410.0, "step": 4754 }, { "epoch": 0.6048848746978756, "ewc_loss": 0.03993060067296028, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015394468209706247, "grad_norm": 5.009623050689697, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8547218441963196, "num_tokens": 181534595.0, "step": 4755 }, { "epoch": 0.6050120849764661, "ewc_loss": 0.039910025894641876, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015373891801573336, "grad_norm": 5.005784511566162, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8640965819358826, "num_tokens": 181570654.0, "step": 4756 }, { "epoch": 0.6051392952550566, "ewc_loss": 0.03994910418987274, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015412970969919115, "grad_norm": 5.038750648498535, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8518449664115906, "num_tokens": 181608366.0, "step": 4757 }, { "epoch": 0.6052665055336471, "ewc_loss": 0.039957694709300995, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015421563875861466, "grad_norm": 4.992175102233887, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8460866212844849, "num_tokens": 181647168.0, "step": 4758 }, { "epoch": 0.6053937158122377, "ewc_loss": 0.039973847568035126, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015437713591381907, "grad_norm": 4.996424198150635, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8614888191223145, "num_tokens": 181689177.0, "step": 4759 }, { "epoch": 0.6055209260908282, "ewc_loss": 0.039935603737831116, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015399472613353282, "grad_norm": 5.0938239097595215, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8629567623138428, "num_tokens": 181723742.0, "step": 4760 }, { "epoch": 0.6056481363694186, "ewc_loss": 0.040025681257247925, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015489548968616873, "grad_norm": 4.983423233032227, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8676374554634094, "num_tokens": 181761539.0, "step": 4761 }, { "epoch": 0.6057753466480091, "ewc_loss": 0.03988179191946983, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001534565817564726, "grad_norm": 5.0192694664001465, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8559472560882568, "num_tokens": 181799674.0, "step": 4762 }, { "epoch": 0.6059025569265997, "ewc_loss": 0.03997340798377991, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015437275578733534, "grad_norm": 4.9661078453063965, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8456459641456604, "num_tokens": 181838784.0, "step": 4763 }, { "epoch": 0.6060297672051902, "ewc_loss": 0.039939239621162415, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015403107681777328, "grad_norm": 5.0586066246032715, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8541460037231445, "num_tokens": 181875669.0, "step": 4764 }, { "epoch": 0.6061569774837807, "ewc_loss": 0.04002343863248825, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015487307973671705, "grad_norm": 5.023951053619385, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8414555191993713, "num_tokens": 181912450.0, "step": 4765 }, { "epoch": 0.6062841877623713, "ewc_loss": 0.03993919491767883, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001540306257084012, "grad_norm": 5.085426330566406, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8573303818702698, "num_tokens": 181947691.0, "step": 4766 }, { "epoch": 0.6064113980409617, "ewc_loss": 0.039999596774578094, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001546346175018698, "grad_norm": 5.037824630737305, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.862407386302948, "num_tokens": 181982364.0, "step": 4767 }, { "epoch": 0.6065386083195522, "ewc_loss": 0.03992844372987747, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015392311615869403, "grad_norm": 5.068187236785889, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8577248454093933, "num_tokens": 182017071.0, "step": 4768 }, { "epoch": 0.6066658185981427, "ewc_loss": 0.039982009679079056, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001544587721582502, "grad_norm": 4.980155944824219, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8587380051612854, "num_tokens": 182057234.0, "step": 4769 }, { "epoch": 0.6067930288767333, "ewc_loss": 0.039936140179634094, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001540000957902521, "grad_norm": 4.97546911239624, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8698514103889465, "num_tokens": 182092450.0, "step": 4770 }, { "epoch": 0.6069202391553238, "ewc_loss": 0.04000161215662956, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015465480100829154, "grad_norm": 5.041618347167969, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8498364686965942, "num_tokens": 182126398.0, "step": 4771 }, { "epoch": 0.6070474494339143, "ewc_loss": 0.03998555988073349, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001544942642794922, "grad_norm": 5.015837669372559, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8727487325668335, "num_tokens": 182160848.0, "step": 4772 }, { "epoch": 0.6071746597125047, "ewc_loss": 0.04000934958457947, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.000154732188093476, "grad_norm": 4.957950115203857, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8669490814208984, "num_tokens": 182202596.0, "step": 4773 }, { "epoch": 0.6073018699910953, "ewc_loss": 0.03998652845621109, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015450394130311906, "grad_norm": 5.011441230773926, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8563099503517151, "num_tokens": 182239235.0, "step": 4774 }, { "epoch": 0.6074290802696858, "ewc_loss": 0.04003109037876129, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001549495937069878, "grad_norm": 4.996490001678467, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.869838535785675, "num_tokens": 182273662.0, "step": 4775 }, { "epoch": 0.6075562905482763, "ewc_loss": 0.04004760831594467, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015511475794482976, "grad_norm": 5.014313697814941, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8429911136627197, "num_tokens": 182311991.0, "step": 4776 }, { "epoch": 0.6076835008268668, "ewc_loss": 0.04005616158246994, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015520027955062687, "grad_norm": 4.997840881347656, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.859764575958252, "num_tokens": 182350722.0, "step": 4777 }, { "epoch": 0.6078107111054574, "ewc_loss": 0.04007956385612488, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015543430345132947, "grad_norm": 5.025495529174805, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8607624769210815, "num_tokens": 182386755.0, "step": 4778 }, { "epoch": 0.6079379213840478, "ewc_loss": 0.040130626410245895, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001559449447086081, "grad_norm": 5.019643306732178, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8739665746688843, "num_tokens": 182423128.0, "step": 4779 }, { "epoch": 0.6080651316626383, "ewc_loss": 0.04005636274814606, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015520231681875885, "grad_norm": 5.00502347946167, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8447067737579346, "num_tokens": 182461518.0, "step": 4780 }, { "epoch": 0.6081923419412288, "ewc_loss": 0.04008874297142029, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015552609693259, "grad_norm": 5.0166544914245605, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8579183220863342, "num_tokens": 182495707.0, "step": 4781 }, { "epoch": 0.6083195522198194, "ewc_loss": 0.04007696360349655, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015540828462690115, "grad_norm": 4.943948745727539, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8486754894256592, "num_tokens": 182539058.0, "step": 4782 }, { "epoch": 0.6084467624984099, "ewc_loss": 0.040058016777038574, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001552188186906278, "grad_norm": 5.086813449859619, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.843355655670166, "num_tokens": 182572482.0, "step": 4783 }, { "epoch": 0.6085739727770004, "ewc_loss": 0.040132924914360046, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.0001559679367346689, "grad_norm": 4.981576442718506, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8638695478439331, "num_tokens": 182607665.0, "step": 4784 }, { "epoch": 0.6087011830555908, "ewc_loss": 0.04007565230131149, "ewc_loss_diag": 2.4557113647460938e-05, "ewc_loss_parallel": 0.00015539521700702608, "grad_norm": 5.0120134353637695, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8594193458557129, "num_tokens": 182650503.0, "step": 4785 }, { "epoch": 0.6088283933341814, "ewc_loss": 0.04035813361406326, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015577861631754786, "grad_norm": 4.987936973571777, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8683204650878906, "num_tokens": 182681662.0, "step": 4786 }, { "epoch": 0.6089556036127719, "ewc_loss": 0.04035584256052971, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015575568249914795, "grad_norm": 5.051067352294922, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8466116189956665, "num_tokens": 182714912.0, "step": 4787 }, { "epoch": 0.6090828138913624, "ewc_loss": 0.040406517684459686, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015626246749889106, "grad_norm": 5.042566299438477, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8556006550788879, "num_tokens": 182748813.0, "step": 4788 }, { "epoch": 0.609210024169953, "ewc_loss": 0.040389128029346466, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015608854300808161, "grad_norm": 5.034004211425781, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8429702520370483, "num_tokens": 182786123.0, "step": 4789 }, { "epoch": 0.6093372344485435, "ewc_loss": 0.04038968309760094, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015609410183969885, "grad_norm": 5.0097503662109375, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8711956739425659, "num_tokens": 182820550.0, "step": 4790 }, { "epoch": 0.6094644447271339, "ewc_loss": 0.04040200263261795, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.0001562172983540222, "grad_norm": 5.100982189178467, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8584638237953186, "num_tokens": 182863135.0, "step": 4791 }, { "epoch": 0.6095916550057244, "ewc_loss": 0.04036828503012657, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015588011592626572, "grad_norm": 4.988137722015381, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8605245351791382, "num_tokens": 182905193.0, "step": 4792 }, { "epoch": 0.609718865284315, "ewc_loss": 0.04029169678688049, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015511424862779677, "grad_norm": 5.122661590576172, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8588680624961853, "num_tokens": 182944934.0, "step": 4793 }, { "epoch": 0.6098460755629055, "ewc_loss": 0.04034581035375595, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015565537614747882, "grad_norm": 4.992087364196777, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8452544212341309, "num_tokens": 182984604.0, "step": 4794 }, { "epoch": 0.609973285841496, "ewc_loss": 0.0402383878827095, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015458113921340555, "grad_norm": 5.032870769500732, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8562992811203003, "num_tokens": 183020421.0, "step": 4795 }, { "epoch": 0.6101004961200865, "ewc_loss": 0.04027663543820381, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015496360720135272, "grad_norm": 4.976517200469971, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8493890762329102, "num_tokens": 183057635.0, "step": 4796 }, { "epoch": 0.610227706398677, "ewc_loss": 0.04024241492152214, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015462141891475767, "grad_norm": 5.025890350341797, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8459833264350891, "num_tokens": 183094689.0, "step": 4797 }, { "epoch": 0.6103549166772675, "ewc_loss": 0.04027271643280983, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015492443344555795, "grad_norm": 5.022675514221191, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8635389804840088, "num_tokens": 183129118.0, "step": 4798 }, { "epoch": 0.610482126955858, "ewc_loss": 0.040267057716846466, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.0001548678264953196, "grad_norm": 5.033938407897949, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8572653532028198, "num_tokens": 183163365.0, "step": 4799 }, { "epoch": 0.6106093372344485, "ewc_loss": 0.040278252214193344, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015497978893108666, "grad_norm": 5.032569885253906, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8617185950279236, "num_tokens": 183199271.0, "step": 4800 }, { "epoch": 0.6107365475130391, "ewc_loss": 0.04030044376850128, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015520170563831925, "grad_norm": 4.985336780548096, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.8770358562469482, "num_tokens": 183234523.0, "step": 4801 }, { "epoch": 0.6108637577916296, "ewc_loss": 0.040250327438116074, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015470055222976953, "grad_norm": 5.047119140625, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8581270575523376, "num_tokens": 183267098.0, "step": 4802 }, { "epoch": 0.61099096807022, "ewc_loss": 0.04026718810200691, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015486915071960539, "grad_norm": 4.965164661407471, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8593316078186035, "num_tokens": 183306825.0, "step": 4803 }, { "epoch": 0.6111181783488105, "ewc_loss": 0.04029885306954384, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015518580039497465, "grad_norm": 5.0211567878723145, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8580101728439331, "num_tokens": 183348712.0, "step": 4804 }, { "epoch": 0.6112453886274011, "ewc_loss": 0.040297456085681915, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015517184510827065, "grad_norm": 5.006190299987793, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8703289031982422, "num_tokens": 183383797.0, "step": 4805 }, { "epoch": 0.6113725989059916, "ewc_loss": 0.04030408337712288, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.0001552380999783054, "grad_norm": 5.052276134490967, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8697755336761475, "num_tokens": 183417484.0, "step": 4806 }, { "epoch": 0.6114998091845821, "ewc_loss": 0.04031054675579071, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015530275413766503, "grad_norm": 5.012631416320801, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8639602661132812, "num_tokens": 183453808.0, "step": 4807 }, { "epoch": 0.6116270194631727, "ewc_loss": 0.04026737064123154, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015487098426092416, "grad_norm": 4.947176456451416, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8650544881820679, "num_tokens": 183496693.0, "step": 4808 }, { "epoch": 0.6117542297417632, "ewc_loss": 0.04027104750275612, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015490774239879102, "grad_norm": 5.002866268157959, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8605890274047852, "num_tokens": 183536727.0, "step": 4809 }, { "epoch": 0.6118814400203536, "ewc_loss": 0.04028366133570671, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.0001550338783999905, "grad_norm": 5.018603324890137, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8595291376113892, "num_tokens": 183571809.0, "step": 4810 }, { "epoch": 0.6120086502989441, "ewc_loss": 0.04026491567492485, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015484642062801868, "grad_norm": 5.046049118041992, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8577019572257996, "num_tokens": 183610761.0, "step": 4811 }, { "epoch": 0.6121358605775347, "ewc_loss": 0.04027784615755081, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015497574349865317, "grad_norm": 5.04149866104126, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8555619120597839, "num_tokens": 183648304.0, "step": 4812 }, { "epoch": 0.6122630708561252, "ewc_loss": 0.04023578763008118, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015455513494089246, "grad_norm": 5.03819465637207, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8539097309112549, "num_tokens": 183684201.0, "step": 4813 }, { "epoch": 0.6123902811347157, "ewc_loss": 0.04026130586862564, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.0001548103173263371, "grad_norm": 4.999309539794922, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8522043228149414, "num_tokens": 183726595.0, "step": 4814 }, { "epoch": 0.6125174914133062, "ewc_loss": 0.04021212086081505, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.0001543184625916183, "grad_norm": 5.032926559448242, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.867348313331604, "num_tokens": 183762222.0, "step": 4815 }, { "epoch": 0.6126447016918967, "ewc_loss": 0.04023512452840805, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015454849926754832, "grad_norm": 5.025907516479492, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8697716593742371, "num_tokens": 183797167.0, "step": 4816 }, { "epoch": 0.6127719119704872, "ewc_loss": 0.040222715586423874, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015442441508639604, "grad_norm": 4.9906392097473145, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8568837642669678, "num_tokens": 183839591.0, "step": 4817 }, { "epoch": 0.6128991222490777, "ewc_loss": 0.04022088646888733, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015440612332895398, "grad_norm": 5.025569915771484, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8591779470443726, "num_tokens": 183877862.0, "step": 4818 }, { "epoch": 0.6130263325276682, "ewc_loss": 0.04026350378990173, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015483230527024716, "grad_norm": 4.9743452072143555, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8633490800857544, "num_tokens": 183915964.0, "step": 4819 }, { "epoch": 0.6131535428062588, "ewc_loss": 0.040216151624917984, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015435877139680088, "grad_norm": 5.035449028015137, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8555172681808472, "num_tokens": 183952823.0, "step": 4820 }, { "epoch": 0.6132807530848493, "ewc_loss": 0.04029182344675064, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015511551464442164, "grad_norm": 5.00321626663208, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8499714732170105, "num_tokens": 183991650.0, "step": 4821 }, { "epoch": 0.6134079633634397, "ewc_loss": 0.040225908160209656, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.0001544563565403223, "grad_norm": 5.0089545249938965, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8597112894058228, "num_tokens": 184027768.0, "step": 4822 }, { "epoch": 0.6135351736420303, "ewc_loss": 0.04032154753804207, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015541275206487626, "grad_norm": 5.0557756423950195, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8701574802398682, "num_tokens": 184063754.0, "step": 4823 }, { "epoch": 0.6136623839206208, "ewc_loss": 0.04029027372598648, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015509998775087297, "grad_norm": 4.997290134429932, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8528159856796265, "num_tokens": 184105286.0, "step": 4824 }, { "epoch": 0.6137895941992113, "ewc_loss": 0.04026004672050476, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015479772991966456, "grad_norm": 5.053731918334961, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8633158206939697, "num_tokens": 184139312.0, "step": 4825 }, { "epoch": 0.6139168044778018, "ewc_loss": 0.04031411558389664, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015533842088188976, "grad_norm": 4.978909492492676, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8575466871261597, "num_tokens": 184181541.0, "step": 4826 }, { "epoch": 0.6140440147563924, "ewc_loss": 0.0402294360101223, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015449161583092064, "grad_norm": 5.156223297119141, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8586336374282837, "num_tokens": 184216857.0, "step": 4827 }, { "epoch": 0.6141712250349828, "ewc_loss": 0.04037145897746086, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.000155911868205294, "grad_norm": 5.007795333862305, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8630058765411377, "num_tokens": 184253318.0, "step": 4828 }, { "epoch": 0.6142984353135733, "ewc_loss": 0.04050421342253685, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001547980064060539, "grad_norm": 5.003880500793457, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8699995279312134, "num_tokens": 184298625.0, "step": 4829 }, { "epoch": 0.6144256455921638, "ewc_loss": 0.0405765138566494, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015552100376226008, "grad_norm": 5.1142354011535645, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8567204475402832, "num_tokens": 184334569.0, "step": 4830 }, { "epoch": 0.6145528558707544, "ewc_loss": 0.040342140942811966, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015561866166535765, "grad_norm": 5.001021862030029, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8418471813201904, "num_tokens": 184377477.0, "step": 4831 }, { "epoch": 0.6146800661493449, "ewc_loss": 0.040361519902944565, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015459176211152226, "grad_norm": 5.032022953033447, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.84605872631073, "num_tokens": 184416671.0, "step": 4832 }, { "epoch": 0.6148072764279354, "ewc_loss": 0.040595345199108124, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015570930554531515, "grad_norm": 5.0433831214904785, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8539363145828247, "num_tokens": 184453174.0, "step": 4833 }, { "epoch": 0.6149344867065258, "ewc_loss": 0.04060373082756996, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015579316823277622, "grad_norm": 5.125008583068848, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8655683994293213, "num_tokens": 184488313.0, "step": 4834 }, { "epoch": 0.6150616969851164, "ewc_loss": 0.04059908539056778, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001557467330712825, "grad_norm": 4.989441871643066, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8389468193054199, "num_tokens": 184523733.0, "step": 4835 }, { "epoch": 0.6151889072637069, "ewc_loss": 0.04053489863872528, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015510486264247447, "grad_norm": 5.013507843017578, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8551119565963745, "num_tokens": 184568399.0, "step": 4836 }, { "epoch": 0.6153161175422974, "ewc_loss": 0.04063567891716957, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001561126409796998, "grad_norm": 5.043801784515381, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8577187061309814, "num_tokens": 184603302.0, "step": 4837 }, { "epoch": 0.615443327820888, "ewc_loss": 0.040433332324028015, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015530988457612693, "grad_norm": 4.971683025360107, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8787933588027954, "num_tokens": 184637346.0, "step": 4838 }, { "epoch": 0.6155705380994785, "ewc_loss": 0.0405878946185112, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015563481429126114, "grad_norm": 5.1027703285217285, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8655981421470642, "num_tokens": 184671687.0, "step": 4839 }, { "epoch": 0.6156977483780689, "ewc_loss": 0.040653981268405914, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015629566041752696, "grad_norm": 5.028471946716309, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8679502606391907, "num_tokens": 184706867.0, "step": 4840 }, { "epoch": 0.6158249586566594, "ewc_loss": 0.04054431617259979, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015519899898208678, "grad_norm": 5.0272111892700195, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8546552658081055, "num_tokens": 184743916.0, "step": 4841 }, { "epoch": 0.61595216893525, "ewc_loss": 0.040655121207237244, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015630709822289646, "grad_norm": 4.9929375648498535, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8678526282310486, "num_tokens": 184783247.0, "step": 4842 }, { "epoch": 0.6160793792138405, "ewc_loss": 0.04057605564594269, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015551641990896314, "grad_norm": 4.991715908050537, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8596366047859192, "num_tokens": 184826501.0, "step": 4843 }, { "epoch": 0.616206589492431, "ewc_loss": 0.04063405469059944, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015609641559422016, "grad_norm": 5.009531497955322, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8570975065231323, "num_tokens": 184868743.0, "step": 4844 }, { "epoch": 0.6163337997710215, "ewc_loss": 0.04060628265142441, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015581866318825632, "grad_norm": 5.037585258483887, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8542318344116211, "num_tokens": 184908992.0, "step": 4845 }, { "epoch": 0.616461010049612, "ewc_loss": 0.040618740022182465, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015594327123835683, "grad_norm": 5.053387641906738, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.848118007183075, "num_tokens": 184942456.0, "step": 4846 }, { "epoch": 0.6165882203282025, "ewc_loss": 0.04036729037761688, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015587016241624951, "grad_norm": 5.1161789894104, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8490673303604126, "num_tokens": 184975834.0, "step": 4847 }, { "epoch": 0.616715430606793, "ewc_loss": 0.04065970703959465, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001563529367558658, "grad_norm": 5.024480819702148, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8808197975158691, "num_tokens": 185012295.0, "step": 4848 }, { "epoch": 0.6168426408853835, "ewc_loss": 0.04057272523641586, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001554831105750054, "grad_norm": 5.01737117767334, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8627875447273254, "num_tokens": 185049272.0, "step": 4849 }, { "epoch": 0.6169698511639741, "ewc_loss": 0.04060918092727661, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015584765060339123, "grad_norm": 5.027154922485352, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8708175420761108, "num_tokens": 185081054.0, "step": 4850 }, { "epoch": 0.6170970614425646, "ewc_loss": 0.04062001034617424, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015595596050843596, "grad_norm": 5.013543605804443, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8547550439834595, "num_tokens": 185119809.0, "step": 4851 }, { "epoch": 0.617224271721155, "ewc_loss": 0.040599625557661057, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.000155752117279917, "grad_norm": 5.006536960601807, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8696954250335693, "num_tokens": 185161462.0, "step": 4852 }, { "epoch": 0.6173514819997455, "ewc_loss": 0.04045689105987549, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015554549463558942, "grad_norm": 5.0737457275390625, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8506491184234619, "num_tokens": 185196238.0, "step": 4853 }, { "epoch": 0.6174786922783361, "ewc_loss": 0.04036536067724228, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.0001558508665766567, "grad_norm": 4.988247394561768, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8581120371818542, "num_tokens": 185239317.0, "step": 4854 }, { "epoch": 0.6176059025569266, "ewc_loss": 0.04044586420059204, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015543520567007363, "grad_norm": 5.039790153503418, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8684303760528564, "num_tokens": 185273555.0, "step": 4855 }, { "epoch": 0.6177331128355171, "ewc_loss": 0.04040388762950897, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.0001562361285323277, "grad_norm": 5.051452159881592, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.865812361240387, "num_tokens": 185309721.0, "step": 4856 }, { "epoch": 0.6178603231141077, "ewc_loss": 0.04032697156071663, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015546698705293238, "grad_norm": 5.047534465789795, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8513266444206238, "num_tokens": 185350846.0, "step": 4857 }, { "epoch": 0.6179875333926982, "ewc_loss": 0.04031379148364067, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015533519035670906, "grad_norm": 5.046225547790527, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8524240255355835, "num_tokens": 185392472.0, "step": 4858 }, { "epoch": 0.6181147436712886, "ewc_loss": 0.040304165333509445, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015523892943747342, "grad_norm": 5.014800548553467, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8548718690872192, "num_tokens": 185438250.0, "step": 4859 }, { "epoch": 0.6182419539498791, "ewc_loss": 0.040439918637275696, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015537574654445052, "grad_norm": 5.0168352127075195, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8554452657699585, "num_tokens": 185479455.0, "step": 4860 }, { "epoch": 0.6183691642284697, "ewc_loss": 0.0405484139919281, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001552399917272851, "grad_norm": 5.028207778930664, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8550348281860352, "num_tokens": 185518719.0, "step": 4861 }, { "epoch": 0.6184963745070602, "ewc_loss": 0.04045441001653671, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015552063996437937, "grad_norm": 5.0413923263549805, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8441557884216309, "num_tokens": 185556671.0, "step": 4862 }, { "epoch": 0.6186235847856507, "ewc_loss": 0.04045809060335159, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015555747086182237, "grad_norm": 5.025742053985596, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8558803796768188, "num_tokens": 185595063.0, "step": 4863 }, { "epoch": 0.6187507950642412, "ewc_loss": 0.04050163924694061, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001559929660288617, "grad_norm": 5.086313247680664, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8486588001251221, "num_tokens": 185635626.0, "step": 4864 }, { "epoch": 0.6188780053428317, "ewc_loss": 0.04044264554977417, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015540301683358848, "grad_norm": 5.030667304992676, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.847447395324707, "num_tokens": 185671687.0, "step": 4865 }, { "epoch": 0.6190052156214222, "ewc_loss": 0.04045649245381355, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015554149285890162, "grad_norm": 5.033204555511475, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8615642786026001, "num_tokens": 185706459.0, "step": 4866 }, { "epoch": 0.6191324259000127, "ewc_loss": 0.040458548814058304, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015556204016320407, "grad_norm": 5.03202486038208, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8764692544937134, "num_tokens": 185743162.0, "step": 4867 }, { "epoch": 0.6192596361786032, "ewc_loss": 0.040322527289390564, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015542256005574018, "grad_norm": 5.015291213989258, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8442046642303467, "num_tokens": 185784036.0, "step": 4868 }, { "epoch": 0.6193868464571938, "ewc_loss": 0.040449902415275574, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015547560178674757, "grad_norm": 5.0750508308410645, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.854675829410553, "num_tokens": 185821407.0, "step": 4869 }, { "epoch": 0.6195140567357843, "ewc_loss": 0.0405191108584404, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001561676908750087, "grad_norm": 5.1054229736328125, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8645315766334534, "num_tokens": 185857789.0, "step": 4870 }, { "epoch": 0.6196412670143747, "ewc_loss": 0.04047347605228424, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015571131370961666, "grad_norm": 5.019398212432861, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8699652552604675, "num_tokens": 185896989.0, "step": 4871 }, { "epoch": 0.6197684772929652, "ewc_loss": 0.04051848500967026, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015494073159061372, "grad_norm": 5.053997993469238, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8730760216712952, "num_tokens": 185938123.0, "step": 4872 }, { "epoch": 0.6198956875715558, "ewc_loss": 0.04041746258735657, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015515118138864636, "grad_norm": 5.012178421020508, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8590226769447327, "num_tokens": 185973440.0, "step": 4873 }, { "epoch": 0.6200228978501463, "ewc_loss": 0.04029958322644234, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.0001551931054564193, "grad_norm": 5.033615589141846, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8599703907966614, "num_tokens": 186010069.0, "step": 4874 }, { "epoch": 0.6201501081287368, "ewc_loss": 0.04042623192071915, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015523888578172773, "grad_norm": 5.058261871337891, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8529607653617859, "num_tokens": 186048365.0, "step": 4875 }, { "epoch": 0.6202773184073274, "ewc_loss": 0.04048214480280876, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015579799946863204, "grad_norm": 5.075901985168457, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8647881746292114, "num_tokens": 186085358.0, "step": 4876 }, { "epoch": 0.6204045286859178, "ewc_loss": 0.040481679141521454, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015579337195958942, "grad_norm": 5.083560943603516, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8629238605499268, "num_tokens": 186118476.0, "step": 4877 }, { "epoch": 0.6205317389645083, "ewc_loss": 0.04045313596725464, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015550790703855455, "grad_norm": 5.043450832366943, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8551636934280396, "num_tokens": 186155487.0, "step": 4878 }, { "epoch": 0.6206589492430988, "ewc_loss": 0.040484845638275146, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015582503692712635, "grad_norm": 5.053792953491211, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8640468716621399, "num_tokens": 186192941.0, "step": 4879 }, { "epoch": 0.6207861595216894, "ewc_loss": 0.0404711589217186, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001556881470605731, "grad_norm": 4.9954447746276855, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8618038892745972, "num_tokens": 186233860.0, "step": 4880 }, { "epoch": 0.6209133698002799, "ewc_loss": 0.04046519845724106, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015562855696771294, "grad_norm": 5.130508899688721, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8477262258529663, "num_tokens": 186273081.0, "step": 4881 }, { "epoch": 0.6210405800788704, "ewc_loss": 0.040503330528736115, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001560098462505266, "grad_norm": 4.986466407775879, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8639258742332458, "num_tokens": 186310596.0, "step": 4882 }, { "epoch": 0.6211677903574608, "ewc_loss": 0.04044586420059204, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015543520567007363, "grad_norm": 5.047458648681641, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8534854650497437, "num_tokens": 186346262.0, "step": 4883 }, { "epoch": 0.6212950006360514, "ewc_loss": 0.04056403785943985, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015661693760193884, "grad_norm": 5.061873435974121, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8603904843330383, "num_tokens": 186386508.0, "step": 4884 }, { "epoch": 0.6214222109146419, "ewc_loss": 0.04052732512354851, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001562498218845576, "grad_norm": 5.043009281158447, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8539568781852722, "num_tokens": 186425184.0, "step": 4885 }, { "epoch": 0.6215494211932324, "ewc_loss": 0.04050959274172783, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015607249224558473, "grad_norm": 5.051297187805176, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8570506572723389, "num_tokens": 186463072.0, "step": 4886 }, { "epoch": 0.621676631471823, "ewc_loss": 0.04052765667438507, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015625312516931444, "grad_norm": 5.047610282897949, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8679918050765991, "num_tokens": 186504062.0, "step": 4887 }, { "epoch": 0.6218038417504135, "ewc_loss": 0.04051302745938301, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015610683476552367, "grad_norm": 5.034241676330566, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8498530387878418, "num_tokens": 186544564.0, "step": 4888 }, { "epoch": 0.6219310520290039, "ewc_loss": 0.0404883548617363, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015586009249091148, "grad_norm": 5.02631139755249, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8446201086044312, "num_tokens": 186587196.0, "step": 4889 }, { "epoch": 0.6220582623075944, "ewc_loss": 0.0404675118625164, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015565167996101081, "grad_norm": 5.155003547668457, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8598378896713257, "num_tokens": 186620673.0, "step": 4890 }, { "epoch": 0.622185472586185, "ewc_loss": 0.04053281247615814, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015630466805305332, "grad_norm": 4.99076509475708, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8613379597663879, "num_tokens": 186654028.0, "step": 4891 }, { "epoch": 0.6223126828647755, "ewc_loss": 0.04041473940014839, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015512396930716932, "grad_norm": 5.007612705230713, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8563053607940674, "num_tokens": 186693395.0, "step": 4892 }, { "epoch": 0.622439893143366, "ewc_loss": 0.04039356857538223, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015613294090144336, "grad_norm": 5.009453773498535, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8408480882644653, "num_tokens": 186734440.0, "step": 4893 }, { "epoch": 0.6225671034219565, "ewc_loss": 0.04052905738353729, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001562671095598489, "grad_norm": 5.037525177001953, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8618494272232056, "num_tokens": 186769637.0, "step": 4894 }, { "epoch": 0.622694313700547, "ewc_loss": 0.04052850604057312, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015626163803972304, "grad_norm": 5.005006313323975, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8553017973899841, "num_tokens": 186810188.0, "step": 4895 }, { "epoch": 0.6228215239791375, "ewc_loss": 0.04051675274968147, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015614410222042352, "grad_norm": 5.071402549743652, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.84867262840271, "num_tokens": 186847512.0, "step": 4896 }, { "epoch": 0.622948734257728, "ewc_loss": 0.04052596166729927, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015623618673998863, "grad_norm": 5.059630870819092, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8678085207939148, "num_tokens": 186878721.0, "step": 4897 }, { "epoch": 0.6230759445363185, "ewc_loss": 0.04051259905099869, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001561025419505313, "grad_norm": 4.981517314910889, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8631750345230103, "num_tokens": 186920239.0, "step": 4898 }, { "epoch": 0.6232031548149091, "ewc_loss": 0.04063001275062561, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.000156056004925631, "grad_norm": 4.998816967010498, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8525716066360474, "num_tokens": 186964158.0, "step": 4899 }, { "epoch": 0.6233303650934996, "ewc_loss": 0.040657177567481995, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015632766007911414, "grad_norm": 4.994217395782471, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8740987181663513, "num_tokens": 187003176.0, "step": 4900 }, { "epoch": 0.62345757537209, "ewc_loss": 0.04068351536989212, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001565910060890019, "grad_norm": 5.0003461837768555, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8575921058654785, "num_tokens": 187046152.0, "step": 4901 }, { "epoch": 0.6235847856506805, "ewc_loss": 0.040713146328926086, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001568873121868819, "grad_norm": 5.038589954376221, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8704161643981934, "num_tokens": 187086629.0, "step": 4902 }, { "epoch": 0.6237119959292711, "ewc_loss": 0.04067926108837128, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015654845628887415, "grad_norm": 5.058006763458252, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8524729609489441, "num_tokens": 187122856.0, "step": 4903 }, { "epoch": 0.6238392062078616, "ewc_loss": 0.04075048863887787, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001572607725393027, "grad_norm": 5.032491683959961, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8718080520629883, "num_tokens": 187161568.0, "step": 4904 }, { "epoch": 0.6239664164864521, "ewc_loss": 0.0406789667904377, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015654553135391325, "grad_norm": 5.014942169189453, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.859418511390686, "num_tokens": 187202831.0, "step": 4905 }, { "epoch": 0.6240936267650427, "ewc_loss": 0.04071016609668732, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015685752441640943, "grad_norm": 5.017791271209717, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8652132153511047, "num_tokens": 187239059.0, "step": 4906 }, { "epoch": 0.6242208370436332, "ewc_loss": 0.04067442566156387, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015650011482648551, "grad_norm": 5.072292804718018, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8533298969268799, "num_tokens": 187272940.0, "step": 4907 }, { "epoch": 0.6243480473222236, "ewc_loss": 0.04059493541717529, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001569258893141523, "grad_norm": 5.114047050476074, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8722498416900635, "num_tokens": 187310963.0, "step": 4908 }, { "epoch": 0.6244752576008141, "ewc_loss": 0.04071955755352974, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015695142792537808, "grad_norm": 5.017554759979248, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.877703070640564, "num_tokens": 187345698.0, "step": 4909 }, { "epoch": 0.6246024678794047, "ewc_loss": 0.040673837065696716, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015649423585273325, "grad_norm": 5.049795150756836, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8567613363265991, "num_tokens": 187384762.0, "step": 4910 }, { "epoch": 0.6247296781579952, "ewc_loss": 0.040716856718063354, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015692441957071424, "grad_norm": 6.603848457336426, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8476523756980896, "num_tokens": 187424288.0, "step": 4911 }, { "epoch": 0.6248568884365857, "ewc_loss": 0.041960593312978745, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001693617959972471, "grad_norm": 5.111331939697266, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8670682907104492, "num_tokens": 187461481.0, "step": 4912 }, { "epoch": 0.6249840987151762, "ewc_loss": 0.0402744859457016, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015250069554895163, "grad_norm": 5.033454895019531, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8735870718955994, "num_tokens": 187497690.0, "step": 4913 }, { "epoch": 0.6251113089937667, "ewc_loss": 0.04084400832653046, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001594166678842157, "grad_norm": 5.096591949462891, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8534287810325623, "num_tokens": 187537825.0, "step": 4914 }, { "epoch": 0.6252385192723572, "ewc_loss": 0.04075321927666664, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015728804282844067, "grad_norm": 4.97154426574707, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8685548901557922, "num_tokens": 187583998.0, "step": 4915 }, { "epoch": 0.6253657295509477, "ewc_loss": 0.040629446506500244, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015727101708762348, "grad_norm": 5.029675006866455, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8570268154144287, "num_tokens": 187621962.0, "step": 4916 }, { "epoch": 0.6254929398295382, "ewc_loss": 0.04067574441432953, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001577340008225292, "grad_norm": 5.0767316818237305, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.865966796875, "num_tokens": 187655913.0, "step": 4917 }, { "epoch": 0.6256201501081288, "ewc_loss": 0.04068621248006821, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001578387018525973, "grad_norm": 5.06803560256958, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8498777747154236, "num_tokens": 187692433.0, "step": 4918 }, { "epoch": 0.6257473603867193, "ewc_loss": 0.04054545611143112, "ewc_loss_diag": 2.47955322265625e-05, "ewc_loss_parallel": 0.00015765181160531938, "grad_norm": 5.074996471405029, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8706134557723999, "num_tokens": 187724311.0, "step": 4919 }, { "epoch": 0.6258745706653097, "ewc_loss": 0.04069216549396515, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015789823373779655, "grad_norm": 4.9996843338012695, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8671995401382446, "num_tokens": 187766614.0, "step": 4920 }, { "epoch": 0.6260017809439002, "ewc_loss": 0.04074293375015259, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015718521899543703, "grad_norm": 5.014200210571289, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.870150089263916, "num_tokens": 187804847.0, "step": 4921 }, { "epoch": 0.6261289912224908, "ewc_loss": 0.04080396518111229, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001577955117681995, "grad_norm": 5.0300116539001465, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8603512048721313, "num_tokens": 187842791.0, "step": 4922 }, { "epoch": 0.6262562015010813, "ewc_loss": 0.040800366550683975, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015775952488183975, "grad_norm": 5.10218620300293, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8494154810905457, "num_tokens": 187878304.0, "step": 4923 }, { "epoch": 0.6263834117796718, "ewc_loss": 0.040808096528053284, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015783683920744807, "grad_norm": 5.024604797363281, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8609151840209961, "num_tokens": 187915022.0, "step": 4924 }, { "epoch": 0.6265106220582624, "ewc_loss": 0.040800102055072784, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015775686188135296, "grad_norm": 5.058465480804443, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8584299087524414, "num_tokens": 187950533.0, "step": 4925 }, { "epoch": 0.6266378323368528, "ewc_loss": 0.04068199545145035, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001577965304022655, "grad_norm": 5.019472122192383, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8555319309234619, "num_tokens": 187989893.0, "step": 4926 }, { "epoch": 0.6267650426154433, "ewc_loss": 0.04076331853866577, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001573890185682103, "grad_norm": 5.122061252593994, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8529307842254639, "num_tokens": 188025031.0, "step": 4927 }, { "epoch": 0.6268922528940338, "ewc_loss": 0.04068373516201973, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015781390538904816, "grad_norm": 4.998608589172363, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8514540195465088, "num_tokens": 188060797.0, "step": 4928 }, { "epoch": 0.6270194631726244, "ewc_loss": 0.04065070301294327, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015748359146527946, "grad_norm": 5.083977222442627, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8515212535858154, "num_tokens": 188096271.0, "step": 4929 }, { "epoch": 0.6271466734512149, "ewc_loss": 0.04071015864610672, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015807815361768007, "grad_norm": 5.034304618835449, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8599583506584167, "num_tokens": 188136697.0, "step": 4930 }, { "epoch": 0.6272738837298054, "ewc_loss": 0.04063938930630684, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015737046487629414, "grad_norm": 5.01063871383667, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8571153283119202, "num_tokens": 188174715.0, "step": 4931 }, { "epoch": 0.6274010940083958, "ewc_loss": 0.04066159948706627, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015759255620650947, "grad_norm": 5.037898063659668, "learning_rate": 1e-06, "loss": 0.5411, "mean_token_accuracy": 0.8372432589530945, "num_tokens": 188216379.0, "step": 4932 }, { "epoch": 0.6275283042869864, "ewc_loss": 0.04079065099358559, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015766236174385995, "grad_norm": 4.993642330169678, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8640305995941162, "num_tokens": 188252183.0, "step": 4933 }, { "epoch": 0.6276555145655769, "ewc_loss": 0.0408003032207489, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015775891370140016, "grad_norm": 5.015072822570801, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8615439534187317, "num_tokens": 188291128.0, "step": 4934 }, { "epoch": 0.6277827248441674, "ewc_loss": 0.04079846292734146, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015774049097672105, "grad_norm": 5.097548484802246, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.8416719436645508, "num_tokens": 188334256.0, "step": 4935 }, { "epoch": 0.627909935122758, "ewc_loss": 0.040821172297000885, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015796758816577494, "grad_norm": 5.017140865325928, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8513465523719788, "num_tokens": 188373207.0, "step": 4936 }, { "epoch": 0.6280371454013485, "ewc_loss": 0.040767375379800797, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015742960385978222, "grad_norm": 5.049251079559326, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8554596900939941, "num_tokens": 188414937.0, "step": 4937 }, { "epoch": 0.6281643556799389, "ewc_loss": 0.040778227150440216, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001575381465954706, "grad_norm": 5.037679672241211, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8551650643348694, "num_tokens": 188455398.0, "step": 4938 }, { "epoch": 0.6282915659585294, "ewc_loss": 0.040738098323345184, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015713686298113316, "grad_norm": 5.042091369628906, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8582184910774231, "num_tokens": 188496561.0, "step": 4939 }, { "epoch": 0.62841877623712, "ewc_loss": 0.040774375200271606, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015749958402011544, "grad_norm": 5.1120452880859375, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8638851642608643, "num_tokens": 188531318.0, "step": 4940 }, { "epoch": 0.6285459865157105, "ewc_loss": 0.04077538475394249, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015750971215311438, "grad_norm": 5.07509708404541, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8500226736068726, "num_tokens": 188566807.0, "step": 4941 }, { "epoch": 0.628673196794301, "ewc_loss": 0.04072842374444008, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015704009274486452, "grad_norm": 5.0805745124816895, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8540278673171997, "num_tokens": 188608062.0, "step": 4942 }, { "epoch": 0.6288004070728915, "ewc_loss": 0.040738143026828766, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015713729953859001, "grad_norm": 5.087150573730469, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8703100681304932, "num_tokens": 188644953.0, "step": 4943 }, { "epoch": 0.628927617351482, "ewc_loss": 0.040693387389183044, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001566897553857416, "grad_norm": 4.943780899047852, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.873856782913208, "num_tokens": 188687720.0, "step": 4944 }, { "epoch": 0.6290548276300725, "ewc_loss": 0.040578290820121765, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001567594736116007, "grad_norm": 5.059893608093262, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8519493341445923, "num_tokens": 188730077.0, "step": 4945 }, { "epoch": 0.629182037908663, "ewc_loss": 0.04065112769603729, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015748785517644137, "grad_norm": 5.041046619415283, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8640888929367065, "num_tokens": 188768306.0, "step": 4946 }, { "epoch": 0.6293092481872535, "ewc_loss": 0.040701232850551605, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015676820476073772, "grad_norm": 5.07282829284668, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8388150334358215, "num_tokens": 188807480.0, "step": 4947 }, { "epoch": 0.6294364584658441, "ewc_loss": 0.04076894372701645, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015744531992822886, "grad_norm": 5.200653076171875, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8487218618392944, "num_tokens": 188836383.0, "step": 4948 }, { "epoch": 0.6295636687444346, "ewc_loss": 0.04076763242483139, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015743219410069287, "grad_norm": 4.945720672607422, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8623571991920471, "num_tokens": 188876079.0, "step": 4949 }, { "epoch": 0.629690879023025, "ewc_loss": 0.04069150984287262, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015667095431126654, "grad_norm": 5.046214580535889, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.856259822845459, "num_tokens": 188912936.0, "step": 4950 }, { "epoch": 0.6298180893016155, "ewc_loss": 0.040837571024894714, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015813155914656818, "grad_norm": 5.072788238525391, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.857660174369812, "num_tokens": 188951475.0, "step": 4951 }, { "epoch": 0.6299452995802061, "ewc_loss": 0.040751226246356964, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015726812125649303, "grad_norm": 5.018637180328369, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8442937731742859, "num_tokens": 188996972.0, "step": 4952 }, { "epoch": 0.6300725098587966, "ewc_loss": 0.04077877104282379, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015754354535602033, "grad_norm": 5.053222179412842, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8774548768997192, "num_tokens": 189030706.0, "step": 4953 }, { "epoch": 0.6301997201373871, "ewc_loss": 0.040733106434345245, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015708693535998464, "grad_norm": 5.137026309967041, "learning_rate": 1e-06, "loss": 0.3798, "mean_token_accuracy": 0.8811664581298828, "num_tokens": 189065638.0, "step": 4954 }, { "epoch": 0.6303269304159776, "ewc_loss": 0.04075027257204056, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015725858975201845, "grad_norm": 5.079377174377441, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8722571134567261, "num_tokens": 189101747.0, "step": 4955 }, { "epoch": 0.6304541406945681, "ewc_loss": 0.0406946986913681, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015670285210944712, "grad_norm": 4.976084232330322, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8547110557556152, "num_tokens": 189142965.0, "step": 4956 }, { "epoch": 0.6305813509731586, "ewc_loss": 0.040664754807949066, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015640343190170825, "grad_norm": 5.035821914672852, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8635504245758057, "num_tokens": 189177822.0, "step": 4957 }, { "epoch": 0.6307085612517491, "ewc_loss": 0.0407174751162529, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015693063323851675, "grad_norm": 5.010162353515625, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8527113199234009, "num_tokens": 189215710.0, "step": 4958 }, { "epoch": 0.6308357715303397, "ewc_loss": 0.04064256697893143, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015740221715532243, "grad_norm": 5.1252570152282715, "learning_rate": 1e-06, "loss": 0.5524, "mean_token_accuracy": 0.8312308192253113, "num_tokens": 189250788.0, "step": 4959 }, { "epoch": 0.6309629818089302, "ewc_loss": 0.040644269436597824, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015741925744805485, "grad_norm": 5.099178791046143, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8556979894638062, "num_tokens": 189278451.0, "step": 4960 }, { "epoch": 0.6310901920875207, "ewc_loss": 0.04074399918317795, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015719585644546896, "grad_norm": 4.98471736907959, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8612631559371948, "num_tokens": 189316857.0, "step": 4961 }, { "epoch": 0.6312174023661112, "ewc_loss": 0.04063614830374718, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015733804320916533, "grad_norm": 4.985336780548096, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8697901964187622, "num_tokens": 189356600.0, "step": 4962 }, { "epoch": 0.6313446126447017, "ewc_loss": 0.04078485816717148, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015760443056933582, "grad_norm": 5.08704948425293, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8532713055610657, "num_tokens": 189393134.0, "step": 4963 }, { "epoch": 0.6314718229232922, "ewc_loss": 0.04084034636616707, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001581593242008239, "grad_norm": 4.9613542556762695, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8430041074752808, "num_tokens": 189433431.0, "step": 4964 }, { "epoch": 0.6315990332018827, "ewc_loss": 0.04079815745353699, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015773742052260786, "grad_norm": 5.058785915374756, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8440713882446289, "num_tokens": 189473738.0, "step": 4965 }, { "epoch": 0.6317262434804732, "ewc_loss": 0.04088307172060013, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001585865975357592, "grad_norm": 5.0345072746276855, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8723429441452026, "num_tokens": 189509930.0, "step": 4966 }, { "epoch": 0.6318534537590638, "ewc_loss": 0.040857069194316864, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001583265548106283, "grad_norm": 5.042174816131592, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8638675212860107, "num_tokens": 189547801.0, "step": 4967 }, { "epoch": 0.6319806640376543, "ewc_loss": 0.040875405073165894, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015850990894250572, "grad_norm": 5.033551216125488, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8712352514266968, "num_tokens": 189582226.0, "step": 4968 }, { "epoch": 0.6321078743162447, "ewc_loss": 0.040837645530700684, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015813233039807528, "grad_norm": 5.004487037658691, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.857313871383667, "num_tokens": 189622915.0, "step": 4969 }, { "epoch": 0.6322350845948352, "ewc_loss": 0.04087013751268387, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015845723100937903, "grad_norm": 5.0448222160339355, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8654428124427795, "num_tokens": 189658182.0, "step": 4970 }, { "epoch": 0.6323622948734258, "ewc_loss": 0.040841229259967804, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015816817176528275, "grad_norm": 5.028125286102295, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8615628480911255, "num_tokens": 189696627.0, "step": 4971 }, { "epoch": 0.6324895051520163, "ewc_loss": 0.040869347751140594, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015844935842324048, "grad_norm": 5.115268230438232, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.856153130531311, "num_tokens": 189729826.0, "step": 4972 }, { "epoch": 0.6326167154306068, "ewc_loss": 0.040880970656871796, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.000158565555466339, "grad_norm": 5.020023345947266, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8632771968841553, "num_tokens": 189771504.0, "step": 4973 }, { "epoch": 0.6327439257091974, "ewc_loss": 0.04082581400871277, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015801397967152297, "grad_norm": 5.01967191696167, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8766719698905945, "num_tokens": 189809220.0, "step": 4974 }, { "epoch": 0.6328711359877878, "ewc_loss": 0.040789052844047546, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001588671002537012, "grad_norm": 5.0706787109375, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8668510317802429, "num_tokens": 189849586.0, "step": 4975 }, { "epoch": 0.6329983462663783, "ewc_loss": 0.04071616381406784, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015813822392374277, "grad_norm": 5.077219486236572, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8597361445426941, "num_tokens": 189885054.0, "step": 4976 }, { "epoch": 0.6331255565449688, "ewc_loss": 0.04088999330997467, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015865580644458532, "grad_norm": 5.094181060791016, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8514712452888489, "num_tokens": 189918286.0, "step": 4977 }, { "epoch": 0.6332527668235594, "ewc_loss": 0.04084664583206177, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015822230488993227, "grad_norm": 5.031595230102539, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8403074741363525, "num_tokens": 189956845.0, "step": 4978 }, { "epoch": 0.6333799771021499, "ewc_loss": 0.04087916761636734, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015854751109145582, "grad_norm": 5.049893379211426, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8760775923728943, "num_tokens": 189994827.0, "step": 4979 }, { "epoch": 0.6335071873807404, "ewc_loss": 0.04074600711464882, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001584366400493309, "grad_norm": 5.0640692710876465, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8533113598823547, "num_tokens": 190036066.0, "step": 4980 }, { "epoch": 0.6336343976593308, "ewc_loss": 0.04087784141302109, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015853428340051323, "grad_norm": 4.993747234344482, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.871275782585144, "num_tokens": 190073326.0, "step": 4981 }, { "epoch": 0.6337616079379214, "ewc_loss": 0.040844909846782684, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015820494445506483, "grad_norm": 5.041351318359375, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8569754362106323, "num_tokens": 190109125.0, "step": 4982 }, { "epoch": 0.6338888182165119, "ewc_loss": 0.04087259620428085, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001584818382980302, "grad_norm": 5.002367973327637, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8642369508743286, "num_tokens": 190148520.0, "step": 4983 }, { "epoch": 0.6340160284951024, "ewc_loss": 0.040861763060092926, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.000158373499289155, "grad_norm": 5.087711334228516, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.862331211566925, "num_tokens": 190184272.0, "step": 4984 }, { "epoch": 0.6341432387736929, "ewc_loss": 0.04091733694076538, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001589292223798111, "grad_norm": 5.047237396240234, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8582201600074768, "num_tokens": 190221483.0, "step": 4985 }, { "epoch": 0.6342704490522835, "ewc_loss": 0.04087717831134796, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015852763317525387, "grad_norm": 4.995963096618652, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8705047965049744, "num_tokens": 190266430.0, "step": 4986 }, { "epoch": 0.6343976593308739, "ewc_loss": 0.04088176041841507, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.000158573457156308, "grad_norm": 4.974915981292725, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8710998296737671, "num_tokens": 190308575.0, "step": 4987 }, { "epoch": 0.6345248696094644, "ewc_loss": 0.04075438529253006, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015852040087338537, "grad_norm": 5.100692272186279, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.868921160697937, "num_tokens": 190344584.0, "step": 4988 }, { "epoch": 0.6346520798880549, "ewc_loss": 0.04092523083090782, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.000159008166519925, "grad_norm": 5.108946800231934, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.867527961730957, "num_tokens": 190376497.0, "step": 4989 }, { "epoch": 0.6347792901666455, "ewc_loss": 0.04086412861943245, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001583971461514011, "grad_norm": 5.089816570281982, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.849026083946228, "num_tokens": 190416879.0, "step": 4990 }, { "epoch": 0.634906500445236, "ewc_loss": 0.04085925221443176, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015834838268347085, "grad_norm": 5.007933616638184, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8798441886901855, "num_tokens": 190454684.0, "step": 4991 }, { "epoch": 0.6350337107238265, "ewc_loss": 0.040816839784383774, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015792425256222486, "grad_norm": 5.215223789215088, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8516196608543396, "num_tokens": 190490524.0, "step": 4992 }, { "epoch": 0.635160921002417, "ewc_loss": 0.04088748246431351, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001586306607350707, "grad_norm": 5.054027080535889, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8664630651473999, "num_tokens": 190526558.0, "step": 4993 }, { "epoch": 0.6352881312810075, "ewc_loss": 0.04061109572649002, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.000157087531988509, "grad_norm": 5.026895046234131, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8594510555267334, "num_tokens": 190571206.0, "step": 4994 }, { "epoch": 0.635415341559598, "ewc_loss": 0.040655482560396194, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015753139450680465, "grad_norm": 5.048191070556641, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8484327793121338, "num_tokens": 190611480.0, "step": 4995 }, { "epoch": 0.6355425518381885, "ewc_loss": 0.040632106363773346, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.0001572976470924914, "grad_norm": 5.151678085327148, "learning_rate": 1e-06, "loss": 0.5367, "mean_token_accuracy": 0.8323182463645935, "num_tokens": 190647699.0, "step": 4996 }, { "epoch": 0.6356697621167791, "ewc_loss": 0.04078662395477295, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015762209659442306, "grad_norm": 5.056311130523682, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8608165979385376, "num_tokens": 190684863.0, "step": 4997 }, { "epoch": 0.6357969723953696, "ewc_loss": 0.040730610489845276, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.000157061978825368, "grad_norm": 5.088963508605957, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8445010185241699, "num_tokens": 190725006.0, "step": 4998 }, { "epoch": 0.63592418267396, "ewc_loss": 0.040768131613731384, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015743719995953143, "grad_norm": 5.090501308441162, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8559811115264893, "num_tokens": 190757431.0, "step": 4999 }, { "epoch": 0.6360513929525505, "ewc_loss": 0.040770046412944794, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015745630662422627, "grad_norm": 5.099896430969238, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8640042543411255, "num_tokens": 190793118.0, "step": 5000 }, { "epoch": 0.6361786032311411, "ewc_loss": 0.040785618126392365, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015761202666908503, "grad_norm": 5.004003047943115, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8686037063598633, "num_tokens": 190837105.0, "step": 5001 }, { "epoch": 0.6363058135097316, "ewc_loss": 0.04075019061565399, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001572577893966809, "grad_norm": 5.111062049865723, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8538771271705627, "num_tokens": 190875372.0, "step": 5002 }, { "epoch": 0.6364330237883221, "ewc_loss": 0.0407976359128952, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015773222548887134, "grad_norm": 5.054809093475342, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8645337820053101, "num_tokens": 190921307.0, "step": 5003 }, { "epoch": 0.6365602340669126, "ewc_loss": 0.04077145829796791, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015747043653391302, "grad_norm": 5.045389175415039, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8601340055465698, "num_tokens": 190967919.0, "step": 5004 }, { "epoch": 0.6366874443455031, "ewc_loss": 0.04078961908817291, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001576520735397935, "grad_norm": 5.030487060546875, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8580653071403503, "num_tokens": 191005668.0, "step": 5005 }, { "epoch": 0.6368146546240936, "ewc_loss": 0.040793634951114655, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015769222227390856, "grad_norm": 5.082873344421387, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8489985466003418, "num_tokens": 191044455.0, "step": 5006 }, { "epoch": 0.6369418649026841, "ewc_loss": 0.04081861302256584, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015794199134688824, "grad_norm": 5.0307936668396, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8644720315933228, "num_tokens": 191086369.0, "step": 5007 }, { "epoch": 0.6370690751812746, "ewc_loss": 0.04083362966775894, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.000158092167112045, "grad_norm": 5.1099162101745605, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8547540903091431, "num_tokens": 191127070.0, "step": 5008 }, { "epoch": 0.6371962854598652, "ewc_loss": 0.04080752283334732, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001578311057528481, "grad_norm": 5.0087761878967285, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.873672604560852, "num_tokens": 191163623.0, "step": 5009 }, { "epoch": 0.6373234957384557, "ewc_loss": 0.04068369418382645, "ewc_loss_diag": 2.491474151611328e-05, "ewc_loss_parallel": 0.00015781348338350654, "grad_norm": 5.152693748474121, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8544831275939941, "num_tokens": 191196790.0, "step": 5010 }, { "epoch": 0.6374507060170462, "ewc_loss": 0.0408850759267807, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015860662097111344, "grad_norm": 5.067206859588623, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8567924499511719, "num_tokens": 191230265.0, "step": 5011 }, { "epoch": 0.6375779162956366, "ewc_loss": 0.040806517004966736, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001578210503794253, "grad_norm": 5.089456081390381, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.851432204246521, "num_tokens": 191266781.0, "step": 5012 }, { "epoch": 0.6377051265742272, "ewc_loss": 0.040885839611291885, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015861426072660834, "grad_norm": 5.072627067565918, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8494354486465454, "num_tokens": 191305869.0, "step": 5013 }, { "epoch": 0.6378323368528177, "ewc_loss": 0.04083726555109024, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015812853234820068, "grad_norm": 5.100243091583252, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8479672074317932, "num_tokens": 191343498.0, "step": 5014 }, { "epoch": 0.6379595471314082, "ewc_loss": 0.04089280217885971, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001586839061928913, "grad_norm": 5.115397930145264, "learning_rate": 1e-06, "loss": 0.5109, "mean_token_accuracy": 0.8439720869064331, "num_tokens": 191383260.0, "step": 5015 }, { "epoch": 0.6380867574099988, "ewc_loss": 0.04086675867438316, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015842344146221876, "grad_norm": 5.095848560333252, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8507445454597473, "num_tokens": 191417350.0, "step": 5016 }, { "epoch": 0.6382139676885893, "ewc_loss": 0.040913455188274384, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015889042697381228, "grad_norm": 5.0869526863098145, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8605638742446899, "num_tokens": 191454336.0, "step": 5017 }, { "epoch": 0.6383411779671797, "ewc_loss": 0.04088563472032547, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015861220890656114, "grad_norm": 5.070998191833496, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8610405921936035, "num_tokens": 191489977.0, "step": 5018 }, { "epoch": 0.6384683882457702, "ewc_loss": 0.040896207094192505, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015871791401878, "grad_norm": 5.167143821716309, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8492934703826904, "num_tokens": 191524622.0, "step": 5019 }, { "epoch": 0.6385955985243608, "ewc_loss": 0.040955524891614914, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015931110829114914, "grad_norm": 5.2322492599487305, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8479688167572021, "num_tokens": 191565389.0, "step": 5020 }, { "epoch": 0.6387228088029513, "ewc_loss": 0.040935859084129333, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.000159114453708753, "grad_norm": 5.040238857269287, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8785711526870728, "num_tokens": 191604640.0, "step": 5021 }, { "epoch": 0.6388500190815418, "ewc_loss": 0.040830399841070175, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015805986186023802, "grad_norm": 5.086489677429199, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8573664426803589, "num_tokens": 191651029.0, "step": 5022 }, { "epoch": 0.6389772293601323, "ewc_loss": 0.04095379263162613, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001592938060639426, "grad_norm": 5.121913433074951, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8603998422622681, "num_tokens": 191690524.0, "step": 5023 }, { "epoch": 0.6391044396387228, "ewc_loss": 0.04091101512312889, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015886600886005908, "grad_norm": 5.079630374908447, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.854391872882843, "num_tokens": 191733992.0, "step": 5024 }, { "epoch": 0.6392316499173133, "ewc_loss": 0.04091315343976021, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015888738562352955, "grad_norm": 5.1022844314575195, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8697522878646851, "num_tokens": 191769717.0, "step": 5025 }, { "epoch": 0.6393588601959038, "ewc_loss": 0.040930211544036865, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015905796317383647, "grad_norm": 5.046996593475342, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8452458381652832, "num_tokens": 191813334.0, "step": 5026 }, { "epoch": 0.6394860704744944, "ewc_loss": 0.0409366711974144, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015912257367745042, "grad_norm": 5.134891510009766, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8612644672393799, "num_tokens": 191848295.0, "step": 5027 }, { "epoch": 0.6396132807530849, "ewc_loss": 0.040941715240478516, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001591729960637167, "grad_norm": 5.085495471954346, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8526406288146973, "num_tokens": 191885329.0, "step": 5028 }, { "epoch": 0.6397404910316754, "ewc_loss": 0.0409199595451355, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015895544493105263, "grad_norm": 5.151025772094727, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8711862564086914, "num_tokens": 191918070.0, "step": 5029 }, { "epoch": 0.6398677013102658, "ewc_loss": 0.04096384346485138, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015939427248667926, "grad_norm": 5.045693874359131, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8630200028419495, "num_tokens": 191966084.0, "step": 5030 }, { "epoch": 0.6399949115888564, "ewc_loss": 0.04089278355240822, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015868368791416287, "grad_norm": 5.1367506980896, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8587266206741333, "num_tokens": 192004907.0, "step": 5031 }, { "epoch": 0.6401221218674469, "ewc_loss": 0.04096902161836624, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015944606275297701, "grad_norm": 5.121153354644775, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8403229713439941, "num_tokens": 192042942.0, "step": 5032 }, { "epoch": 0.6402493321460374, "ewc_loss": 0.04090835154056549, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015883936430327594, "grad_norm": 5.112796306610107, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8599741458892822, "num_tokens": 192081133.0, "step": 5033 }, { "epoch": 0.6403765424246279, "ewc_loss": 0.04090796783566475, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015883553714957088, "grad_norm": 5.051002502441406, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8591607809066772, "num_tokens": 192120171.0, "step": 5034 }, { "epoch": 0.6405037527032185, "ewc_loss": 0.040884241461753845, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.0001585982827236876, "grad_norm": 5.07004976272583, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8596872091293335, "num_tokens": 192159583.0, "step": 5035 }, { "epoch": 0.6406309629818089, "ewc_loss": 0.04095061123371124, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015926195192150772, "grad_norm": 5.140566349029541, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.856855571269989, "num_tokens": 192199513.0, "step": 5036 }, { "epoch": 0.6407581732603994, "ewc_loss": 0.04098184406757355, "ewc_loss_diag": 2.5033950805664062e-05, "ewc_loss_parallel": 0.00015957429422996938, "grad_norm": 5.140588283538818, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8616786599159241, "num_tokens": 192228352.0, "step": 5037 }, { "epoch": 0.6408853835389899, "ewc_loss": 0.041050322353839874, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.00015903839084785432, "grad_norm": 5.043513298034668, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8548084497451782, "num_tokens": 192264480.0, "step": 5038 }, { "epoch": 0.6410125938175805, "ewc_loss": 0.041072387248277664, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.00015925902698654681, "grad_norm": 5.080785751342773, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8573973178863525, "num_tokens": 192303307.0, "step": 5039 }, { "epoch": 0.641139804096171, "ewc_loss": 0.04111924767494202, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.00015972765686456114, "grad_norm": 5.059114933013916, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8613649010658264, "num_tokens": 192342064.0, "step": 5040 }, { "epoch": 0.6412670143747615, "ewc_loss": 0.04111403599381447, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.0001596755173522979, "grad_norm": 5.084290504455566, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8658382892608643, "num_tokens": 192379297.0, "step": 5041 }, { "epoch": 0.6413942246533519, "ewc_loss": 0.04115603119134903, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.0001600954565219581, "grad_norm": 5.072197914123535, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8621852993965149, "num_tokens": 192416087.0, "step": 5042 }, { "epoch": 0.6415214349319425, "ewc_loss": 0.041131146252155304, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.0001598466042196378, "grad_norm": 5.0650787353515625, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8637666702270508, "num_tokens": 192454421.0, "step": 5043 }, { "epoch": 0.641648645210533, "ewc_loss": 0.04112191125750542, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.00015975427231751382, "grad_norm": 5.066570281982422, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8554965257644653, "num_tokens": 192493860.0, "step": 5044 }, { "epoch": 0.6417758554891235, "ewc_loss": 0.04115970432758331, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.00016013221465982497, "grad_norm": 5.094316005706787, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8504446744918823, "num_tokens": 192533672.0, "step": 5045 }, { "epoch": 0.641903065767714, "ewc_loss": 0.04112954065203667, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.00015983055345714092, "grad_norm": 5.1217732429504395, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.84766685962677, "num_tokens": 192569569.0, "step": 5046 }, { "epoch": 0.6420302760463046, "ewc_loss": 0.041136641055345535, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.00015990156680345535, "grad_norm": 5.0926384925842285, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8658237457275391, "num_tokens": 192610716.0, "step": 5047 }, { "epoch": 0.642157486324895, "ewc_loss": 0.04111510515213013, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.0001596861839061603, "grad_norm": 5.090059757232666, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8527249097824097, "num_tokens": 192648040.0, "step": 5048 }, { "epoch": 0.6422846966034855, "ewc_loss": 0.041100047528743744, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.00015953561523929238, "grad_norm": 5.06993293762207, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8576730489730835, "num_tokens": 192690063.0, "step": 5049 }, { "epoch": 0.6424119068820761, "ewc_loss": 0.04125691205263138, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00015988356608431786, "grad_norm": 5.065613746643066, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.854751467704773, "num_tokens": 192732067.0, "step": 5050 }, { "epoch": 0.6425391171606666, "ewc_loss": 0.04123768210411072, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00015969127707649022, "grad_norm": 5.081538200378418, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8682523965835571, "num_tokens": 192768598.0, "step": 5051 }, { "epoch": 0.6426663274392571, "ewc_loss": 0.04129520803689957, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016026654338929802, "grad_norm": 5.120822906494141, "learning_rate": 1e-06, "loss": 0.5519, "mean_token_accuracy": 0.830630898475647, "num_tokens": 192805420.0, "step": 5052 }, { "epoch": 0.6427935377178476, "ewc_loss": 0.04125667363405228, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00015988116501830518, "grad_norm": 5.115203380584717, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8545907139778137, "num_tokens": 192852082.0, "step": 5053 }, { "epoch": 0.6429207479964381, "ewc_loss": 0.04127331078052521, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016004755161702633, "grad_norm": 5.047120571136475, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.860363245010376, "num_tokens": 192892846.0, "step": 5054 }, { "epoch": 0.6430479582750286, "ewc_loss": 0.041217394173145294, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00015948839427437633, "grad_norm": 5.066805362701416, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8767650723457336, "num_tokens": 192932091.0, "step": 5055 }, { "epoch": 0.6431751685536191, "ewc_loss": 0.04127207770943642, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001600352261448279, "grad_norm": 5.1586503982543945, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8604692220687866, "num_tokens": 192960628.0, "step": 5056 }, { "epoch": 0.6433023788322096, "ewc_loss": 0.04129490256309509, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016026347293518484, "grad_norm": 5.046368598937988, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8527706861495972, "num_tokens": 193002075.0, "step": 5057 }, { "epoch": 0.6434295891108002, "ewc_loss": 0.041204072535037994, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00015935517149046063, "grad_norm": 5.071669101715088, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8480806350708008, "num_tokens": 193039720.0, "step": 5058 }, { "epoch": 0.6435567993893907, "ewc_loss": 0.04132508859038353, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016056533786468208, "grad_norm": 5.127985000610352, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8592321276664734, "num_tokens": 193072806.0, "step": 5059 }, { "epoch": 0.6436840096679812, "ewc_loss": 0.04132700338959694, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001605844881851226, "grad_norm": 5.135175704956055, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.868403434753418, "num_tokens": 193108478.0, "step": 5060 }, { "epoch": 0.6438112199465716, "ewc_loss": 0.04130277782678604, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016034222790040076, "grad_norm": 5.025623321533203, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8626478910446167, "num_tokens": 193150663.0, "step": 5061 }, { "epoch": 0.6439384302251622, "ewc_loss": 0.041146375238895416, "ewc_loss_diag": 2.5153160095214844e-05, "ewc_loss_parallel": 0.00015999891911633313, "grad_norm": 5.089479446411133, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8625130653381348, "num_tokens": 193191499.0, "step": 5062 }, { "epoch": 0.6440656405037527, "ewc_loss": 0.04132923483848572, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016060678171925247, "grad_norm": 5.0887956619262695, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8496862649917603, "num_tokens": 193236103.0, "step": 5063 }, { "epoch": 0.6441928507823432, "ewc_loss": 0.04125479608774185, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00015986243670340627, "grad_norm": 5.113073825836182, "learning_rate": 1e-06, "loss": 0.529, "mean_token_accuracy": 0.8367109298706055, "num_tokens": 193274095.0, "step": 5064 }, { "epoch": 0.6443200610609338, "ewc_loss": 0.04129882901906967, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001603027485543862, "grad_norm": 5.095137119293213, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8640829920768738, "num_tokens": 193316988.0, "step": 5065 }, { "epoch": 0.6444472713395243, "ewc_loss": 0.04124113917350769, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00015972582332324237, "grad_norm": 5.0731611251831055, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8569397926330566, "num_tokens": 193357430.0, "step": 5066 }, { "epoch": 0.6445744816181147, "ewc_loss": 0.04128824174404144, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016019688337109983, "grad_norm": 5.10897970199585, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.865734338760376, "num_tokens": 193390392.0, "step": 5067 }, { "epoch": 0.6447016918967052, "ewc_loss": 0.041266001760959625, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00015997445734683424, "grad_norm": 5.06281852722168, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8562153577804565, "num_tokens": 193430961.0, "step": 5068 }, { "epoch": 0.6448289021752958, "ewc_loss": 0.04127958416938782, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016011031402740628, "grad_norm": 5.088397026062012, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8568757176399231, "num_tokens": 193473767.0, "step": 5069 }, { "epoch": 0.6449561124538863, "ewc_loss": 0.04128272831439972, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016014173161238432, "grad_norm": 5.103453159332275, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8478546142578125, "num_tokens": 193511375.0, "step": 5070 }, { "epoch": 0.6450833227324768, "ewc_loss": 0.04127812385559082, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.000160095703904517, "grad_norm": 5.07419490814209, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8690837621688843, "num_tokens": 193551296.0, "step": 5071 }, { "epoch": 0.6452105330110673, "ewc_loss": 0.04132292419672012, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016054371371865273, "grad_norm": 5.035272121429443, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8623261451721191, "num_tokens": 193596484.0, "step": 5072 }, { "epoch": 0.6453377432896578, "ewc_loss": 0.04128854349255562, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001601998956175521, "grad_norm": 5.118640422821045, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8649085760116577, "num_tokens": 193637513.0, "step": 5073 }, { "epoch": 0.6454649535682483, "ewc_loss": 0.041326381266117096, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016057828906923532, "grad_norm": 5.1725172996521, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8538511991500854, "num_tokens": 193672808.0, "step": 5074 }, { "epoch": 0.6455921638468388, "ewc_loss": 0.0413481704890728, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016079616034403443, "grad_norm": 5.072200298309326, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8744657039642334, "num_tokens": 193713018.0, "step": 5075 }, { "epoch": 0.6457193741254293, "ewc_loss": 0.04126306623220444, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00015994513523764908, "grad_norm": 5.0461745262146, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.860364556312561, "num_tokens": 193757649.0, "step": 5076 }, { "epoch": 0.6458465844040199, "ewc_loss": 0.041339654475450516, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016071100253611803, "grad_norm": 5.1078691482543945, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8560305833816528, "num_tokens": 193792390.0, "step": 5077 }, { "epoch": 0.6459737946826104, "ewc_loss": 0.04134872555732727, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016080170462373644, "grad_norm": 5.08501672744751, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8665685057640076, "num_tokens": 193827721.0, "step": 5078 }, { "epoch": 0.6461010049612008, "ewc_loss": 0.041323162615299225, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001605460565770045, "grad_norm": 5.118653297424316, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8486244678497314, "num_tokens": 193867217.0, "step": 5079 }, { "epoch": 0.6462282152397913, "ewc_loss": 0.041344933211803436, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001607637677807361, "grad_norm": 5.126906394958496, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.86607825756073, "num_tokens": 193900128.0, "step": 5080 }, { "epoch": 0.6463554255183819, "ewc_loss": 0.041346244513988495, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001607769081601873, "grad_norm": 5.127871036529541, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8700728416442871, "num_tokens": 193935744.0, "step": 5081 }, { "epoch": 0.6464826357969724, "ewc_loss": 0.04134654626250267, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016077992040663958, "grad_norm": 5.057293891906738, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8564062118530273, "num_tokens": 193977344.0, "step": 5082 }, { "epoch": 0.6466098460755629, "ewc_loss": 0.041287291795015335, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016018736641854048, "grad_norm": 5.103562831878662, "learning_rate": 1e-06, "loss": 0.544, "mean_token_accuracy": 0.8315427899360657, "num_tokens": 194014295.0, "step": 5083 }, { "epoch": 0.6467370563541535, "ewc_loss": 0.04133500158786774, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001606644655112177, "grad_norm": 5.0975446701049805, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8436927199363708, "num_tokens": 194055245.0, "step": 5084 }, { "epoch": 0.6468642666327439, "ewc_loss": 0.04130638390779495, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001603783166501671, "grad_norm": 5.121438026428223, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8522223234176636, "num_tokens": 194089313.0, "step": 5085 }, { "epoch": 0.6469914769113344, "ewc_loss": 0.04133982211351395, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016071264690253884, "grad_norm": 5.072036266326904, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8657121062278748, "num_tokens": 194128293.0, "step": 5086 }, { "epoch": 0.6471186871899249, "ewc_loss": 0.041297584772109985, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016029029211495072, "grad_norm": 5.095672130584717, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8517818450927734, "num_tokens": 194166632.0, "step": 5087 }, { "epoch": 0.6472458974685155, "ewc_loss": 0.04134584963321686, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016077296459116042, "grad_norm": 5.089383125305176, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8449733257293701, "num_tokens": 194209176.0, "step": 5088 }, { "epoch": 0.647373107747106, "ewc_loss": 0.041319336742162704, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016050782869569957, "grad_norm": 5.122068405151367, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.847769021987915, "num_tokens": 194245617.0, "step": 5089 }, { "epoch": 0.6475003180256965, "ewc_loss": 0.04135549068450928, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016086937102954835, "grad_norm": 5.058590888977051, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8649430274963379, "num_tokens": 194285983.0, "step": 5090 }, { "epoch": 0.6476275283042869, "ewc_loss": 0.04132257029414177, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.000160540163051337, "grad_norm": 5.184721946716309, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8552051782608032, "num_tokens": 194317088.0, "step": 5091 }, { "epoch": 0.6477547385828775, "ewc_loss": 0.0413791798055172, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001611062471056357, "grad_norm": 5.073258876800537, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8634305596351624, "num_tokens": 194354025.0, "step": 5092 }, { "epoch": 0.647881948861468, "ewc_loss": 0.04145292937755585, "ewc_loss_diag": 2.5391578674316406e-05, "ewc_loss_parallel": 0.00016062303620856255, "grad_norm": 5.159488677978516, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8629640936851501, "num_tokens": 194388539.0, "step": 5093 }, { "epoch": 0.6480091591400585, "ewc_loss": 0.04136604815721512, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016097494517453015, "grad_norm": 5.110184669494629, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8600399494171143, "num_tokens": 194424518.0, "step": 5094 }, { "epoch": 0.648136369418649, "ewc_loss": 0.04140762239694595, "ewc_loss_diag": 2.5391578674316406e-05, "ewc_loss_parallel": 0.0001601699914317578, "grad_norm": 5.087224960327148, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8554812669754028, "num_tokens": 194463310.0, "step": 5095 }, { "epoch": 0.6482635796972396, "ewc_loss": 0.04146290570497513, "ewc_loss_diag": 2.5391578674316406e-05, "ewc_loss_parallel": 0.00016072280413936824, "grad_norm": 5.099067211151123, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8582101464271545, "num_tokens": 194499911.0, "step": 5096 }, { "epoch": 0.64839078997583, "ewc_loss": 0.041417136788368225, "ewc_loss_diag": 2.5391578674316406e-05, "ewc_loss_parallel": 0.00016026511730160564, "grad_norm": 5.155972957611084, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8484852313995361, "num_tokens": 194536731.0, "step": 5097 }, { "epoch": 0.6485180002544205, "ewc_loss": 0.041341930627822876, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001607337617315352, "grad_norm": 5.067719459533691, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8482968211174011, "num_tokens": 194574014.0, "step": 5098 }, { "epoch": 0.648645210533011, "ewc_loss": 0.04130569472908974, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.00016037140449043363, "grad_norm": 5.110109806060791, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8503832221031189, "num_tokens": 194611944.0, "step": 5099 }, { "epoch": 0.6487724208116016, "ewc_loss": 0.04140174016356468, "ewc_loss_diag": 2.5272369384765625e-05, "ewc_loss_parallel": 0.0001613318599993363, "grad_norm": 5.081751346588135, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8682735562324524, "num_tokens": 194649734.0, "step": 5100 }, { "epoch": 0.6488996310901921, "ewc_loss": 0.04158210754394531, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016069412231445312, "grad_norm": 5.07961893081665, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8667389750480652, "num_tokens": 194688344.0, "step": 5101 }, { "epoch": 0.6490268413687826, "ewc_loss": 0.04161965101957321, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016106957627926022, "grad_norm": 5.077471733093262, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.845561146736145, "num_tokens": 194728582.0, "step": 5102 }, { "epoch": 0.649154051647373, "ewc_loss": 0.04162261262536049, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016109917487483472, "grad_norm": 5.122758865356445, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8586441874504089, "num_tokens": 194762753.0, "step": 5103 }, { "epoch": 0.6492812619259636, "ewc_loss": 0.04164658486843109, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016133887402247638, "grad_norm": 5.066417217254639, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8706381320953369, "num_tokens": 194802325.0, "step": 5104 }, { "epoch": 0.6494084722045541, "ewc_loss": 0.04162362217903137, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016110924480017275, "grad_norm": 5.081790447235107, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8577471971511841, "num_tokens": 194846349.0, "step": 5105 }, { "epoch": 0.6495356824831446, "ewc_loss": 0.04162375256419182, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016111056902445853, "grad_norm": 5.070237159729004, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8590985536575317, "num_tokens": 194880334.0, "step": 5106 }, { "epoch": 0.6496628927617352, "ewc_loss": 0.04163060337305069, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001611790939932689, "grad_norm": 5.17337703704834, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8494719862937927, "num_tokens": 194919343.0, "step": 5107 }, { "epoch": 0.6497901030403257, "ewc_loss": 0.04164847731590271, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016135780606418848, "grad_norm": 5.088918209075928, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8603395223617554, "num_tokens": 194958800.0, "step": 5108 }, { "epoch": 0.6499173133189162, "ewc_loss": 0.04162374883890152, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016111052536871284, "grad_norm": 5.186924457550049, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8325492739677429, "num_tokens": 194997249.0, "step": 5109 }, { "epoch": 0.6500445235975066, "ewc_loss": 0.04162914305925369, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001611644693184644, "grad_norm": 5.115274429321289, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8605585098266602, "num_tokens": 195035879.0, "step": 5110 }, { "epoch": 0.6501717338760972, "ewc_loss": 0.041572436690330505, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001605974102858454, "grad_norm": 5.049352169036865, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.853848934173584, "num_tokens": 195075925.0, "step": 5111 }, { "epoch": 0.6502989441546877, "ewc_loss": 0.041617050766944885, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016104352835100144, "grad_norm": 5.147050380706787, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8601999282836914, "num_tokens": 195110185.0, "step": 5112 }, { "epoch": 0.6504261544332782, "ewc_loss": 0.04165458679199219, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016141889500431716, "grad_norm": 5.058428764343262, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8495171070098877, "num_tokens": 195154666.0, "step": 5113 }, { "epoch": 0.6505533647118688, "ewc_loss": 0.0415942519903183, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016081555804703385, "grad_norm": 5.098371505737305, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8685715198516846, "num_tokens": 195188998.0, "step": 5114 }, { "epoch": 0.6506805749904593, "ewc_loss": 0.04161142185330391, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016098727064672858, "grad_norm": 5.079528331756592, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8660280704498291, "num_tokens": 195224124.0, "step": 5115 }, { "epoch": 0.6508077852690497, "ewc_loss": 0.041612207889556885, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016099514323286712, "grad_norm": 5.182432174682617, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.858838677406311, "num_tokens": 195253601.0, "step": 5116 }, { "epoch": 0.6509349955476402, "ewc_loss": 0.04163973778486252, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016127042181324214, "grad_norm": 5.066105365753174, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8511722683906555, "num_tokens": 195289045.0, "step": 5117 }, { "epoch": 0.6510622058262308, "ewc_loss": 0.04160482436418533, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001609213068149984, "grad_norm": 5.121154308319092, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8606870770454407, "num_tokens": 195326983.0, "step": 5118 }, { "epoch": 0.6511894161048213, "ewc_loss": 0.0416686125099659, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016155917546711862, "grad_norm": 5.140539646148682, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8700487613677979, "num_tokens": 195360425.0, "step": 5119 }, { "epoch": 0.6513166263834118, "ewc_loss": 0.041613031178712845, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016100336506497115, "grad_norm": 5.059549808502197, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8470807671546936, "num_tokens": 195402648.0, "step": 5120 }, { "epoch": 0.6514438366620023, "ewc_loss": 0.0416160449385643, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016103348752949387, "grad_norm": 5.1132307052612305, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8756155967712402, "num_tokens": 195440945.0, "step": 5121 }, { "epoch": 0.6515710469405928, "ewc_loss": 0.041696950793266296, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016184253036044538, "grad_norm": 5.104024410247803, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.861480176448822, "num_tokens": 195481872.0, "step": 5122 }, { "epoch": 0.6516982572191833, "ewc_loss": 0.041647493839263916, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016134798352140933, "grad_norm": 5.0929274559021, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8638122081756592, "num_tokens": 195525460.0, "step": 5123 }, { "epoch": 0.6518254674977738, "ewc_loss": 0.041604116559028625, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001609142345841974, "grad_norm": 5.094647407531738, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8612662553787231, "num_tokens": 195563522.0, "step": 5124 }, { "epoch": 0.6519526777763643, "ewc_loss": 0.04161448031663895, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016101784422062337, "grad_norm": 5.162264347076416, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8685206770896912, "num_tokens": 195598114.0, "step": 5125 }, { "epoch": 0.6520798880549549, "ewc_loss": 0.041588328778743744, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016075634630396962, "grad_norm": 5.069256782531738, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8537737131118774, "num_tokens": 195639826.0, "step": 5126 }, { "epoch": 0.6522070983335454, "ewc_loss": 0.04157274216413498, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016060046618804336, "grad_norm": 5.068727493286133, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8659922480583191, "num_tokens": 195682047.0, "step": 5127 }, { "epoch": 0.6523343086121358, "ewc_loss": 0.041581522673368454, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016068827244453132, "grad_norm": 5.09367561340332, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8633420467376709, "num_tokens": 195722453.0, "step": 5128 }, { "epoch": 0.6524615188907263, "ewc_loss": 0.04160483926534653, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016092145233415067, "grad_norm": 5.076018810272217, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8549185991287231, "num_tokens": 195759182.0, "step": 5129 }, { "epoch": 0.6525887291693169, "ewc_loss": 0.0415894091129303, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001607671147212386, "grad_norm": 5.118975639343262, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8629249334335327, "num_tokens": 195795641.0, "step": 5130 }, { "epoch": 0.6527159394479074, "ewc_loss": 0.041613467037677765, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016100773063953966, "grad_norm": 5.090825080871582, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8689650893211365, "num_tokens": 195831200.0, "step": 5131 }, { "epoch": 0.6528431497264979, "ewc_loss": 0.04158549755811691, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016072801372502, "grad_norm": 5.043483734130859, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8610113859176636, "num_tokens": 195875110.0, "step": 5132 }, { "epoch": 0.6529703600050885, "ewc_loss": 0.04163387045264244, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016121174849104136, "grad_norm": 5.0903239250183105, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8618173599243164, "num_tokens": 195916724.0, "step": 5133 }, { "epoch": 0.6530975702836789, "ewc_loss": 0.04162199795246124, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016109301941469312, "grad_norm": 5.110380172729492, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8530742526054382, "num_tokens": 195956219.0, "step": 5134 }, { "epoch": 0.6532247805622694, "ewc_loss": 0.041597478091716766, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001608478050911799, "grad_norm": 5.08195686340332, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8626164793968201, "num_tokens": 195991563.0, "step": 5135 }, { "epoch": 0.6533519908408599, "ewc_loss": 0.04165298491716385, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016140287334565073, "grad_norm": 5.183457851409912, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8654323816299438, "num_tokens": 196031864.0, "step": 5136 }, { "epoch": 0.6534792011194505, "ewc_loss": 0.041659776121377945, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016147080168593675, "grad_norm": 5.142817497253418, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8511098623275757, "num_tokens": 196067208.0, "step": 5137 }, { "epoch": 0.653606411398041, "ewc_loss": 0.041634123772382736, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001612142805242911, "grad_norm": 5.094949722290039, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8628232479095459, "num_tokens": 196103934.0, "step": 5138 }, { "epoch": 0.6537336216766315, "ewc_loss": 0.04159993678331375, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016087241237983108, "grad_norm": 5.144554138183594, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8544951677322388, "num_tokens": 196140886.0, "step": 5139 }, { "epoch": 0.6538608319552219, "ewc_loss": 0.04159889370203018, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016086197865661234, "grad_norm": 5.115391254425049, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8675280809402466, "num_tokens": 196178940.0, "step": 5140 }, { "epoch": 0.6539880422338125, "ewc_loss": 0.04161059856414795, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016097903426270932, "grad_norm": 5.059643268585205, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8615009784698486, "num_tokens": 196229461.0, "step": 5141 }, { "epoch": 0.654115252512403, "ewc_loss": 0.04159587249159813, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016083176888059825, "grad_norm": 5.080368995666504, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8718281388282776, "num_tokens": 196265793.0, "step": 5142 }, { "epoch": 0.6542424627909935, "ewc_loss": 0.041646040976047516, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016133346071001142, "grad_norm": 5.120649337768555, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.865708589553833, "num_tokens": 196304919.0, "step": 5143 }, { "epoch": 0.654369673069584, "ewc_loss": 0.04164785146713257, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001613515632925555, "grad_norm": 5.036098480224609, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8740197420120239, "num_tokens": 196344761.0, "step": 5144 }, { "epoch": 0.6544968833481746, "ewc_loss": 0.04165215790271759, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001613946515135467, "grad_norm": 5.113935470581055, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.874028742313385, "num_tokens": 196382625.0, "step": 5145 }, { "epoch": 0.654624093626765, "ewc_loss": 0.041692037135362625, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016179341764654964, "grad_norm": 5.072131633758545, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.870559573173523, "num_tokens": 196422279.0, "step": 5146 }, { "epoch": 0.6547513039053555, "ewc_loss": 0.04167415201663971, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016161456005647779, "grad_norm": 5.105773448944092, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.855430006980896, "num_tokens": 196463684.0, "step": 5147 }, { "epoch": 0.654878514183946, "ewc_loss": 0.04169534146785736, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001618264359422028, "grad_norm": 5.089780807495117, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8626738786697388, "num_tokens": 196506319.0, "step": 5148 }, { "epoch": 0.6550057244625366, "ewc_loss": 0.041672177612781525, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016159484221134335, "grad_norm": 5.095660209655762, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8535056114196777, "num_tokens": 196546367.0, "step": 5149 }, { "epoch": 0.6551329347411271, "ewc_loss": 0.04170329496264458, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001619059912627563, "grad_norm": 5.206904411315918, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8451381325721741, "num_tokens": 196576598.0, "step": 5150 }, { "epoch": 0.6552601450197176, "ewc_loss": 0.04174254089593887, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016229844186455011, "grad_norm": 5.159215450286865, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8533809185028076, "num_tokens": 196610633.0, "step": 5151 }, { "epoch": 0.655387355298308, "ewc_loss": 0.0416383370757103, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016125640831887722, "grad_norm": 5.142789363861084, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8510696887969971, "num_tokens": 196646183.0, "step": 5152 }, { "epoch": 0.6555145655768986, "ewc_loss": 0.04167983680963516, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016167142894119024, "grad_norm": 5.086197376251221, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8691650629043579, "num_tokens": 196689454.0, "step": 5153 }, { "epoch": 0.6556417758554891, "ewc_loss": 0.041645873337984085, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016133177268784493, "grad_norm": 5.08706521987915, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8444958925247192, "num_tokens": 196731820.0, "step": 5154 }, { "epoch": 0.6557689861340796, "ewc_loss": 0.041664689779281616, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016151994350366294, "grad_norm": 5.179316520690918, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.850157618522644, "num_tokens": 196768542.0, "step": 5155 }, { "epoch": 0.6558961964126702, "ewc_loss": 0.041684988886117935, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016172294272109866, "grad_norm": 5.047842979431152, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8443267345428467, "num_tokens": 196810300.0, "step": 5156 }, { "epoch": 0.6560234066912607, "ewc_loss": 0.04165717959403992, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001614448701729998, "grad_norm": 5.103826522827148, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8433637022972107, "num_tokens": 196847271.0, "step": 5157 }, { "epoch": 0.6561506169698512, "ewc_loss": 0.041738130152225494, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016225433500949293, "grad_norm": 5.112042427062988, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8750826120376587, "num_tokens": 196880439.0, "step": 5158 }, { "epoch": 0.6562778272484416, "ewc_loss": 0.041705578565597534, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001619288232177496, "grad_norm": 5.141330242156982, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8528738021850586, "num_tokens": 196914911.0, "step": 5159 }, { "epoch": 0.6564050375270322, "ewc_loss": 0.04173221439123154, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016219518147408962, "grad_norm": 5.170742988586426, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.858981728553772, "num_tokens": 196949419.0, "step": 5160 }, { "epoch": 0.6565322478056227, "ewc_loss": 0.04171530902385712, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016202613187488168, "grad_norm": 5.07345724105835, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.852942943572998, "num_tokens": 196989401.0, "step": 5161 }, { "epoch": 0.6566594580842132, "ewc_loss": 0.04171539098024368, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001620269613340497, "grad_norm": 5.144419193267822, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8520010709762573, "num_tokens": 197028377.0, "step": 5162 }, { "epoch": 0.6567866683628037, "ewc_loss": 0.04173370450735092, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016221011173911393, "grad_norm": 5.131943702697754, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.838678240776062, "num_tokens": 197068334.0, "step": 5163 }, { "epoch": 0.6569138786413943, "ewc_loss": 0.041690245270729065, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016177551879081875, "grad_norm": 5.066534042358398, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8612557649612427, "num_tokens": 197105978.0, "step": 5164 }, { "epoch": 0.6570410889199847, "ewc_loss": 0.04175829887390137, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001624560245545581, "grad_norm": 5.122072696685791, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8417494297027588, "num_tokens": 197148641.0, "step": 5165 }, { "epoch": 0.6571682991985752, "ewc_loss": 0.0417451411485672, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016232446068897843, "grad_norm": 5.093098163604736, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8399389982223511, "num_tokens": 197188820.0, "step": 5166 }, { "epoch": 0.6572955094771658, "ewc_loss": 0.04175877571105957, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.000162460797582753, "grad_norm": 5.133811950683594, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8473209142684937, "num_tokens": 197227109.0, "step": 5167 }, { "epoch": 0.6574227197557563, "ewc_loss": 0.041766151785850525, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016253457579296082, "grad_norm": 5.091994285583496, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8653515577316284, "num_tokens": 197264083.0, "step": 5168 }, { "epoch": 0.6575499300343468, "ewc_loss": 0.0417105033993721, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016197809600271285, "grad_norm": 5.116628170013428, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8553051352500916, "num_tokens": 197301349.0, "step": 5169 }, { "epoch": 0.6576771403129373, "ewc_loss": 0.041781600564718246, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016268905892502517, "grad_norm": 5.1000776290893555, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8485007286071777, "num_tokens": 197348176.0, "step": 5170 }, { "epoch": 0.6578043505915278, "ewc_loss": 0.041780732572078705, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016268038598354906, "grad_norm": 5.138633728027344, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8513416647911072, "num_tokens": 197380490.0, "step": 5171 }, { "epoch": 0.6579315608701183, "ewc_loss": 0.041762568056583405, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016249873442575336, "grad_norm": 5.1416144371032715, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8455725908279419, "num_tokens": 197423075.0, "step": 5172 }, { "epoch": 0.6580587711487088, "ewc_loss": 0.04180794209241867, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001629524922464043, "grad_norm": 5.14901876449585, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8627440929412842, "num_tokens": 197461730.0, "step": 5173 }, { "epoch": 0.6581859814272993, "ewc_loss": 0.04172898828983307, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016216293442994356, "grad_norm": 5.06657600402832, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8480183482170105, "num_tokens": 197505801.0, "step": 5174 }, { "epoch": 0.6583131917058899, "ewc_loss": 0.04189968854188919, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016264924488496035, "grad_norm": 5.210419654846191, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.8753739595413208, "num_tokens": 197534643.0, "step": 5175 }, { "epoch": 0.6584404019844804, "ewc_loss": 0.041783954948186874, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016271260392386466, "grad_norm": 5.077425479888916, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8645414113998413, "num_tokens": 197576490.0, "step": 5176 }, { "epoch": 0.6585676122630708, "ewc_loss": 0.0418461374938488, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016211371985264122, "grad_norm": 5.126419544219971, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8720338940620422, "num_tokens": 197612745.0, "step": 5177 }, { "epoch": 0.6586948225416613, "ewc_loss": 0.041919708251953125, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.000162849435582757, "grad_norm": 5.119574069976807, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.854590654373169, "num_tokens": 197653756.0, "step": 5178 }, { "epoch": 0.6588220328202519, "ewc_loss": 0.04190691187977791, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016272146604023874, "grad_norm": 5.177980422973633, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8828703165054321, "num_tokens": 197694417.0, "step": 5179 }, { "epoch": 0.6589492430988424, "ewc_loss": 0.041770800948143005, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.000162581040058285, "grad_norm": 5.081964492797852, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.854290783405304, "num_tokens": 197732738.0, "step": 5180 }, { "epoch": 0.6590764533774329, "ewc_loss": 0.04194767773151398, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001631291233934462, "grad_norm": 5.26984167098999, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.850803792476654, "num_tokens": 197759408.0, "step": 5181 }, { "epoch": 0.6592036636560235, "ewc_loss": 0.041807472705841064, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001629477774258703, "grad_norm": 5.107921123504639, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8612791299819946, "num_tokens": 197794334.0, "step": 5182 }, { "epoch": 0.6593308739346139, "ewc_loss": 0.041881561279296875, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016246794257313013, "grad_norm": 5.108779430389404, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8688640594482422, "num_tokens": 197832497.0, "step": 5183 }, { "epoch": 0.6594580842132044, "ewc_loss": 0.041929878294467926, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016295115347020328, "grad_norm": 5.170504093170166, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8510042428970337, "num_tokens": 197869021.0, "step": 5184 }, { "epoch": 0.6595852944917949, "ewc_loss": 0.04194100201129913, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016306234465446323, "grad_norm": 5.15409517288208, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8449960947036743, "num_tokens": 197906365.0, "step": 5185 }, { "epoch": 0.6597125047703855, "ewc_loss": 0.04192739352583885, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.000162926284247078, "grad_norm": 5.048665523529053, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8613568544387817, "num_tokens": 197950725.0, "step": 5186 }, { "epoch": 0.659839715048976, "ewc_loss": 0.041893020272254944, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016258256800938398, "grad_norm": 5.144253253936768, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8666603565216064, "num_tokens": 197991178.0, "step": 5187 }, { "epoch": 0.6599669253275665, "ewc_loss": 0.04198852553963661, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016353759565390646, "grad_norm": 5.119306564331055, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8503731489181519, "num_tokens": 198030176.0, "step": 5188 }, { "epoch": 0.6600941356061569, "ewc_loss": 0.04179767519235611, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001628497993806377, "grad_norm": 5.118701457977295, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8535038232803345, "num_tokens": 198067169.0, "step": 5189 }, { "epoch": 0.6602213458847475, "ewc_loss": 0.04181487858295441, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016302181757055223, "grad_norm": 5.101385593414307, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8482646942138672, "num_tokens": 198104994.0, "step": 5190 }, { "epoch": 0.660348556163338, "ewc_loss": 0.04175426810979843, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016241573030129075, "grad_norm": 5.077571392059326, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8630190491676331, "num_tokens": 198142791.0, "step": 5191 }, { "epoch": 0.6604757664419285, "ewc_loss": 0.04179546982049942, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016282772412523627, "grad_norm": 5.1354169845581055, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8556958436965942, "num_tokens": 198179300.0, "step": 5192 }, { "epoch": 0.660602976720519, "ewc_loss": 0.04178847372531891, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001627577730687335, "grad_norm": 5.058391571044922, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8731608986854553, "num_tokens": 198220230.0, "step": 5193 }, { "epoch": 0.6607301869991096, "ewc_loss": 0.04180673509836197, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016294037050101906, "grad_norm": 5.2657623291015625, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8464576601982117, "num_tokens": 198251067.0, "step": 5194 }, { "epoch": 0.6608573972777, "ewc_loss": 0.04198008030653, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016345313633792102, "grad_norm": 5.140336990356445, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8672218322753906, "num_tokens": 198282130.0, "step": 5195 }, { "epoch": 0.6609846075562905, "ewc_loss": 0.04185926169157028, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001622449344722554, "grad_norm": 5.0659637451171875, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8677725791931152, "num_tokens": 198325609.0, "step": 5196 }, { "epoch": 0.661111817834881, "ewc_loss": 0.04192041605710983, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016285649326164275, "grad_norm": 5.154038906097412, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8617069721221924, "num_tokens": 198365116.0, "step": 5197 }, { "epoch": 0.6612390281134716, "ewc_loss": 0.041899174451828, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001626440935069695, "grad_norm": 5.07381534576416, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.859917402267456, "num_tokens": 198403467.0, "step": 5198 }, { "epoch": 0.6613662383920621, "ewc_loss": 0.04189520329236984, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016260439588222653, "grad_norm": 5.108057022094727, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8653134107589722, "num_tokens": 198440676.0, "step": 5199 }, { "epoch": 0.6614934486706526, "ewc_loss": 0.0419602245092392, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016325459000654519, "grad_norm": 5.083693981170654, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8610530495643616, "num_tokens": 198479082.0, "step": 5200 }, { "epoch": 0.661620658949243, "ewc_loss": 0.04178440570831299, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016271711501758546, "grad_norm": 5.087416172027588, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8547670245170593, "num_tokens": 198521741.0, "step": 5201 }, { "epoch": 0.6617478692278336, "ewc_loss": 0.04183552786707878, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016322832379955798, "grad_norm": 5.063700199127197, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8525993824005127, "num_tokens": 198562177.0, "step": 5202 }, { "epoch": 0.6618750795064241, "ewc_loss": 0.04180730879306793, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016294616216327995, "grad_norm": 5.167423248291016, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8594350814819336, "num_tokens": 198600726.0, "step": 5203 }, { "epoch": 0.6620022897850146, "ewc_loss": 0.04185847193002701, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016345774929504842, "grad_norm": 5.154836654663086, "learning_rate": 1e-06, "loss": 0.5493, "mean_token_accuracy": 0.8304122686386108, "num_tokens": 198638156.0, "step": 5204 }, { "epoch": 0.6621295000636052, "ewc_loss": 0.04177510738372803, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016262414283119142, "grad_norm": 5.0951313972473145, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8406991958618164, "num_tokens": 198680312.0, "step": 5205 }, { "epoch": 0.6622567103421957, "ewc_loss": 0.041771143674850464, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016258448886219412, "grad_norm": 5.059688568115234, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8765530586242676, "num_tokens": 198723886.0, "step": 5206 }, { "epoch": 0.6623839206207861, "ewc_loss": 0.041879285126924515, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001624451979296282, "grad_norm": 5.11212158203125, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8591331243515015, "num_tokens": 198765331.0, "step": 5207 }, { "epoch": 0.6625111308993766, "ewc_loss": 0.0420471653342247, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016290329222101718, "grad_norm": 5.138851642608643, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8627550601959229, "num_tokens": 198802421.0, "step": 5208 }, { "epoch": 0.6626383411779672, "ewc_loss": 0.0417727530002594, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016260059783235192, "grad_norm": 5.109765529632568, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8688002824783325, "num_tokens": 198843335.0, "step": 5209 }, { "epoch": 0.6627655514565577, "ewc_loss": 0.04175901412963867, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016246319864876568, "grad_norm": 5.11378288269043, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8662675023078918, "num_tokens": 198880925.0, "step": 5210 }, { "epoch": 0.6628927617351482, "ewc_loss": 0.04181496053934097, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016302264702972025, "grad_norm": 5.181307792663574, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8582714796066284, "num_tokens": 198915448.0, "step": 5211 }, { "epoch": 0.6630199720137387, "ewc_loss": 0.04192754253745079, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001629277685424313, "grad_norm": 5.069562911987305, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8691343665122986, "num_tokens": 198952254.0, "step": 5212 }, { "epoch": 0.6631471822923293, "ewc_loss": 0.04191013425588608, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016275369853246957, "grad_norm": 5.196329593658447, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8638668656349182, "num_tokens": 198990841.0, "step": 5213 }, { "epoch": 0.6632743925709197, "ewc_loss": 0.041947197169065475, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016312430670950562, "grad_norm": 5.102427005767822, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.861563503742218, "num_tokens": 199029383.0, "step": 5214 }, { "epoch": 0.6634016028495102, "ewc_loss": 0.041883695870637894, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016248930478468537, "grad_norm": 5.152356147766113, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.858735203742981, "num_tokens": 199066731.0, "step": 5215 }, { "epoch": 0.6635288131281007, "ewc_loss": 0.0419369712471962, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016302207950502634, "grad_norm": 5.146765232086182, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.871220588684082, "num_tokens": 199108752.0, "step": 5216 }, { "epoch": 0.6636560234066913, "ewc_loss": 0.04189230129122734, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001625753502594307, "grad_norm": 5.106184005737305, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8632327914237976, "num_tokens": 199146438.0, "step": 5217 }, { "epoch": 0.6637832336852818, "ewc_loss": 0.04195094481110573, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001631617924431339, "grad_norm": 5.152266025543213, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8616076111793518, "num_tokens": 199186367.0, "step": 5218 }, { "epoch": 0.6639104439638723, "ewc_loss": 0.0419473871588707, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016312622756231576, "grad_norm": 5.116983413696289, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8724373579025269, "num_tokens": 199225785.0, "step": 5219 }, { "epoch": 0.6640376542424627, "ewc_loss": 0.04193393886089325, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016299175331369042, "grad_norm": 5.14281702041626, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8627102375030518, "num_tokens": 199262574.0, "step": 5220 }, { "epoch": 0.6641648645210533, "ewc_loss": 0.04193488508462906, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016300119750667363, "grad_norm": 5.088275909423828, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8644527792930603, "num_tokens": 199299093.0, "step": 5221 }, { "epoch": 0.6642920747996438, "ewc_loss": 0.041958294808864594, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001632353087188676, "grad_norm": 5.116064548492432, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.864881694316864, "num_tokens": 199337446.0, "step": 5222 }, { "epoch": 0.6644192850782343, "ewc_loss": 0.04209697246551514, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016340138972736895, "grad_norm": 5.414203643798828, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8616995811462402, "num_tokens": 199377321.0, "step": 5223 }, { "epoch": 0.6645464953568249, "ewc_loss": 0.042023785412311554, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016389017400797457, "grad_norm": 5.07363748550415, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8701496124267578, "num_tokens": 199413057.0, "step": 5224 }, { "epoch": 0.6646737056354154, "ewc_loss": 0.041849590837955475, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016214825154747814, "grad_norm": 5.135529518127441, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8665304183959961, "num_tokens": 199452730.0, "step": 5225 }, { "epoch": 0.6648009159140058, "ewc_loss": 0.042009398341178894, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016374631377402693, "grad_norm": 5.099794864654541, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.870736837387085, "num_tokens": 199487860.0, "step": 5226 }, { "epoch": 0.6649281261925963, "ewc_loss": 0.04196804389357567, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001633327774470672, "grad_norm": 5.1169281005859375, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8672395944595337, "num_tokens": 199532790.0, "step": 5227 }, { "epoch": 0.6650553364711869, "ewc_loss": 0.042008981108665466, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001637421373743564, "grad_norm": 5.18827486038208, "learning_rate": 1e-06, "loss": 0.3843, "mean_token_accuracy": 0.8790943622589111, "num_tokens": 199561072.0, "step": 5228 }, { "epoch": 0.6651825467497774, "ewc_loss": 0.04199498891830444, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016360223526135087, "grad_norm": 5.141534328460693, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8660241365432739, "num_tokens": 199595820.0, "step": 5229 }, { "epoch": 0.6653097570283679, "ewc_loss": 0.041978660970926285, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016343894822057337, "grad_norm": 5.110804080963135, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8480498194694519, "num_tokens": 199640101.0, "step": 5230 }, { "epoch": 0.6654369673069584, "ewc_loss": 0.0420193187892437, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016384552873205394, "grad_norm": 5.12509298324585, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8558731079101562, "num_tokens": 199680287.0, "step": 5231 }, { "epoch": 0.6655641775855489, "ewc_loss": 0.04199010133743286, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016355336993001401, "grad_norm": 5.1171159744262695, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8595737814903259, "num_tokens": 199721958.0, "step": 5232 }, { "epoch": 0.6656913878641394, "ewc_loss": 0.041992951184511185, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016358186258003116, "grad_norm": 5.16607666015625, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8651713728904724, "num_tokens": 199756021.0, "step": 5233 }, { "epoch": 0.6658185981427299, "ewc_loss": 0.041991982609033585, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016357217100448906, "grad_norm": 5.094400405883789, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.864508867263794, "num_tokens": 199793586.0, "step": 5234 }, { "epoch": 0.6659458084213205, "ewc_loss": 0.042122356593608856, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016365521878469735, "grad_norm": 5.212815284729004, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8472216129302979, "num_tokens": 199835728.0, "step": 5235 }, { "epoch": 0.666073018699911, "ewc_loss": 0.042000964283943176, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016366198542527854, "grad_norm": 5.111790657043457, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8680717945098877, "num_tokens": 199873348.0, "step": 5236 }, { "epoch": 0.6662002289785015, "ewc_loss": 0.04195700213313103, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016322235751431435, "grad_norm": 5.192907810211182, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8602312803268433, "num_tokens": 199908223.0, "step": 5237 }, { "epoch": 0.6663274392570919, "ewc_loss": 0.04199633747339249, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016361571033485234, "grad_norm": 5.138014793395996, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8567606210708618, "num_tokens": 199947017.0, "step": 5238 }, { "epoch": 0.6664546495356825, "ewc_loss": 0.04192056506872177, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016285797755699605, "grad_norm": 5.196346759796143, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8681448698043823, "num_tokens": 199984806.0, "step": 5239 }, { "epoch": 0.666581859814273, "ewc_loss": 0.041970059275627136, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016335291729774326, "grad_norm": 5.192359447479248, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8537232875823975, "num_tokens": 200022600.0, "step": 5240 }, { "epoch": 0.6667090700928635, "ewc_loss": 0.041906971484422684, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001627220626687631, "grad_norm": 5.162130355834961, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8692392110824585, "num_tokens": 200058407.0, "step": 5241 }, { "epoch": 0.666836280371454, "ewc_loss": 0.04193786904215813, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016303102893289179, "grad_norm": 5.13494348526001, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8332497477531433, "num_tokens": 200101871.0, "step": 5242 }, { "epoch": 0.6669634906500446, "ewc_loss": 0.04193033277988434, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016295569366775453, "grad_norm": 5.188265800476074, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8606499433517456, "num_tokens": 200138501.0, "step": 5243 }, { "epoch": 0.667090700928635, "ewc_loss": 0.041956089437007904, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016321321891155094, "grad_norm": 5.1752166748046875, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8591201901435852, "num_tokens": 200176639.0, "step": 5244 }, { "epoch": 0.6672179112072255, "ewc_loss": 0.04191732034087181, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016282555588986725, "grad_norm": 5.1946916580200195, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8547898530960083, "num_tokens": 200211873.0, "step": 5245 }, { "epoch": 0.667345121485816, "ewc_loss": 0.04209504276514053, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016338209388777614, "grad_norm": 5.223644733428955, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8559743762016296, "num_tokens": 200246596.0, "step": 5246 }, { "epoch": 0.6674723317644066, "ewc_loss": 0.0419190376996994, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016284269804600626, "grad_norm": 5.078003883361816, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8752806782722473, "num_tokens": 200282761.0, "step": 5247 }, { "epoch": 0.6675995420429971, "ewc_loss": 0.041955504566431046, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001632073981454596, "grad_norm": 5.167016506195068, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8452891707420349, "num_tokens": 200321821.0, "step": 5248 }, { "epoch": 0.6677267523215876, "ewc_loss": 0.04201018065214157, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016375412815250456, "grad_norm": 5.0943403244018555, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8439021110534668, "num_tokens": 200369066.0, "step": 5249 }, { "epoch": 0.667853962600178, "ewc_loss": 0.04198416322469711, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016349399811588228, "grad_norm": 5.162199974060059, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8538338541984558, "num_tokens": 200408067.0, "step": 5250 }, { "epoch": 0.6679811728787686, "ewc_loss": 0.042044349014759064, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016409580712206662, "grad_norm": 5.084596157073975, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8770602941513062, "num_tokens": 200446693.0, "step": 5251 }, { "epoch": 0.6681083831573591, "ewc_loss": 0.04199855774641037, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016363791655749083, "grad_norm": 5.172229766845703, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8412173390388489, "num_tokens": 200485340.0, "step": 5252 }, { "epoch": 0.6682355934359496, "ewc_loss": 0.04205266386270523, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001641790004214272, "grad_norm": 5.129098892211914, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8563669919967651, "num_tokens": 200521978.0, "step": 5253 }, { "epoch": 0.6683628037145402, "ewc_loss": 0.042049117386341095, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016414349374826998, "grad_norm": 5.185017108917236, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8604936599731445, "num_tokens": 200560604.0, "step": 5254 }, { "epoch": 0.6684900139931307, "ewc_loss": 0.042083099484443665, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016448333917651325, "grad_norm": 5.111642837524414, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8562811613082886, "num_tokens": 200601718.0, "step": 5255 }, { "epoch": 0.6686172242717211, "ewc_loss": 0.042049601674079895, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016414836863987148, "grad_norm": 5.121812343597412, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8351917266845703, "num_tokens": 200646186.0, "step": 5256 }, { "epoch": 0.6687444345503116, "ewc_loss": 0.042057208716869354, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016422444605268538, "grad_norm": 5.1680006980896, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.86250901222229, "num_tokens": 200684849.0, "step": 5257 }, { "epoch": 0.6688716448289022, "ewc_loss": 0.041878677904605865, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001636598026379943, "grad_norm": 5.12561559677124, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8656657934188843, "num_tokens": 200719542.0, "step": 5258 }, { "epoch": 0.6689988551074927, "ewc_loss": 0.042061127722263336, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016426361980848014, "grad_norm": 5.139739513397217, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8613275289535522, "num_tokens": 200756212.0, "step": 5259 }, { "epoch": 0.6691260653860832, "ewc_loss": 0.04203855246305466, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016403789049945772, "grad_norm": 5.133406162261963, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8624715805053711, "num_tokens": 200791184.0, "step": 5260 }, { "epoch": 0.6692532756646737, "ewc_loss": 0.04208095744252205, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001644619187572971, "grad_norm": 5.227264881134033, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8431462645530701, "num_tokens": 200823966.0, "step": 5261 }, { "epoch": 0.6693804859432643, "ewc_loss": 0.04205092042684555, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016416153812315315, "grad_norm": 5.08868932723999, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8644267320632935, "num_tokens": 200862718.0, "step": 5262 }, { "epoch": 0.6695076962218547, "ewc_loss": 0.041932739317417145, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.0001642004499444738, "grad_norm": 5.1323723793029785, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8550856709480286, "num_tokens": 200901755.0, "step": 5263 }, { "epoch": 0.6696349065004452, "ewc_loss": 0.04197714477777481, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016464450163766742, "grad_norm": 5.135266304016113, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8717137575149536, "num_tokens": 200935788.0, "step": 5264 }, { "epoch": 0.6697621167790357, "ewc_loss": 0.042086437344551086, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001645167067181319, "grad_norm": 5.148495674133301, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8741767406463623, "num_tokens": 200975159.0, "step": 5265 }, { "epoch": 0.6698893270576263, "ewc_loss": 0.042075783014297485, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016441017214674503, "grad_norm": 5.129743576049805, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8718181848526001, "num_tokens": 201010813.0, "step": 5266 }, { "epoch": 0.6700165373362168, "ewc_loss": 0.04209287464618683, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001645810843911022, "grad_norm": 5.166871547698975, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8591418266296387, "num_tokens": 201051937.0, "step": 5267 }, { "epoch": 0.6701437476148073, "ewc_loss": 0.04213591665029526, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016501150093972683, "grad_norm": 5.134977340698242, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8665218353271484, "num_tokens": 201090926.0, "step": 5268 }, { "epoch": 0.6702709578933977, "ewc_loss": 0.041932448744773865, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016419755411334336, "grad_norm": 5.180325031280518, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8404292464256287, "num_tokens": 201126086.0, "step": 5269 }, { "epoch": 0.6703981681719883, "ewc_loss": 0.04198917746543884, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016476480232086033, "grad_norm": 5.1588134765625, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8495216369628906, "num_tokens": 201165933.0, "step": 5270 }, { "epoch": 0.6705253784505788, "ewc_loss": 0.04193366318941116, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016420969041064382, "grad_norm": 5.119470596313477, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8634263277053833, "num_tokens": 201198794.0, "step": 5271 }, { "epoch": 0.6706525887291693, "ewc_loss": 0.042228445410728455, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001647161116125062, "grad_norm": 5.167286396026611, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.850780725479126, "num_tokens": 201234081.0, "step": 5272 }, { "epoch": 0.6707797990077599, "ewc_loss": 0.041968367993831635, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016455670993309468, "grad_norm": 5.124492168426514, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8566036224365234, "num_tokens": 201271796.0, "step": 5273 }, { "epoch": 0.6709070092863504, "ewc_loss": 0.04211433231830597, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016479568148497492, "grad_norm": 5.153120517730713, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8414990305900574, "num_tokens": 201309526.0, "step": 5274 }, { "epoch": 0.6710342195649408, "ewc_loss": 0.0420815646648407, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016446800145786256, "grad_norm": 5.109941482543945, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8671563267707825, "num_tokens": 201344015.0, "step": 5275 }, { "epoch": 0.6711614298435313, "ewc_loss": 0.042125679552555084, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016490912821609527, "grad_norm": 5.11433744430542, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8602712154388428, "num_tokens": 201385201.0, "step": 5276 }, { "epoch": 0.6712886401221219, "ewc_loss": 0.04222901165485382, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016472177230753005, "grad_norm": 5.091866970062256, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8653213977813721, "num_tokens": 201422370.0, "step": 5277 }, { "epoch": 0.6714158504007124, "ewc_loss": 0.04211628437042236, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016481518105138093, "grad_norm": 5.09597635269165, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8446418046951294, "num_tokens": 201467085.0, "step": 5278 }, { "epoch": 0.6715430606793029, "ewc_loss": 0.04228059574961662, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016523759404662997, "grad_norm": 5.15228796005249, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8737329840660095, "num_tokens": 201503432.0, "step": 5279 }, { "epoch": 0.6716702709578934, "ewc_loss": 0.0422859713435173, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016529133426956832, "grad_norm": 5.175173282623291, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.866342306137085, "num_tokens": 201541358.0, "step": 5280 }, { "epoch": 0.6717974812364839, "ewc_loss": 0.04226457327604294, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001650773483561352, "grad_norm": 5.13286018371582, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8561370372772217, "num_tokens": 201580235.0, "step": 5281 }, { "epoch": 0.6719246915150744, "ewc_loss": 0.04212323576211929, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.00016488471010234207, "grad_norm": 5.075817108154297, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8598257303237915, "num_tokens": 201627703.0, "step": 5282 }, { "epoch": 0.6720519017936649, "ewc_loss": 0.04213942214846611, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001650465710554272, "grad_norm": 5.130436420440674, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8622421026229858, "num_tokens": 201667182.0, "step": 5283 }, { "epoch": 0.6721791120722554, "ewc_loss": 0.04224570840597153, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016488869732711464, "grad_norm": 5.18789529800415, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8616031408309937, "num_tokens": 201699876.0, "step": 5284 }, { "epoch": 0.672306322350846, "ewc_loss": 0.04227566719055176, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016518833581358194, "grad_norm": 5.214107036590576, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8500931859016418, "num_tokens": 201732770.0, "step": 5285 }, { "epoch": 0.6724335326294365, "ewc_loss": 0.04223944991827011, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001648261386435479, "grad_norm": 5.117486953735352, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8613002300262451, "num_tokens": 201771388.0, "step": 5286 }, { "epoch": 0.6725607429080269, "ewc_loss": 0.042222633957862854, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016465796215925366, "grad_norm": 5.157975196838379, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8542547821998596, "num_tokens": 201809283.0, "step": 5287 }, { "epoch": 0.6726879531866174, "ewc_loss": 0.0422629788517952, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016506142856087536, "grad_norm": 5.107954978942871, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8658046126365662, "num_tokens": 201854950.0, "step": 5288 }, { "epoch": 0.672815163465208, "ewc_loss": 0.04221005365252495, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016453217540401965, "grad_norm": 5.111008644104004, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.864436149597168, "num_tokens": 201895638.0, "step": 5289 }, { "epoch": 0.6729423737437985, "ewc_loss": 0.042232245206832886, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016475409211125225, "grad_norm": 5.16399621963501, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8460584878921509, "num_tokens": 201928243.0, "step": 5290 }, { "epoch": 0.673069584022389, "ewc_loss": 0.04224499315023422, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016488158144056797, "grad_norm": 5.1374711990356445, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.859587550163269, "num_tokens": 201964687.0, "step": 5291 }, { "epoch": 0.6731967943009796, "ewc_loss": 0.04222903028130531, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016472194693051279, "grad_norm": 5.13120174407959, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.872258186340332, "num_tokens": 201997166.0, "step": 5292 }, { "epoch": 0.67332400457957, "ewc_loss": 0.04223659634590149, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001647976168897003, "grad_norm": 5.1322832107543945, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8615297675132751, "num_tokens": 202030771.0, "step": 5293 }, { "epoch": 0.6734512148581605, "ewc_loss": 0.042227234691381454, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016470398986712098, "grad_norm": 5.122912406921387, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8521450161933899, "num_tokens": 202077280.0, "step": 5294 }, { "epoch": 0.673578425136751, "ewc_loss": 0.04224672168493271, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016489885456394404, "grad_norm": 5.143700122833252, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.851648211479187, "num_tokens": 202115273.0, "step": 5295 }, { "epoch": 0.6737056354153416, "ewc_loss": 0.042261045426130295, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016504210361745209, "grad_norm": 5.225897312164307, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8700198531150818, "num_tokens": 202145347.0, "step": 5296 }, { "epoch": 0.6738328456939321, "ewc_loss": 0.04223288595676422, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016476049495395273, "grad_norm": 5.123488426208496, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8641044497489929, "num_tokens": 202184621.0, "step": 5297 }, { "epoch": 0.6739600559725226, "ewc_loss": 0.042185574769973755, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016428741218987852, "grad_norm": 5.073570728302002, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8625268340110779, "num_tokens": 202230099.0, "step": 5298 }, { "epoch": 0.674087266251113, "ewc_loss": 0.04220150411128998, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016444669745396823, "grad_norm": 5.14558744430542, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8527832627296448, "num_tokens": 202267872.0, "step": 5299 }, { "epoch": 0.6742144765297036, "ewc_loss": 0.04223087430000305, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001647403696551919, "grad_norm": 5.1500067710876465, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.858208179473877, "num_tokens": 202305517.0, "step": 5300 }, { "epoch": 0.6743416868082941, "ewc_loss": 0.042199306190013885, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016442469495814294, "grad_norm": 5.157031059265137, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8457844853401184, "num_tokens": 202340671.0, "step": 5301 }, { "epoch": 0.6744688970868846, "ewc_loss": 0.04225227236747742, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001649543846724555, "grad_norm": 5.138241767883301, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8680205345153809, "num_tokens": 202379552.0, "step": 5302 }, { "epoch": 0.6745961073654752, "ewc_loss": 0.04222823306918144, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016471397248096764, "grad_norm": 5.099515914916992, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8547824621200562, "num_tokens": 202421404.0, "step": 5303 }, { "epoch": 0.6747233176440657, "ewc_loss": 0.042244985699653625, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001648814941290766, "grad_norm": 5.108845233917236, "learning_rate": 1e-06, "loss": 0.5085, "mean_token_accuracy": 0.8451599478721619, "num_tokens": 202463765.0, "step": 5304 }, { "epoch": 0.6748505279226561, "ewc_loss": 0.042234137654304504, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016477302415296435, "grad_norm": 5.119728088378906, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8546838760375977, "num_tokens": 202505872.0, "step": 5305 }, { "epoch": 0.6749777382012466, "ewc_loss": 0.042244333773851395, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001648749748710543, "grad_norm": 5.113502025604248, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8571651577949524, "num_tokens": 202544085.0, "step": 5306 }, { "epoch": 0.6751049484798372, "ewc_loss": 0.04224812239408493, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016491285350639373, "grad_norm": 5.116232395172119, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8565468192100525, "num_tokens": 202588237.0, "step": 5307 }, { "epoch": 0.6752321587584277, "ewc_loss": 0.04225872457027435, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016501889331266284, "grad_norm": 5.147700309753418, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8532323837280273, "num_tokens": 202628462.0, "step": 5308 }, { "epoch": 0.6753593690370182, "ewc_loss": 0.04226894676685333, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016512113506905735, "grad_norm": 5.088997840881348, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8615717887878418, "num_tokens": 202668149.0, "step": 5309 }, { "epoch": 0.6754865793156087, "ewc_loss": 0.04227295517921448, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016516121104359627, "grad_norm": 5.132463455200195, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.874953031539917, "num_tokens": 202708651.0, "step": 5310 }, { "epoch": 0.6756137895941993, "ewc_loss": 0.04223721846938133, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001648038305575028, "grad_norm": 5.1936421394348145, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8645427227020264, "num_tokens": 202738613.0, "step": 5311 }, { "epoch": 0.6757409998727897, "ewc_loss": 0.042016156017780304, "ewc_loss_diag": 2.5510787963867188e-05, "ewc_loss_parallel": 0.00016503459482919425, "grad_norm": 5.114800453186035, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.856066882610321, "num_tokens": 202779279.0, "step": 5312 }, { "epoch": 0.6758682101513802, "ewc_loss": 0.04227603226900101, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016519198834430426, "grad_norm": 5.218449592590332, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8597027659416199, "num_tokens": 202812095.0, "step": 5313 }, { "epoch": 0.6759954204299707, "ewc_loss": 0.04223690927028656, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016480073099955916, "grad_norm": 5.143627643585205, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8550432920455933, "num_tokens": 202855705.0, "step": 5314 }, { "epoch": 0.6761226307085613, "ewc_loss": 0.04215703904628754, "ewc_loss_diag": 2.562999725341797e-05, "ewc_loss_parallel": 0.0001652227365411818, "grad_norm": 5.19679594039917, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8427660465240479, "num_tokens": 202894180.0, "step": 5315 }, { "epoch": 0.6762498409871518, "ewc_loss": 0.042265817523002625, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001650898193474859, "grad_norm": 5.198945045471191, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8670047521591187, "num_tokens": 202925842.0, "step": 5316 }, { "epoch": 0.6763770512657423, "ewc_loss": 0.04223985970020294, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001648302422836423, "grad_norm": 5.167099952697754, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8651789426803589, "num_tokens": 202959103.0, "step": 5317 }, { "epoch": 0.6765042615443327, "ewc_loss": 0.042192939668893814, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016436104488093406, "grad_norm": 5.119406223297119, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8596417903900146, "num_tokens": 203000393.0, "step": 5318 }, { "epoch": 0.6766314718229233, "ewc_loss": 0.04224412143230438, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001648728793952614, "grad_norm": 5.177919387817383, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8601272106170654, "num_tokens": 203041613.0, "step": 5319 }, { "epoch": 0.6767586821015138, "ewc_loss": 0.04220793396234512, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016451097326353192, "grad_norm": 5.136401653289795, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8749860525131226, "num_tokens": 203073744.0, "step": 5320 }, { "epoch": 0.6768858923801043, "ewc_loss": 0.042232222855091095, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001647538592806086, "grad_norm": 5.1775312423706055, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8441274166107178, "num_tokens": 203108350.0, "step": 5321 }, { "epoch": 0.6770131026586949, "ewc_loss": 0.04220467060804367, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016447836242150515, "grad_norm": 5.206900596618652, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.862329363822937, "num_tokens": 203139149.0, "step": 5322 }, { "epoch": 0.6771403129372854, "ewc_loss": 0.042237769812345505, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016480934573337436, "grad_norm": 5.166902542114258, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8427845239639282, "num_tokens": 203173953.0, "step": 5323 }, { "epoch": 0.6772675232158758, "ewc_loss": 0.04224591702222824, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016489082190673798, "grad_norm": 5.143515586853027, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8692114949226379, "num_tokens": 203208915.0, "step": 5324 }, { "epoch": 0.6773947334944663, "ewc_loss": 0.04223814606666565, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001648131146794185, "grad_norm": 5.1384453773498535, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8473663330078125, "num_tokens": 203251906.0, "step": 5325 }, { "epoch": 0.6775219437730569, "ewc_loss": 0.04230411350727081, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016547278210055083, "grad_norm": 5.176910400390625, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8596861362457275, "num_tokens": 203289460.0, "step": 5326 }, { "epoch": 0.6776491540516474, "ewc_loss": 0.04225641489028931, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016499579942319542, "grad_norm": 5.134171485900879, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8685192465782166, "num_tokens": 203325647.0, "step": 5327 }, { "epoch": 0.6777763643302379, "ewc_loss": 0.04227280244231224, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016515966854058206, "grad_norm": 5.130953788757324, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8546432256698608, "num_tokens": 203364972.0, "step": 5328 }, { "epoch": 0.6779035746088284, "ewc_loss": 0.04228261113166809, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001652577775530517, "grad_norm": 5.110171794891357, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8596561551094055, "num_tokens": 203407778.0, "step": 5329 }, { "epoch": 0.6780307848874189, "ewc_loss": 0.04229441285133362, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016537579358555377, "grad_norm": 5.21785306930542, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8605867624282837, "num_tokens": 203443820.0, "step": 5330 }, { "epoch": 0.6781579951660094, "ewc_loss": 0.042316921055316925, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001656008535064757, "grad_norm": 5.191736221313477, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8651196956634521, "num_tokens": 203475742.0, "step": 5331 }, { "epoch": 0.6782852054445999, "ewc_loss": 0.04228930547833443, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016532468725927174, "grad_norm": 5.245864391326904, "learning_rate": 1e-06, "loss": 0.5398, "mean_token_accuracy": 0.8329477906227112, "num_tokens": 203512617.0, "step": 5332 }, { "epoch": 0.6784124157231904, "ewc_loss": 0.042304620146751404, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016547783161513507, "grad_norm": 5.192736625671387, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8445582985877991, "num_tokens": 203546778.0, "step": 5333 }, { "epoch": 0.678539626001781, "ewc_loss": 0.04223247990012169, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016475643496960402, "grad_norm": 5.1579484939575195, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8616756200790405, "num_tokens": 203585222.0, "step": 5334 }, { "epoch": 0.6786668362803715, "ewc_loss": 0.04225456714630127, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001649773184908554, "grad_norm": 5.154879570007324, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8531447649002075, "num_tokens": 203627014.0, "step": 5335 }, { "epoch": 0.6787940465589619, "ewc_loss": 0.042270224541425705, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001651338825467974, "grad_norm": 5.192175388336182, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8649416565895081, "num_tokens": 203659189.0, "step": 5336 }, { "epoch": 0.6789212568375524, "ewc_loss": 0.04224264621734619, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016485810920130461, "grad_norm": 5.130602836608887, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8693594932556152, "num_tokens": 203695355.0, "step": 5337 }, { "epoch": 0.679048467116143, "ewc_loss": 0.04227440804243088, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016517571930307895, "grad_norm": 5.156100273132324, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8681852221488953, "num_tokens": 203733446.0, "step": 5338 }, { "epoch": 0.6791756773947335, "ewc_loss": 0.04229462146759033, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001653778599575162, "grad_norm": 5.166973114013672, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8425983190536499, "num_tokens": 203775387.0, "step": 5339 }, { "epoch": 0.679302887673324, "ewc_loss": 0.04227335378527641, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016516516916453838, "grad_norm": 5.18997859954834, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8545328378677368, "num_tokens": 203810713.0, "step": 5340 }, { "epoch": 0.6794300979519146, "ewc_loss": 0.04239530488848686, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016516399045940489, "grad_norm": 5.151591777801514, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8716636896133423, "num_tokens": 203847144.0, "step": 5341 }, { "epoch": 0.679557308230505, "ewc_loss": 0.04240596666932106, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016527061234228313, "grad_norm": 5.176492691040039, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8654725551605225, "num_tokens": 203889716.0, "step": 5342 }, { "epoch": 0.6796845185090955, "ewc_loss": 0.042392224073410034, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.0001651331695029512, "grad_norm": 5.254000663757324, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8466760516166687, "num_tokens": 203927692.0, "step": 5343 }, { "epoch": 0.679811728787686, "ewc_loss": 0.04235939681529999, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016480492195114493, "grad_norm": 5.151388168334961, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8477643728256226, "num_tokens": 203960180.0, "step": 5344 }, { "epoch": 0.6799389390662766, "ewc_loss": 0.04232846200466156, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016449556278530508, "grad_norm": 5.132744789123535, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8709881901741028, "num_tokens": 203997375.0, "step": 5345 }, { "epoch": 0.6800661493448671, "ewc_loss": 0.042371124029159546, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016492219583597034, "grad_norm": 5.171864032745361, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8554574847221375, "num_tokens": 204037588.0, "step": 5346 }, { "epoch": 0.6801933596234576, "ewc_loss": 0.04219230264425278, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016435468569397926, "grad_norm": 5.075134754180908, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.852103590965271, "num_tokens": 204080993.0, "step": 5347 }, { "epoch": 0.680320569902048, "ewc_loss": 0.04236350953578949, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016484601655974984, "grad_norm": 5.237838268280029, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8608625531196594, "num_tokens": 204116863.0, "step": 5348 }, { "epoch": 0.6804477801806386, "ewc_loss": 0.04228733479976654, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016530499851796776, "grad_norm": 5.131155014038086, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8540764451026917, "num_tokens": 204156551.0, "step": 5349 }, { "epoch": 0.6805749904592291, "ewc_loss": 0.04223719239234924, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016480355407111347, "grad_norm": 5.132760524749756, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8525398373603821, "num_tokens": 204196210.0, "step": 5350 }, { "epoch": 0.6807022007378196, "ewc_loss": 0.04241422191262245, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016535315080545843, "grad_norm": 5.242837905883789, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8577588796615601, "num_tokens": 204234791.0, "step": 5351 }, { "epoch": 0.6808294110164101, "ewc_loss": 0.04242141544818878, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016542506637051702, "grad_norm": 5.17260217666626, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8627752065658569, "num_tokens": 204271631.0, "step": 5352 }, { "epoch": 0.6809566212950007, "ewc_loss": 0.042350850999355316, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016471942944917828, "grad_norm": 5.176810264587402, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8698793053627014, "num_tokens": 204311879.0, "step": 5353 }, { "epoch": 0.6810838315735911, "ewc_loss": 0.04236224666237831, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016483340004924685, "grad_norm": 5.130436420440674, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8632247447967529, "num_tokens": 204357123.0, "step": 5354 }, { "epoch": 0.6812110418521816, "ewc_loss": 0.04237627983093262, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.000164973724167794, "grad_norm": 5.159061431884766, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8578253388404846, "num_tokens": 204397196.0, "step": 5355 }, { "epoch": 0.6813382521307721, "ewc_loss": 0.04236795753240585, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016489050176460296, "grad_norm": 5.196433067321777, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8504027128219604, "num_tokens": 204434654.0, "step": 5356 }, { "epoch": 0.6814654624093627, "ewc_loss": 0.04240012168884277, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016521214274689555, "grad_norm": 5.195915222167969, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8815315961837769, "num_tokens": 204466581.0, "step": 5357 }, { "epoch": 0.6815926726879532, "ewc_loss": 0.04234381020069122, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016464902728330344, "grad_norm": 5.151706218719482, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8714704513549805, "num_tokens": 204503890.0, "step": 5358 }, { "epoch": 0.6817198829665437, "ewc_loss": 0.042374830693006516, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016495924501214176, "grad_norm": 5.255980014801025, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8552952408790588, "num_tokens": 204537160.0, "step": 5359 }, { "epoch": 0.6818470932451343, "ewc_loss": 0.0422695055603981, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001651267084525898, "grad_norm": 5.189550876617432, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8693797588348389, "num_tokens": 204570077.0, "step": 5360 }, { "epoch": 0.6819743035237247, "ewc_loss": 0.04224654287099838, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016489706467837095, "grad_norm": 5.156797409057617, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8747638463973999, "num_tokens": 204603638.0, "step": 5361 }, { "epoch": 0.6821015138023152, "ewc_loss": 0.042250003665685654, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016493168368469924, "grad_norm": 5.152050971984863, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8560453653335571, "num_tokens": 204644584.0, "step": 5362 }, { "epoch": 0.6822287240809057, "ewc_loss": 0.04239831492304802, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016519408382009715, "grad_norm": 5.223855018615723, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.8463242053985596, "num_tokens": 204678459.0, "step": 5363 }, { "epoch": 0.6823559343594963, "ewc_loss": 0.042398106306791306, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.0001651920028962195, "grad_norm": 5.180074214935303, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8565842509269714, "num_tokens": 204712923.0, "step": 5364 }, { "epoch": 0.6824831446380868, "ewc_loss": 0.04239755868911743, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.0001651865168241784, "grad_norm": 5.15975284576416, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8422508239746094, "num_tokens": 204751303.0, "step": 5365 }, { "epoch": 0.6826103549166773, "ewc_loss": 0.042392242699861526, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.0001651333732297644, "grad_norm": 5.1139960289001465, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8602800369262695, "num_tokens": 204791685.0, "step": 5366 }, { "epoch": 0.6827375651952677, "ewc_loss": 0.0424635112285614, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016584603872615844, "grad_norm": 5.181475639343262, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.859588086605072, "num_tokens": 204827748.0, "step": 5367 }, { "epoch": 0.6828647754738583, "ewc_loss": 0.04247484356164932, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016595936904195696, "grad_norm": 5.152073383331299, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8764798641204834, "num_tokens": 204866568.0, "step": 5368 }, { "epoch": 0.6829919857524488, "ewc_loss": 0.04247015714645386, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016591252642683685, "grad_norm": 5.177205562591553, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8581239581108093, "num_tokens": 204908334.0, "step": 5369 }, { "epoch": 0.6831191960310393, "ewc_loss": 0.042490653693675995, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.0001661174901528284, "grad_norm": 5.158969402313232, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8622502088546753, "num_tokens": 204949929.0, "step": 5370 }, { "epoch": 0.6832464063096299, "ewc_loss": 0.04247880354523659, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016599897935520858, "grad_norm": 5.171602249145508, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8497114181518555, "num_tokens": 204991435.0, "step": 5371 }, { "epoch": 0.6833736165882204, "ewc_loss": 0.04246331751346588, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016584408876951784, "grad_norm": 5.188006401062012, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8657950162887573, "num_tokens": 205028434.0, "step": 5372 }, { "epoch": 0.6835008268668108, "ewc_loss": 0.04247410222887993, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016595196211710572, "grad_norm": 5.215411186218262, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8541474342346191, "num_tokens": 205064138.0, "step": 5373 }, { "epoch": 0.6836280371454013, "ewc_loss": 0.042448826134204865, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016569918079767376, "grad_norm": 5.143303871154785, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8530448079109192, "num_tokens": 205104058.0, "step": 5374 }, { "epoch": 0.6837552474239919, "ewc_loss": 0.042425185441970825, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016546277038287371, "grad_norm": 5.227048873901367, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8589804172515869, "num_tokens": 205142609.0, "step": 5375 }, { "epoch": 0.6838824577025824, "ewc_loss": 0.04247100651264191, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016592098108958453, "grad_norm": 5.181265354156494, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8694444894790649, "num_tokens": 205180407.0, "step": 5376 }, { "epoch": 0.6840096679811729, "ewc_loss": 0.04241596907377243, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016537061310373247, "grad_norm": 5.173628330230713, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8809463977813721, "num_tokens": 205217340.0, "step": 5377 }, { "epoch": 0.6841368782597634, "ewc_loss": 0.04241372272372246, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.0001653481594985351, "grad_norm": 5.141376972198486, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8472352027893066, "num_tokens": 205263671.0, "step": 5378 }, { "epoch": 0.6842640885383539, "ewc_loss": 0.04242384433746338, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016544939717277884, "grad_norm": 5.182187080383301, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8570466041564941, "num_tokens": 205302970.0, "step": 5379 }, { "epoch": 0.6843912988169444, "ewc_loss": 0.04238394647836685, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016505041276104748, "grad_norm": 5.145720481872559, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8639049530029297, "num_tokens": 205342247.0, "step": 5380 }, { "epoch": 0.6845185090955349, "ewc_loss": 0.04241460561752319, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016535699251107872, "grad_norm": 5.1981000900268555, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8442364931106567, "num_tokens": 205374897.0, "step": 5381 }, { "epoch": 0.6846457193741254, "ewc_loss": 0.04241364821791649, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016534741735085845, "grad_norm": 5.135973930358887, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8563389778137207, "num_tokens": 205411645.0, "step": 5382 }, { "epoch": 0.684772929652716, "ewc_loss": 0.042442791163921356, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016563884855713695, "grad_norm": 5.159802436828613, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8596498370170593, "num_tokens": 205450972.0, "step": 5383 }, { "epoch": 0.6849001399313065, "ewc_loss": 0.042415253818035126, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016536348266527057, "grad_norm": 5.10581636428833, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8608399629592896, "num_tokens": 205494314.0, "step": 5384 }, { "epoch": 0.6850273502098969, "ewc_loss": 0.04241744428873062, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016538536874577403, "grad_norm": 5.1594743728637695, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8474407196044922, "num_tokens": 205534696.0, "step": 5385 }, { "epoch": 0.6851545604884874, "ewc_loss": 0.0424446240067482, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016565716941840947, "grad_norm": 5.137024402618408, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8711077570915222, "num_tokens": 205575579.0, "step": 5386 }, { "epoch": 0.685281770767078, "ewc_loss": 0.042437758296728134, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016558851348236203, "grad_norm": 5.165506839752197, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.863190233707428, "num_tokens": 205609249.0, "step": 5387 }, { "epoch": 0.6854089810456685, "ewc_loss": 0.0424727126955986, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016593807958997786, "grad_norm": 5.185203552246094, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8564434051513672, "num_tokens": 205644135.0, "step": 5388 }, { "epoch": 0.685536191324259, "ewc_loss": 0.04246760532259941, "ewc_loss_diag": 2.586841583251953e-05, "ewc_loss_parallel": 0.00016588698781561106, "grad_norm": 5.199965476989746, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.85205078125, "num_tokens": 205685312.0, "step": 5389 }, { "epoch": 0.6856634016028496, "ewc_loss": 0.04237965866923332, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016622823022771627, "grad_norm": 5.216176986694336, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8531639575958252, "num_tokens": 205720525.0, "step": 5390 }, { "epoch": 0.68579061188144, "ewc_loss": 0.04235527664422989, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016598438378423452, "grad_norm": 5.135087966918945, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8398138284683228, "num_tokens": 205763358.0, "step": 5391 }, { "epoch": 0.6859178221600305, "ewc_loss": 0.04236622899770737, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001660939451539889, "grad_norm": 5.1947455406188965, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8617441058158875, "num_tokens": 205801413.0, "step": 5392 }, { "epoch": 0.686045032438621, "ewc_loss": 0.04237094894051552, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016614112246315926, "grad_norm": 5.201562404632568, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8613766431808472, "num_tokens": 205840442.0, "step": 5393 }, { "epoch": 0.6861722427172116, "ewc_loss": 0.04235953837633133, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001660270499996841, "grad_norm": 5.208142280578613, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.871766209602356, "num_tokens": 205877319.0, "step": 5394 }, { "epoch": 0.6862994529958021, "ewc_loss": 0.042347684502601624, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016590848099440336, "grad_norm": 5.190658092498779, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8576511740684509, "num_tokens": 205916186.0, "step": 5395 }, { "epoch": 0.6864266632743926, "ewc_loss": 0.042342402040958405, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.00016585564299020916, "grad_norm": 5.18333625793457, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.848676860332489, "num_tokens": 205956336.0, "step": 5396 }, { "epoch": 0.686553873552983, "ewc_loss": 0.04235846921801567, "ewc_loss_diag": 2.574920654296875e-05, "ewc_loss_parallel": 0.0001660163252381608, "grad_norm": 5.193202495574951, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8573026657104492, "num_tokens": 205989570.0, "step": 5397 }, { "epoch": 0.6866810838315736, "ewc_loss": 0.042610980570316315, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001661000569583848, "grad_norm": 5.178868293762207, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8510966300964355, "num_tokens": 206026139.0, "step": 5398 }, { "epoch": 0.6868082941101641, "ewc_loss": 0.042628876864910126, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016627900185994804, "grad_norm": 5.1492791175842285, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8538281917572021, "num_tokens": 206070059.0, "step": 5399 }, { "epoch": 0.6869355043887546, "ewc_loss": 0.04260503873229027, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016604062693659216, "grad_norm": 5.19221305847168, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8634344339370728, "num_tokens": 206104645.0, "step": 5400 }, { "epoch": 0.6870627146673451, "ewc_loss": 0.04266277700662613, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016661798872519284, "grad_norm": 5.183187007904053, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8568437695503235, "num_tokens": 206141220.0, "step": 5401 }, { "epoch": 0.6871899249459357, "ewc_loss": 0.042609695345163345, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016608719306532294, "grad_norm": 5.185894012451172, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8559473156929016, "num_tokens": 206177917.0, "step": 5402 }, { "epoch": 0.6873171352245261, "ewc_loss": 0.04264253377914429, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001664155861362815, "grad_norm": 5.202780723571777, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8573850989341736, "num_tokens": 206213419.0, "step": 5403 }, { "epoch": 0.6874443455031166, "ewc_loss": 0.04266420006752014, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016663224960211664, "grad_norm": 5.203232288360596, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8428908586502075, "num_tokens": 206252421.0, "step": 5404 }, { "epoch": 0.6875715557817071, "ewc_loss": 0.04261650890111923, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016615531058050692, "grad_norm": 5.175158500671387, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8641423583030701, "num_tokens": 206287195.0, "step": 5405 }, { "epoch": 0.6876987660602977, "ewc_loss": 0.042614735662937164, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016613761545158923, "grad_norm": 5.149324417114258, "learning_rate": 1e-06, "loss": 0.5282, "mean_token_accuracy": 0.839799165725708, "num_tokens": 206328339.0, "step": 5406 }, { "epoch": 0.6878259763388882, "ewc_loss": 0.04264657571911812, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016645599680487067, "grad_norm": 5.197824954986572, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8443434238433838, "num_tokens": 206369523.0, "step": 5407 }, { "epoch": 0.6879531866174787, "ewc_loss": 0.04264316335320473, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016642187256366014, "grad_norm": 5.178561210632324, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.87862628698349, "num_tokens": 206402864.0, "step": 5408 }, { "epoch": 0.6880803968960693, "ewc_loss": 0.04264995455741882, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016648978635203093, "grad_norm": 5.224102973937988, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8469512462615967, "num_tokens": 206439004.0, "step": 5409 }, { "epoch": 0.6882076071746597, "ewc_loss": 0.042600926011800766, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016599948867224157, "grad_norm": 5.191864013671875, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8611572980880737, "num_tokens": 206472320.0, "step": 5410 }, { "epoch": 0.6883348174532502, "ewc_loss": 0.042559683322906494, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001655870582908392, "grad_norm": 5.108804702758789, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8506755232810974, "num_tokens": 206518649.0, "step": 5411 }, { "epoch": 0.6884620277318407, "ewc_loss": 0.0425783172249794, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001657733810134232, "grad_norm": 5.238504409790039, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8736057281494141, "num_tokens": 206551318.0, "step": 5412 }, { "epoch": 0.6885892380104313, "ewc_loss": 0.04264633357524872, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016645358118694276, "grad_norm": 5.160905838012695, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8474346399307251, "num_tokens": 206588173.0, "step": 5413 }, { "epoch": 0.6887164482890218, "ewc_loss": 0.04258347675204277, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016582499665673822, "grad_norm": 5.24456787109375, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8575376868247986, "num_tokens": 206624013.0, "step": 5414 }, { "epoch": 0.6888436585676123, "ewc_loss": 0.04265667498111725, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016655698709655553, "grad_norm": 5.115228652954102, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8657025098800659, "num_tokens": 206668180.0, "step": 5415 }, { "epoch": 0.6889708688462027, "ewc_loss": 0.04260259494185448, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001660161797190085, "grad_norm": 5.1755900382995605, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8546102046966553, "num_tokens": 206710619.0, "step": 5416 }, { "epoch": 0.6890980791247933, "ewc_loss": 0.04267860949039459, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001667763281147927, "grad_norm": 5.167952537536621, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.863745391368866, "num_tokens": 206750121.0, "step": 5417 }, { "epoch": 0.6892252894033838, "ewc_loss": 0.042621761560440063, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001662078284425661, "grad_norm": 5.180651664733887, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8698303699493408, "num_tokens": 206789538.0, "step": 5418 }, { "epoch": 0.6893524996819743, "ewc_loss": 0.04265613853931427, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001665516319917515, "grad_norm": 5.189986228942871, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8592613339424133, "num_tokens": 206827513.0, "step": 5419 }, { "epoch": 0.6894797099605648, "ewc_loss": 0.042659446597099304, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001665847230469808, "grad_norm": 5.21718692779541, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8648075461387634, "num_tokens": 206866563.0, "step": 5420 }, { "epoch": 0.6896069202391554, "ewc_loss": 0.042665205895900726, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016664229042362422, "grad_norm": 5.158866882324219, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8697705268859863, "num_tokens": 206906281.0, "step": 5421 }, { "epoch": 0.6897341305177458, "ewc_loss": 0.04261365160346031, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016612675972282887, "grad_norm": 5.2055253982543945, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8400983214378357, "num_tokens": 206944055.0, "step": 5422 }, { "epoch": 0.6898613407963363, "ewc_loss": 0.04269706830382347, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016696091915946454, "grad_norm": 5.2092461585998535, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8595579266548157, "num_tokens": 206984421.0, "step": 5423 }, { "epoch": 0.6899885510749268, "ewc_loss": 0.042592138051986694, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016591159510426223, "grad_norm": 5.185403823852539, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8479690551757812, "num_tokens": 207020319.0, "step": 5424 }, { "epoch": 0.6901157613535174, "ewc_loss": 0.04264570027589798, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001664472365519032, "grad_norm": 5.193535327911377, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8622821569442749, "num_tokens": 207061487.0, "step": 5425 }, { "epoch": 0.6902429716321079, "ewc_loss": 0.04262661933898926, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016625641728751361, "grad_norm": 5.200388431549072, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.860728919506073, "num_tokens": 207098343.0, "step": 5426 }, { "epoch": 0.6903701819106984, "ewc_loss": 0.04262913763523102, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016628163575660437, "grad_norm": 5.184268951416016, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8553966879844666, "num_tokens": 207140381.0, "step": 5427 }, { "epoch": 0.6904973921892888, "ewc_loss": 0.042674873024225235, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016673895879648626, "grad_norm": 5.208932399749756, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8723991513252258, "num_tokens": 207173923.0, "step": 5428 }, { "epoch": 0.6906246024678794, "ewc_loss": 0.04261545091867447, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016614471678622067, "grad_norm": 5.196823596954346, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8658319711685181, "num_tokens": 207214840.0, "step": 5429 }, { "epoch": 0.6907518127464699, "ewc_loss": 0.04262019693851471, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001661922287894413, "grad_norm": 5.208086013793945, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8587203621864319, "num_tokens": 207247850.0, "step": 5430 }, { "epoch": 0.6908790230250604, "ewc_loss": 0.042638737708330154, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016637762018945068, "grad_norm": 5.20301628112793, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.859565019607544, "num_tokens": 207286698.0, "step": 5431 }, { "epoch": 0.691006233303651, "ewc_loss": 0.0426095686852932, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016608591249678284, "grad_norm": 5.1957550048828125, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8542265892028809, "num_tokens": 207322470.0, "step": 5432 }, { "epoch": 0.6911334435822415, "ewc_loss": 0.042640864849090576, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016639889508951455, "grad_norm": 5.232324600219727, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8448392152786255, "num_tokens": 207361517.0, "step": 5433 }, { "epoch": 0.6912606538608319, "ewc_loss": 0.042656563222408295, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016655586659908295, "grad_norm": 5.2393293380737305, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8693763017654419, "num_tokens": 207394504.0, "step": 5434 }, { "epoch": 0.6913878641394224, "ewc_loss": 0.04261869564652443, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016617718210909516, "grad_norm": 5.230485439300537, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8410738110542297, "num_tokens": 207430148.0, "step": 5435 }, { "epoch": 0.691515074418013, "ewc_loss": 0.04258481785655022, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016583841352257878, "grad_norm": 5.246991157531738, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.857527494430542, "num_tokens": 207461094.0, "step": 5436 }, { "epoch": 0.6916422846966035, "ewc_loss": 0.04263286665081978, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016631890321150422, "grad_norm": 5.172448635101318, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8656539916992188, "num_tokens": 207500572.0, "step": 5437 }, { "epoch": 0.691769494975194, "ewc_loss": 0.04258387163281441, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016582895477768034, "grad_norm": 5.268452167510986, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.851784348487854, "num_tokens": 207534977.0, "step": 5438 }, { "epoch": 0.6918967052537845, "ewc_loss": 0.04267371445894241, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001667273900238797, "grad_norm": 5.185892105102539, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8648444414138794, "num_tokens": 207569809.0, "step": 5439 }, { "epoch": 0.692023915532375, "ewc_loss": 0.042566828429698944, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016565850819461048, "grad_norm": 5.223633289337158, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8601661324501038, "num_tokens": 207606785.0, "step": 5440 }, { "epoch": 0.6921511258109655, "ewc_loss": 0.0426555797457695, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016654601495247334, "grad_norm": 5.161006450653076, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8586104512214661, "num_tokens": 207648344.0, "step": 5441 }, { "epoch": 0.692278336089556, "ewc_loss": 0.04270009696483612, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.000165770499734208, "grad_norm": 5.270664691925049, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8673598766326904, "num_tokens": 207680962.0, "step": 5442 }, { "epoch": 0.6924055463681466, "ewc_loss": 0.04265301674604416, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016652040358167142, "grad_norm": 5.201901435852051, "learning_rate": 1e-06, "loss": 0.3521, "mean_token_accuracy": 0.8911187648773193, "num_tokens": 207709162.0, "step": 5443 }, { "epoch": 0.6925327566467371, "ewc_loss": 0.0425838902592659, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001658291439525783, "grad_norm": 5.210660457611084, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8556899428367615, "num_tokens": 207746089.0, "step": 5444 }, { "epoch": 0.6926599669253276, "ewc_loss": 0.042624443769454956, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001662346621742472, "grad_norm": 5.190371513366699, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8703410625457764, "num_tokens": 207791235.0, "step": 5445 }, { "epoch": 0.692787177203918, "ewc_loss": 0.04262648522853851, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016625506395939738, "grad_norm": 5.290065288543701, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8517118692398071, "num_tokens": 207824527.0, "step": 5446 }, { "epoch": 0.6929143874825086, "ewc_loss": 0.04267650097608566, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016675527149345726, "grad_norm": 5.171664714813232, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.869699239730835, "num_tokens": 207863535.0, "step": 5447 }, { "epoch": 0.6930415977610991, "ewc_loss": 0.042617782950401306, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016616808716207743, "grad_norm": 5.221733093261719, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8530312776565552, "num_tokens": 207899694.0, "step": 5448 }, { "epoch": 0.6931688080396896, "ewc_loss": 0.04277123510837555, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016648188466206193, "grad_norm": 5.242512226104736, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8508103489875793, "num_tokens": 207931440.0, "step": 5449 }, { "epoch": 0.6932960183182801, "ewc_loss": 0.04278292506933212, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016659879474900663, "grad_norm": 5.2463178634643555, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8580796718597412, "num_tokens": 207963758.0, "step": 5450 }, { "epoch": 0.6934232285968707, "ewc_loss": 0.042586445808410645, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016585469711571932, "grad_norm": 5.167383193969727, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8642165660858154, "num_tokens": 208002383.0, "step": 5451 }, { "epoch": 0.6935504388754611, "ewc_loss": 0.04261612892150879, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016615152708254755, "grad_norm": 5.222444534301758, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8637248873710632, "num_tokens": 208038723.0, "step": 5452 }, { "epoch": 0.6936776491540516, "ewc_loss": 0.042608968913555145, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001660799462115392, "grad_norm": 5.186115264892578, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.864957332611084, "num_tokens": 208071851.0, "step": 5453 }, { "epoch": 0.6938048594326421, "ewc_loss": 0.042616188526153564, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016615213826298714, "grad_norm": 5.213198661804199, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8415584564208984, "num_tokens": 208106916.0, "step": 5454 }, { "epoch": 0.6939320697112327, "ewc_loss": 0.042633917182683945, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001663294096942991, "grad_norm": 5.200058460235596, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8689889311790466, "num_tokens": 208146883.0, "step": 5455 }, { "epoch": 0.6940592799898232, "ewc_loss": 0.04262522608041763, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016624249110464007, "grad_norm": 5.220247745513916, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8617205619812012, "num_tokens": 208187364.0, "step": 5456 }, { "epoch": 0.6941864902684137, "ewc_loss": 0.04260765016078949, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001660667621763423, "grad_norm": 5.19432258605957, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8662862777709961, "num_tokens": 208226087.0, "step": 5457 }, { "epoch": 0.6943137005470043, "ewc_loss": 0.042580097913742065, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001657912362134084, "grad_norm": 5.227369785308838, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8574546575546265, "num_tokens": 208265193.0, "step": 5458 }, { "epoch": 0.6944409108255947, "ewc_loss": 0.04263487458229065, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016633898485451937, "grad_norm": 5.184329986572266, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8737168908119202, "num_tokens": 208305163.0, "step": 5459 }, { "epoch": 0.6945681211041852, "ewc_loss": 0.04261953756213188, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016618560766801238, "grad_norm": 5.2792158126831055, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8487146496772766, "num_tokens": 208342229.0, "step": 5460 }, { "epoch": 0.6946953313827757, "ewc_loss": 0.04262837395071983, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016627398144919425, "grad_norm": 5.13463020324707, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8504832983016968, "num_tokens": 208384606.0, "step": 5461 }, { "epoch": 0.6948225416613663, "ewc_loss": 0.04256141558289528, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016560438962187618, "grad_norm": 5.2528510093688965, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.87774658203125, "num_tokens": 208416777.0, "step": 5462 }, { "epoch": 0.6949497519399568, "ewc_loss": 0.04269241541624069, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016691441123839468, "grad_norm": 5.222354412078857, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8446704149246216, "num_tokens": 208456101.0, "step": 5463 }, { "epoch": 0.6950769622185473, "ewc_loss": 0.04261183738708496, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016610859893262386, "grad_norm": 5.249607086181641, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.868820071220398, "num_tokens": 208489201.0, "step": 5464 }, { "epoch": 0.6952041724971377, "ewc_loss": 0.04263865202665329, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016637677617836744, "grad_norm": 5.20955753326416, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8743480443954468, "num_tokens": 208525006.0, "step": 5465 }, { "epoch": 0.6953313827757283, "ewc_loss": 0.04260725528001785, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001660628040554002, "grad_norm": 5.189778804779053, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8563144207000732, "num_tokens": 208568997.0, "step": 5466 }, { "epoch": 0.6954585930543188, "ewc_loss": 0.04261476546525955, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001661378628341481, "grad_norm": 5.232118606567383, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8507685661315918, "num_tokens": 208605925.0, "step": 5467 }, { "epoch": 0.6955858033329093, "ewc_loss": 0.04263736307621002, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016636383952572942, "grad_norm": 5.225569248199463, "learning_rate": 1e-06, "loss": 0.5479, "mean_token_accuracy": 0.8343147039413452, "num_tokens": 208646290.0, "step": 5468 }, { "epoch": 0.6957130136114998, "ewc_loss": 0.042609307914972305, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001660833222558722, "grad_norm": 5.164570331573486, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8601748943328857, "num_tokens": 208686424.0, "step": 5469 }, { "epoch": 0.6958402238900904, "ewc_loss": 0.042645812034606934, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.000166448371601291, "grad_norm": 5.262602806091309, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8616966009140015, "num_tokens": 208719643.0, "step": 5470 }, { "epoch": 0.6959674341686808, "ewc_loss": 0.04265906661748886, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016658089589327574, "grad_norm": 5.2015228271484375, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8440986275672913, "num_tokens": 208760804.0, "step": 5471 }, { "epoch": 0.6960946444472713, "ewc_loss": 0.04265552759170532, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001665455347392708, "grad_norm": 5.180618762969971, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8551433086395264, "num_tokens": 208803022.0, "step": 5472 }, { "epoch": 0.6962218547258618, "ewc_loss": 0.04269097000360489, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001668999611865729, "grad_norm": 5.201588153839111, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.860374927520752, "num_tokens": 208844205.0, "step": 5473 }, { "epoch": 0.6963490650044524, "ewc_loss": 0.04268177971243858, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001668080221861601, "grad_norm": 5.182861804962158, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8719439506530762, "num_tokens": 208886882.0, "step": 5474 }, { "epoch": 0.6964762752830429, "ewc_loss": 0.04270131513476372, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001670033816481009, "grad_norm": 5.255993366241455, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.856683611869812, "num_tokens": 208926649.0, "step": 5475 }, { "epoch": 0.6966034855616334, "ewc_loss": 0.042708974331617355, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001670799683779478, "grad_norm": 5.202885627746582, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8534660339355469, "num_tokens": 208963403.0, "step": 5476 }, { "epoch": 0.6967306958402238, "ewc_loss": 0.04269055277109146, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016689578478690237, "grad_norm": 5.271214962005615, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8523354530334473, "num_tokens": 209002475.0, "step": 5477 }, { "epoch": 0.6968579061188144, "ewc_loss": 0.04267166554927826, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016670690092723817, "grad_norm": 5.184581756591797, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8598395586013794, "num_tokens": 209044216.0, "step": 5478 }, { "epoch": 0.6969851163974049, "ewc_loss": 0.042669717222452164, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016668740136083215, "grad_norm": 5.269233703613281, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8494860529899597, "num_tokens": 209080774.0, "step": 5479 }, { "epoch": 0.6971123266759954, "ewc_loss": 0.04274871200323105, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016747736663091928, "grad_norm": 5.269887924194336, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8651527166366577, "num_tokens": 209117167.0, "step": 5480 }, { "epoch": 0.697239536954586, "ewc_loss": 0.0426890105009079, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016688034520484507, "grad_norm": 5.3112592697143555, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8544789552688599, "num_tokens": 209148872.0, "step": 5481 }, { "epoch": 0.6973667472331765, "ewc_loss": 0.04273717850446701, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016736201359890401, "grad_norm": 5.242493629455566, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8605747222900391, "num_tokens": 209187912.0, "step": 5482 }, { "epoch": 0.6974939575117669, "ewc_loss": 0.042730629444122314, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001672965445322916, "grad_norm": 5.260700225830078, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8659151792526245, "num_tokens": 209227055.0, "step": 5483 }, { "epoch": 0.6976211677903574, "ewc_loss": 0.0427425354719162, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001674155646469444, "grad_norm": 5.323410987854004, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.873367428779602, "num_tokens": 209255343.0, "step": 5484 }, { "epoch": 0.697748378068948, "ewc_loss": 0.042758308351039886, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016757333651185036, "grad_norm": 5.238813877105713, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8532580137252808, "num_tokens": 209294621.0, "step": 5485 }, { "epoch": 0.6978755883475385, "ewc_loss": 0.04267473518848419, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016673756181262434, "grad_norm": 5.180312633514404, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8752086758613586, "num_tokens": 209336519.0, "step": 5486 }, { "epoch": 0.698002798626129, "ewc_loss": 0.042728763073682785, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001672778744250536, "grad_norm": 5.267606258392334, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8569223880767822, "num_tokens": 209372913.0, "step": 5487 }, { "epoch": 0.6981300089047195, "ewc_loss": 0.04275645688176155, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016755481192376465, "grad_norm": 5.179919719696045, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8518166542053223, "num_tokens": 209417768.0, "step": 5488 }, { "epoch": 0.69825721918331, "ewc_loss": 0.042728498578071594, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001672752114245668, "grad_norm": 5.300550937652588, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8619140386581421, "num_tokens": 209451221.0, "step": 5489 }, { "epoch": 0.6983844294619005, "ewc_loss": 0.04283950477838516, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016838528972584754, "grad_norm": 5.216589450836182, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8587285280227661, "num_tokens": 209488710.0, "step": 5490 }, { "epoch": 0.698511639740491, "ewc_loss": 0.04276198893785477, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016761012375354767, "grad_norm": 5.272157192230225, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8558517694473267, "num_tokens": 209525166.0, "step": 5491 }, { "epoch": 0.6986388500190815, "ewc_loss": 0.042907971888780594, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.0001678492408245802, "grad_norm": 5.218993663787842, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8614228963851929, "num_tokens": 209564804.0, "step": 5492 }, { "epoch": 0.6987660602976721, "ewc_loss": 0.04288420453667641, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.0001676115789450705, "grad_norm": 5.2100701332092285, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8629759550094604, "num_tokens": 209607155.0, "step": 5493 }, { "epoch": 0.6988932705762626, "ewc_loss": 0.042879845947027206, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016756799595896155, "grad_norm": 5.312216281890869, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8496034741401672, "num_tokens": 209641226.0, "step": 5494 }, { "epoch": 0.699020480854853, "ewc_loss": 0.042927466332912445, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016804422193672508, "grad_norm": 5.198284149169922, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8562490940093994, "num_tokens": 209679180.0, "step": 5495 }, { "epoch": 0.6991476911334435, "ewc_loss": 0.042877744883298874, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016754698299337178, "grad_norm": 5.217027187347412, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8519750833511353, "num_tokens": 209716412.0, "step": 5496 }, { "epoch": 0.6992749014120341, "ewc_loss": 0.04280360788106918, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016802630852907896, "grad_norm": 5.224592685699463, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8506300449371338, "num_tokens": 209755697.0, "step": 5497 }, { "epoch": 0.6994021116906246, "ewc_loss": 0.04287499561905861, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016874019638635218, "grad_norm": 5.241575241088867, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8660068511962891, "num_tokens": 209793654.0, "step": 5498 }, { "epoch": 0.6995293219692151, "ewc_loss": 0.04276544600725174, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001676446700002998, "grad_norm": 5.261416912078857, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.857567548751831, "num_tokens": 209830703.0, "step": 5499 }, { "epoch": 0.6996565322478057, "ewc_loss": 0.04279857873916626, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016797604621388018, "grad_norm": 5.2337212562561035, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8586618304252625, "num_tokens": 209868994.0, "step": 5500 }, { "epoch": 0.6997837425263961, "ewc_loss": 0.04274754226207733, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016746565233916044, "grad_norm": 5.263142108917236, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8686068058013916, "num_tokens": 209909605.0, "step": 5501 }, { "epoch": 0.6999109528049866, "ewc_loss": 0.04277835786342621, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001677737891441211, "grad_norm": 5.259526252746582, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8658549785614014, "num_tokens": 209945196.0, "step": 5502 }, { "epoch": 0.7000381630835771, "ewc_loss": 0.04278168827295303, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016780711302999407, "grad_norm": 5.239598274230957, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8587937951087952, "num_tokens": 209988585.0, "step": 5503 }, { "epoch": 0.7001653733621677, "ewc_loss": 0.04275049269199371, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016749516362324357, "grad_norm": 5.22409200668335, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8596044778823853, "num_tokens": 210032084.0, "step": 5504 }, { "epoch": 0.7002925836407582, "ewc_loss": 0.042747072875499725, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016746093751862645, "grad_norm": 5.241667747497559, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8496506214141846, "num_tokens": 210070639.0, "step": 5505 }, { "epoch": 0.7004197939193487, "ewc_loss": 0.04273208603262901, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016731109644751996, "grad_norm": 5.201488494873047, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8677126169204712, "num_tokens": 210113793.0, "step": 5506 }, { "epoch": 0.7005470041979391, "ewc_loss": 0.04273761436343193, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016736637917347252, "grad_norm": 5.293203830718994, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8514297008514404, "num_tokens": 210153517.0, "step": 5507 }, { "epoch": 0.7006742144765297, "ewc_loss": 0.04277334734797478, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001677237160038203, "grad_norm": 5.321560382843018, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8540723919868469, "num_tokens": 210186818.0, "step": 5508 }, { "epoch": 0.7008014247551202, "ewc_loss": 0.04268595576286316, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001668497861828655, "grad_norm": 5.234135627746582, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.851844310760498, "num_tokens": 210224283.0, "step": 5509 }, { "epoch": 0.7009286350337107, "ewc_loss": 0.04269859567284584, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001669761841185391, "grad_norm": 5.265103816986084, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8553065061569214, "num_tokens": 210263439.0, "step": 5510 }, { "epoch": 0.7010558453123013, "ewc_loss": 0.04271087795495987, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016709903138689697, "grad_norm": 5.185418128967285, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8606972694396973, "num_tokens": 210305887.0, "step": 5511 }, { "epoch": 0.7011830555908918, "ewc_loss": 0.04274724796414375, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001674627128522843, "grad_norm": 5.274371147155762, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8587967753410339, "num_tokens": 210340118.0, "step": 5512 }, { "epoch": 0.7013102658694823, "ewc_loss": 0.04278739541769028, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016786421474535018, "grad_norm": 5.213665008544922, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8600003719329834, "num_tokens": 210376098.0, "step": 5513 }, { "epoch": 0.7014374761480727, "ewc_loss": 0.042699187994003296, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016698210674803704, "grad_norm": 5.220715522766113, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8632254600524902, "num_tokens": 210418801.0, "step": 5514 }, { "epoch": 0.7015646864266633, "ewc_loss": 0.04278607666492462, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016785101615823805, "grad_norm": 5.2986650466918945, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8482560515403748, "num_tokens": 210452392.0, "step": 5515 }, { "epoch": 0.7016918967052538, "ewc_loss": 0.04277745634317398, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016776479606050998, "grad_norm": 5.2307209968566895, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8669230937957764, "num_tokens": 210489697.0, "step": 5516 }, { "epoch": 0.7018191069838443, "ewc_loss": 0.04275064915418625, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016749670612625778, "grad_norm": 5.222342491149902, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8435024619102478, "num_tokens": 210530212.0, "step": 5517 }, { "epoch": 0.7019463172624348, "ewc_loss": 0.04277653247117996, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016775555559433997, "grad_norm": 5.2647480964660645, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8658272624015808, "num_tokens": 210564974.0, "step": 5518 }, { "epoch": 0.7020735275410254, "ewc_loss": 0.0427989587187767, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016797982971183956, "grad_norm": 5.263949871063232, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8600859642028809, "num_tokens": 210601272.0, "step": 5519 }, { "epoch": 0.7022007378196158, "ewc_loss": 0.042803872376680374, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016802895697765052, "grad_norm": 5.273077487945557, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8620024919509888, "num_tokens": 210636908.0, "step": 5520 }, { "epoch": 0.7023279480982063, "ewc_loss": 0.04278619959950447, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016785223851911724, "grad_norm": 5.299032211303711, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8561866283416748, "num_tokens": 210672629.0, "step": 5521 }, { "epoch": 0.7024551583767968, "ewc_loss": 0.04271836578845978, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016717390099074692, "grad_norm": 5.254083156585693, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8740963339805603, "num_tokens": 210703531.0, "step": 5522 }, { "epoch": 0.7025823686553874, "ewc_loss": 0.04276411980390549, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016763144230935723, "grad_norm": 5.273824691772461, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8726220726966858, "num_tokens": 210742946.0, "step": 5523 }, { "epoch": 0.7027095789339779, "ewc_loss": 0.042719997465610504, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016719022823963314, "grad_norm": 5.218209266662598, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8689300417900085, "num_tokens": 210778345.0, "step": 5524 }, { "epoch": 0.7028367892125684, "ewc_loss": 0.04271113872528076, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001671016070758924, "grad_norm": 5.246738433837891, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8657195568084717, "num_tokens": 210821544.0, "step": 5525 }, { "epoch": 0.7029639994911588, "ewc_loss": 0.04274269938468933, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016741723811719567, "grad_norm": 5.259296417236328, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8644437789916992, "num_tokens": 210858060.0, "step": 5526 }, { "epoch": 0.7030912097697494, "ewc_loss": 0.042708903551101685, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016707926988601685, "grad_norm": 5.257900238037109, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8542895317077637, "num_tokens": 210900365.0, "step": 5527 }, { "epoch": 0.7032184200483399, "ewc_loss": 0.042850326746702194, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.0001672727958066389, "grad_norm": 5.217004776000977, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8547960519790649, "num_tokens": 210941118.0, "step": 5528 }, { "epoch": 0.7033456303269304, "ewc_loss": 0.04285099357366562, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016727947513572872, "grad_norm": 5.200438022613525, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8665193319320679, "num_tokens": 210979865.0, "step": 5529 }, { "epoch": 0.703472840605521, "ewc_loss": 0.042770467698574066, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001676949323154986, "grad_norm": 5.239782810211182, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8596529364585876, "num_tokens": 211016385.0, "step": 5530 }, { "epoch": 0.7036000508841115, "ewc_loss": 0.04279051721096039, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016789538494776934, "grad_norm": 5.247272491455078, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8476465940475464, "num_tokens": 211059071.0, "step": 5531 }, { "epoch": 0.7037272611627019, "ewc_loss": 0.04285498708486557, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.0001673193764872849, "grad_norm": 5.223349571228027, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8629509210586548, "num_tokens": 211094189.0, "step": 5532 }, { "epoch": 0.7038544714412924, "ewc_loss": 0.042938947677612305, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016815902199596167, "grad_norm": 5.269806385040283, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8739509582519531, "num_tokens": 211130524.0, "step": 5533 }, { "epoch": 0.703981681719883, "ewc_loss": 0.042905282229185104, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016782234888523817, "grad_norm": 5.19661808013916, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8527392745018005, "num_tokens": 211172335.0, "step": 5534 }, { "epoch": 0.7041088919984735, "ewc_loss": 0.0429178886115551, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016794842667877674, "grad_norm": 5.325311183929443, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.861215353012085, "num_tokens": 211205406.0, "step": 5535 }, { "epoch": 0.704236102277064, "ewc_loss": 0.04282345622777939, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001682248112047091, "grad_norm": 5.230581760406494, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8552099466323853, "num_tokens": 211246443.0, "step": 5536 }, { "epoch": 0.7043633125556545, "ewc_loss": 0.04277351498603821, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016772537492215633, "grad_norm": 5.211732387542725, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8735495805740356, "num_tokens": 211286707.0, "step": 5537 }, { "epoch": 0.704490522834245, "ewc_loss": 0.042821384966373444, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001682040747255087, "grad_norm": 5.2683820724487305, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8697498440742493, "num_tokens": 211319448.0, "step": 5538 }, { "epoch": 0.7046177331128355, "ewc_loss": 0.04279740899801254, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016796431737020612, "grad_norm": 5.215631484985352, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8546804189682007, "num_tokens": 211363747.0, "step": 5539 }, { "epoch": 0.704744943391426, "ewc_loss": 0.04280334711074829, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001680237182881683, "grad_norm": 5.25493049621582, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8655303120613098, "num_tokens": 211408080.0, "step": 5540 }, { "epoch": 0.7048721536700165, "ewc_loss": 0.042838066816329956, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016837088332977146, "grad_norm": 5.219913005828857, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8497481346130371, "num_tokens": 211445656.0, "step": 5541 }, { "epoch": 0.7049993639486071, "ewc_loss": 0.04280626028776169, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016805283667054027, "grad_norm": 5.261544704437256, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8686606884002686, "num_tokens": 211484342.0, "step": 5542 }, { "epoch": 0.7051265742271976, "ewc_loss": 0.04285065457224846, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016849677194841206, "grad_norm": 5.246955394744873, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8540210127830505, "num_tokens": 211518535.0, "step": 5543 }, { "epoch": 0.705253784505788, "ewc_loss": 0.04283633083105087, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016835355199873447, "grad_norm": 5.243347644805908, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8535748720169067, "num_tokens": 211557236.0, "step": 5544 }, { "epoch": 0.7053809947843785, "ewc_loss": 0.04286152869462967, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016860553296282887, "grad_norm": 5.217014789581299, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8705118894577026, "num_tokens": 211599216.0, "step": 5545 }, { "epoch": 0.7055082050629691, "ewc_loss": 0.0428328737616539, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016831896209623665, "grad_norm": 5.256842136383057, "learning_rate": 1e-06, "loss": 0.5091, "mean_token_accuracy": 0.8409788608551025, "num_tokens": 211638930.0, "step": 5546 }, { "epoch": 0.7056354153415596, "ewc_loss": 0.04279955476522446, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016798578144516796, "grad_norm": 5.211558818817139, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8526268005371094, "num_tokens": 211680765.0, "step": 5547 }, { "epoch": 0.7057626256201501, "ewc_loss": 0.04281998798251152, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016819011943880469, "grad_norm": 5.210888862609863, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8739223480224609, "num_tokens": 211719037.0, "step": 5548 }, { "epoch": 0.7058898358987407, "ewc_loss": 0.04283265024423599, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016831675020512193, "grad_norm": 5.259043216705322, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.854163408279419, "num_tokens": 211759655.0, "step": 5549 }, { "epoch": 0.7060170461773311, "ewc_loss": 0.042849235236644745, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016848256927914917, "grad_norm": 5.214137077331543, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.8416321277618408, "num_tokens": 211800405.0, "step": 5550 }, { "epoch": 0.7061442564559216, "ewc_loss": 0.04283799231052399, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016837015573401004, "grad_norm": 5.24982213973999, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8420858383178711, "num_tokens": 211846057.0, "step": 5551 }, { "epoch": 0.7062714667345121, "ewc_loss": 0.042887791991233826, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001688681513769552, "grad_norm": 5.262376308441162, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8566523194313049, "num_tokens": 211885944.0, "step": 5552 }, { "epoch": 0.7063986770131027, "ewc_loss": 0.042887140065431595, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001688616321189329, "grad_norm": 5.189603805541992, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8637239336967468, "num_tokens": 211928727.0, "step": 5553 }, { "epoch": 0.7065258872916932, "ewc_loss": 0.04303335025906563, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016910303384065628, "grad_norm": 5.276865005493164, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8500116467475891, "num_tokens": 211967593.0, "step": 5554 }, { "epoch": 0.7066530975702837, "ewc_loss": 0.04305132478475571, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.0001692827936494723, "grad_norm": 5.292945384979248, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8544766902923584, "num_tokens": 212001023.0, "step": 5555 }, { "epoch": 0.7067803078488741, "ewc_loss": 0.043062277138233185, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016939228225965053, "grad_norm": 5.221638202667236, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8507099151611328, "num_tokens": 212045611.0, "step": 5556 }, { "epoch": 0.7069075181274647, "ewc_loss": 0.042923372238874435, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.000169223960256204, "grad_norm": 5.246471881866455, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8544620275497437, "num_tokens": 212087167.0, "step": 5557 }, { "epoch": 0.7070347284060552, "ewc_loss": 0.04304799810051918, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016924951341934502, "grad_norm": 10.34942626953125, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8607298135757446, "num_tokens": 212124254.0, "step": 5558 }, { "epoch": 0.7071619386846457, "ewc_loss": 0.04900273308157921, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00022757616534363478, "grad_norm": 6.1681623458862305, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8535898327827454, "num_tokens": 212162706.0, "step": 5559 }, { "epoch": 0.7072891489632362, "ewc_loss": 0.041971854865550995, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00015726737910881639, "grad_norm": 4.864704132080078, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8591259717941284, "num_tokens": 212197359.0, "step": 5560 }, { "epoch": 0.7074163592418268, "ewc_loss": 0.04395841062068939, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017957434465643018, "grad_norm": 5.67367696762085, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8666428327560425, "num_tokens": 212237300.0, "step": 5561 }, { "epoch": 0.7075435695204173, "ewc_loss": 0.04367809742689133, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017677120922598988, "grad_norm": 5.181343078613281, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8399840593338013, "num_tokens": 212283472.0, "step": 5562 }, { "epoch": 0.7076707797990077, "ewc_loss": 0.04322336986660957, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001722239248920232, "grad_norm": 5.429670810699463, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8546042442321777, "num_tokens": 212321433.0, "step": 5563 }, { "epoch": 0.7077979900775982, "ewc_loss": 0.043488118797540665, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001748714130371809, "grad_norm": 5.314827919006348, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8572941422462463, "num_tokens": 212362234.0, "step": 5564 }, { "epoch": 0.7079252003561888, "ewc_loss": 0.04315996170043945, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017158982518594712, "grad_norm": 5.4233174324035645, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8489366769790649, "num_tokens": 212399259.0, "step": 5565 }, { "epoch": 0.7080524106347793, "ewc_loss": 0.043306615203619, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017305638175457716, "grad_norm": 5.303403377532959, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8589655160903931, "num_tokens": 212438489.0, "step": 5566 }, { "epoch": 0.7081796209133698, "ewc_loss": 0.04303431510925293, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017033341282512993, "grad_norm": 5.271255016326904, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8509129881858826, "num_tokens": 212482283.0, "step": 5567 }, { "epoch": 0.7083068311919604, "ewc_loss": 0.043098486959934235, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017097507952712476, "grad_norm": 5.326991558074951, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8543570637702942, "num_tokens": 212522749.0, "step": 5568 }, { "epoch": 0.7084340414705508, "ewc_loss": 0.04301384836435318, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017012874013744295, "grad_norm": 5.301992893218994, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8508807420730591, "num_tokens": 212559937.0, "step": 5569 }, { "epoch": 0.7085612517491413, "ewc_loss": 0.04299210011959076, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016991123266052455, "grad_norm": 5.340950965881348, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8638814687728882, "num_tokens": 212595578.0, "step": 5570 }, { "epoch": 0.7086884620277318, "ewc_loss": 0.04297845810651779, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001697748084552586, "grad_norm": 5.299800395965576, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.869522213935852, "num_tokens": 212634085.0, "step": 5571 }, { "epoch": 0.7088156723063224, "ewc_loss": 0.04288540035486221, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016884422802831978, "grad_norm": 5.297574520111084, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8481887578964233, "num_tokens": 212673557.0, "step": 5572 }, { "epoch": 0.7089428825849129, "ewc_loss": 0.04294204339385033, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016941066132858396, "grad_norm": 5.280723571777344, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8692684173583984, "num_tokens": 212711142.0, "step": 5573 }, { "epoch": 0.7090700928635034, "ewc_loss": 0.04289345443248749, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001689247728791088, "grad_norm": 5.3294830322265625, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8588363528251648, "num_tokens": 212745570.0, "step": 5574 }, { "epoch": 0.7091973031420938, "ewc_loss": 0.04298161715269089, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001698064006632194, "grad_norm": 5.318304538726807, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8616042137145996, "num_tokens": 212783048.0, "step": 5575 }, { "epoch": 0.7093245134206844, "ewc_loss": 0.04288904368877411, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016888068057596684, "grad_norm": 5.240208148956299, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8765658140182495, "num_tokens": 212821231.0, "step": 5576 }, { "epoch": 0.7094517236992749, "ewc_loss": 0.04303876310586929, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016915718151722103, "grad_norm": 5.30801248550415, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8570605516433716, "num_tokens": 212861715.0, "step": 5577 }, { "epoch": 0.7095789339778654, "ewc_loss": 0.042944349348545074, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001694337115623057, "grad_norm": 5.357118606567383, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8495355844497681, "num_tokens": 212900655.0, "step": 5578 }, { "epoch": 0.709706144256456, "ewc_loss": 0.04288138821721077, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016880410839803517, "grad_norm": 5.310795783996582, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8510959148406982, "num_tokens": 212939308.0, "step": 5579 }, { "epoch": 0.7098333545350465, "ewc_loss": 0.042898401618003845, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016897426394280046, "grad_norm": 5.262633800506592, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8625718951225281, "num_tokens": 212981174.0, "step": 5580 }, { "epoch": 0.7099605648136369, "ewc_loss": 0.04286925494670868, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016868277452886105, "grad_norm": 5.287601947784424, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.869127631187439, "num_tokens": 213016401.0, "step": 5581 }, { "epoch": 0.7100877750922274, "ewc_loss": 0.042881473898887634, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001688049960648641, "grad_norm": 5.273917198181152, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8464895486831665, "num_tokens": 213058480.0, "step": 5582 }, { "epoch": 0.710214985370818, "ewc_loss": 0.04299069941043854, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016867650265339762, "grad_norm": 5.268240928649902, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8703706860542297, "num_tokens": 213093527.0, "step": 5583 }, { "epoch": 0.7103421956494085, "ewc_loss": 0.04290522634983063, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001690425124252215, "grad_norm": 5.276546001434326, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.856541633605957, "num_tokens": 213126778.0, "step": 5584 }, { "epoch": 0.710469405927999, "ewc_loss": 0.042969442903995514, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001696846738923341, "grad_norm": 5.2074875831604, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8621608018875122, "num_tokens": 213170863.0, "step": 5585 }, { "epoch": 0.7105966162065895, "ewc_loss": 0.04290942847728729, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016908449470065534, "grad_norm": 5.2832512855529785, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8352442979812622, "num_tokens": 213211336.0, "step": 5586 }, { "epoch": 0.71072382648518, "ewc_loss": 0.04297351837158203, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016972543380688876, "grad_norm": 5.272019386291504, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8653882145881653, "num_tokens": 213245262.0, "step": 5587 }, { "epoch": 0.7108510367637705, "ewc_loss": 0.042981620877981186, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016980644431896508, "grad_norm": 5.286431312561035, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8498360514640808, "num_tokens": 213282150.0, "step": 5588 }, { "epoch": 0.710978247042361, "ewc_loss": 0.04299544915556908, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016994471661746502, "grad_norm": 5.2141828536987305, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8640040755271912, "num_tokens": 213325573.0, "step": 5589 }, { "epoch": 0.7111054573209515, "ewc_loss": 0.04297932982444763, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016978353960439563, "grad_norm": 5.3378586769104, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8669363260269165, "num_tokens": 213363738.0, "step": 5590 }, { "epoch": 0.7112326675995421, "ewc_loss": 0.04302153363823891, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017020557424984872, "grad_norm": 5.212493419647217, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8645379543304443, "num_tokens": 213404796.0, "step": 5591 }, { "epoch": 0.7113598778781326, "ewc_loss": 0.04296091943979263, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016959944332484156, "grad_norm": 5.302739143371582, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.851285457611084, "num_tokens": 213436711.0, "step": 5592 }, { "epoch": 0.711487088156723, "ewc_loss": 0.043070655316114426, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017069678870029747, "grad_norm": 5.25653600692749, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8620760440826416, "num_tokens": 213473458.0, "step": 5593 }, { "epoch": 0.7116142984353135, "ewc_loss": 0.04301326349377632, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017012287571560591, "grad_norm": 5.29244327545166, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8685468435287476, "num_tokens": 213507355.0, "step": 5594 }, { "epoch": 0.7117415087139041, "ewc_loss": 0.04305211082100868, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017051133909262717, "grad_norm": 5.320356369018555, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8417207598686218, "num_tokens": 213543771.0, "step": 5595 }, { "epoch": 0.7118687189924946, "ewc_loss": 0.04305196553468704, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017050989845301956, "grad_norm": 5.258433818817139, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.855782151222229, "num_tokens": 213578317.0, "step": 5596 }, { "epoch": 0.7119959292710851, "ewc_loss": 0.043001268059015274, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001700029242783785, "grad_norm": 5.271946430206299, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.855937659740448, "num_tokens": 213615795.0, "step": 5597 }, { "epoch": 0.7121231395496757, "ewc_loss": 0.043043822050094604, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017042848048731685, "grad_norm": 5.235614776611328, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8687903881072998, "num_tokens": 213655452.0, "step": 5598 }, { "epoch": 0.7122503498282661, "ewc_loss": 0.043061159551143646, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017060183745343238, "grad_norm": 5.371890068054199, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8495584726333618, "num_tokens": 213688079.0, "step": 5599 }, { "epoch": 0.7123775601068566, "ewc_loss": 0.043064311146736145, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017063337145373225, "grad_norm": 5.2103047370910645, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8757152557373047, "num_tokens": 213725310.0, "step": 5600 }, { "epoch": 0.7125047703854471, "ewc_loss": 0.04301004856824875, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017009074508678168, "grad_norm": 5.300591945648193, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8532795906066895, "num_tokens": 213757950.0, "step": 5601 }, { "epoch": 0.7126319806640377, "ewc_loss": 0.0430896133184433, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001708863564999774, "grad_norm": 5.298056125640869, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8464218974113464, "num_tokens": 213794146.0, "step": 5602 }, { "epoch": 0.7127591909426282, "ewc_loss": 0.0430276021361351, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.000170266255736351, "grad_norm": 5.239252090454102, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8633527755737305, "num_tokens": 213834922.0, "step": 5603 }, { "epoch": 0.7128864012212187, "ewc_loss": 0.04331105202436447, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017065937572624534, "grad_norm": 5.20537805557251, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8772815465927124, "num_tokens": 213879651.0, "step": 5604 }, { "epoch": 0.7130136114998091, "ewc_loss": 0.04303191602230072, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017030937306117266, "grad_norm": 5.2554473876953125, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8584306240081787, "num_tokens": 213921833.0, "step": 5605 }, { "epoch": 0.7131408217783997, "ewc_loss": 0.04307204484939575, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001707106566755101, "grad_norm": 5.221866607666016, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8646672368049622, "num_tokens": 213962909.0, "step": 5606 }, { "epoch": 0.7132680320569902, "ewc_loss": 0.04307924956083298, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017078270320780575, "grad_norm": 5.266300678253174, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8614888191223145, "num_tokens": 214002030.0, "step": 5607 }, { "epoch": 0.7133952423355807, "ewc_loss": 0.04309172183275223, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001709074422251433, "grad_norm": 5.290736198425293, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.852196991443634, "num_tokens": 214037094.0, "step": 5608 }, { "epoch": 0.7135224526141712, "ewc_loss": 0.04302167892456055, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001702070439932868, "grad_norm": 5.266306400299072, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8631281852722168, "num_tokens": 214076649.0, "step": 5609 }, { "epoch": 0.7136496628927618, "ewc_loss": 0.04306349158287048, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017062516417354345, "grad_norm": 5.252044677734375, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8543961048126221, "num_tokens": 214114194.0, "step": 5610 }, { "epoch": 0.7137768731713523, "ewc_loss": 0.043023958802223206, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017022980318870395, "grad_norm": 5.302082538604736, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8605673313140869, "num_tokens": 214149652.0, "step": 5611 }, { "epoch": 0.7139040834499427, "ewc_loss": 0.04305244982242584, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017051471513696015, "grad_norm": 5.230446815490723, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8681255578994751, "num_tokens": 214189068.0, "step": 5612 }, { "epoch": 0.7140312937285332, "ewc_loss": 0.04331151396036148, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017066395957954228, "grad_norm": 5.361782073974609, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8473911285400391, "num_tokens": 214220251.0, "step": 5613 }, { "epoch": 0.7141585040071238, "ewc_loss": 0.04330873489379883, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001706361654214561, "grad_norm": 5.295477390289307, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8561569452285767, "num_tokens": 214256705.0, "step": 5614 }, { "epoch": 0.7142857142857143, "ewc_loss": 0.043283700942993164, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017038581427186728, "grad_norm": 5.2644476890563965, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8723196983337402, "num_tokens": 214292563.0, "step": 5615 }, { "epoch": 0.7144129245643048, "ewc_loss": 0.043320849537849426, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017075732466764748, "grad_norm": 5.270201683044434, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8523272275924683, "num_tokens": 214333843.0, "step": 5616 }, { "epoch": 0.7145401348428954, "ewc_loss": 0.043266408145427704, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017021292296703905, "grad_norm": 5.265570640563965, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8703938126564026, "num_tokens": 214372272.0, "step": 5617 }, { "epoch": 0.7146673451214858, "ewc_loss": 0.04301072657108307, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017009748262353241, "grad_norm": 5.201002597808838, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8649452924728394, "num_tokens": 214416364.0, "step": 5618 }, { "epoch": 0.7147945554000763, "ewc_loss": 0.04303757846355438, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001703660236671567, "grad_norm": 5.322942733764648, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8570667505264282, "num_tokens": 214453102.0, "step": 5619 }, { "epoch": 0.7149217656786668, "ewc_loss": 0.0430733785033226, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017072401533368975, "grad_norm": 5.252065181732178, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.863470196723938, "num_tokens": 214492374.0, "step": 5620 }, { "epoch": 0.7150489759572574, "ewc_loss": 0.04301679879426956, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017015822231769562, "grad_norm": 5.311556339263916, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8593674898147583, "num_tokens": 214530918.0, "step": 5621 }, { "epoch": 0.7151761862358479, "ewc_loss": 0.0430787019431591, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017077726079151034, "grad_norm": 5.270460605621338, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8748855590820312, "num_tokens": 214570227.0, "step": 5622 }, { "epoch": 0.7153033965144384, "ewc_loss": 0.04297780245542526, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016976827464532107, "grad_norm": 5.273107051849365, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8622050285339355, "num_tokens": 214609519.0, "step": 5623 }, { "epoch": 0.7154306067930288, "ewc_loss": 0.04306236654520035, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017061388643924147, "grad_norm": 5.349304676055908, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8568565845489502, "num_tokens": 214643564.0, "step": 5624 }, { "epoch": 0.7155578170716194, "ewc_loss": 0.0432504266500473, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017005308473017067, "grad_norm": 5.30216121673584, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8673746585845947, "num_tokens": 214677632.0, "step": 5625 }, { "epoch": 0.7156850273502099, "ewc_loss": 0.043014250695705414, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017013272736221552, "grad_norm": 5.34006929397583, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8608560562133789, "num_tokens": 214710363.0, "step": 5626 }, { "epoch": 0.7158122376288004, "ewc_loss": 0.042972102761268616, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001697112893452868, "grad_norm": 5.307326793670654, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8669975996017456, "num_tokens": 214746982.0, "step": 5627 }, { "epoch": 0.715939447907391, "ewc_loss": 0.042985472828149796, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016984496323857456, "grad_norm": 5.297261714935303, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8499785661697388, "num_tokens": 214786447.0, "step": 5628 }, { "epoch": 0.7160666581859815, "ewc_loss": 0.042965542525053024, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016964566020760685, "grad_norm": 5.295085906982422, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8668245077133179, "num_tokens": 214824627.0, "step": 5629 }, { "epoch": 0.7161938684645719, "ewc_loss": 0.0429687574505806, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00016967779083643109, "grad_norm": 5.299890518188477, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8406808376312256, "num_tokens": 214863051.0, "step": 5630 }, { "epoch": 0.7163210787431624, "ewc_loss": 0.0430084764957428, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.0001700749999145046, "grad_norm": 5.3744659423828125, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8465401530265808, "num_tokens": 214900524.0, "step": 5631 }, { "epoch": 0.716448289021753, "ewc_loss": 0.043023645877838135, "ewc_loss_diag": 2.5987625122070312e-05, "ewc_loss_parallel": 0.00017022671818267554, "grad_norm": 5.273975849151611, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.865547776222229, "num_tokens": 214943949.0, "step": 5632 }, { "epoch": 0.7165754993003435, "ewc_loss": 0.04311864450573921, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016995597979985178, "grad_norm": 5.345844268798828, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8629209995269775, "num_tokens": 214979126.0, "step": 5633 }, { "epoch": 0.716702709578934, "ewc_loss": 0.04313983768224716, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017016788478940725, "grad_norm": 5.273799419403076, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8606990575790405, "num_tokens": 215017740.0, "step": 5634 }, { "epoch": 0.7168299198575245, "ewc_loss": 0.0431114062666893, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016988359857350588, "grad_norm": 5.314408779144287, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8672192692756653, "num_tokens": 215054472.0, "step": 5635 }, { "epoch": 0.716957130136115, "ewc_loss": 0.04314802587032318, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017024979752022773, "grad_norm": 5.312090873718262, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8624442219734192, "num_tokens": 215086894.0, "step": 5636 }, { "epoch": 0.7170843404147055, "ewc_loss": 0.04309064522385597, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016967598639894277, "grad_norm": 5.293736934661865, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8604307174682617, "num_tokens": 215128157.0, "step": 5637 }, { "epoch": 0.717211550693296, "ewc_loss": 0.043138906359672546, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.0001701585715636611, "grad_norm": 5.295650005340576, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.867185652256012, "num_tokens": 215161008.0, "step": 5638 }, { "epoch": 0.7173387609718865, "ewc_loss": 0.04315640777349472, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017033358744811267, "grad_norm": 5.32856559753418, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.843012809753418, "num_tokens": 215197157.0, "step": 5639 }, { "epoch": 0.7174659712504771, "ewc_loss": 0.043166205286979675, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017043156549334526, "grad_norm": 5.234868049621582, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8662506341934204, "num_tokens": 215240270.0, "step": 5640 }, { "epoch": 0.7175931815290676, "ewc_loss": 0.04316890612244606, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017045858839992434, "grad_norm": 5.3062849044799805, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.867268443107605, "num_tokens": 215279828.0, "step": 5641 }, { "epoch": 0.717720391807658, "ewc_loss": 0.04314999282360077, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017026944260578603, "grad_norm": 5.2798848152160645, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8551266193389893, "num_tokens": 215315271.0, "step": 5642 }, { "epoch": 0.7178476020862485, "ewc_loss": 0.04320768266916275, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.0001708463387330994, "grad_norm": 5.326373100280762, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8611184358596802, "num_tokens": 215352621.0, "step": 5643 }, { "epoch": 0.7179748123648391, "ewc_loss": 0.04318692162632942, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017063875566236675, "grad_norm": 5.284745693206787, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8615118265151978, "num_tokens": 215390231.0, "step": 5644 }, { "epoch": 0.7181020226434296, "ewc_loss": 0.04313899949193001, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017015951743815094, "grad_norm": 5.268370151519775, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8741021752357483, "num_tokens": 215426812.0, "step": 5645 }, { "epoch": 0.7182292329220201, "ewc_loss": 0.043168626725673676, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.0001704557944322005, "grad_norm": 5.321624279022217, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8491051197052002, "num_tokens": 215467551.0, "step": 5646 }, { "epoch": 0.7183564432006107, "ewc_loss": 0.04317261278629303, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017049566667992622, "grad_norm": 5.295164108276367, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8678680658340454, "num_tokens": 215508559.0, "step": 5647 }, { "epoch": 0.7184836534792011, "ewc_loss": 0.043151747435331345, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017028700676746666, "grad_norm": 5.296682834625244, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8540197610855103, "num_tokens": 215548709.0, "step": 5648 }, { "epoch": 0.7186108637577916, "ewc_loss": 0.04310263693332672, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016979587962850928, "grad_norm": 5.301835536956787, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8597998023033142, "num_tokens": 215593844.0, "step": 5649 }, { "epoch": 0.7187380740363821, "ewc_loss": 0.043246664106845856, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017001546802930534, "grad_norm": 5.298689842224121, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8649780750274658, "num_tokens": 215633853.0, "step": 5650 }, { "epoch": 0.7188652843149727, "ewc_loss": 0.0431225523352623, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.0001699950371403247, "grad_norm": 5.348979949951172, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.85866779088974, "num_tokens": 215667272.0, "step": 5651 }, { "epoch": 0.7189924945935632, "ewc_loss": 0.0430908203125, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.0001696777471806854, "grad_norm": 5.312941551208496, "learning_rate": 1e-06, "loss": 0.5109, "mean_token_accuracy": 0.8421558141708374, "num_tokens": 215705889.0, "step": 5652 }, { "epoch": 0.7191197048721537, "ewc_loss": 0.04308384284377098, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016960795619525015, "grad_norm": 5.309040069580078, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8694864511489868, "num_tokens": 215749143.0, "step": 5653 }, { "epoch": 0.7192469151507441, "ewc_loss": 0.04319306090474129, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00016947943367995322, "grad_norm": 5.243505954742432, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8550790548324585, "num_tokens": 215787583.0, "step": 5654 }, { "epoch": 0.7193741254293347, "ewc_loss": 0.043231576681137085, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00016986460832413286, "grad_norm": 5.366326808929443, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8600040674209595, "num_tokens": 215825552.0, "step": 5655 }, { "epoch": 0.7195013357079252, "ewc_loss": 0.043267931789159775, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017022814427036792, "grad_norm": 5.315340995788574, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8686556816101074, "num_tokens": 215858310.0, "step": 5656 }, { "epoch": 0.7196285459865157, "ewc_loss": 0.04322272911667824, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00016977611812762916, "grad_norm": 5.335224628448486, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8487762212753296, "num_tokens": 215898403.0, "step": 5657 }, { "epoch": 0.7197557562651062, "ewc_loss": 0.043131567537784576, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017008518625516444, "grad_norm": 5.3462724685668945, "learning_rate": 1e-06, "loss": 0.5512, "mean_token_accuracy": 0.8369195461273193, "num_tokens": 215935930.0, "step": 5658 }, { "epoch": 0.7198829665436968, "ewc_loss": 0.04309106990695, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00016968022100627422, "grad_norm": 5.3022847175598145, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8564997315406799, "num_tokens": 215971509.0, "step": 5659 }, { "epoch": 0.7200101768222873, "ewc_loss": 0.043215226382017136, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001697010884527117, "grad_norm": 5.374319076538086, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8659368753433228, "num_tokens": 216008696.0, "step": 5660 }, { "epoch": 0.7201373871008777, "ewc_loss": 0.04324058070778847, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00016995462647173554, "grad_norm": 5.268568515777588, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8597158193588257, "num_tokens": 216045861.0, "step": 5661 }, { "epoch": 0.7202645973794682, "ewc_loss": 0.04321826249361038, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00016973147285170853, "grad_norm": 5.347606182098389, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8691571950912476, "num_tokens": 216081307.0, "step": 5662 }, { "epoch": 0.7203918076580588, "ewc_loss": 0.0432843342423439, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017039215890690684, "grad_norm": 5.25719690322876, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8692511916160583, "num_tokens": 216118506.0, "step": 5663 }, { "epoch": 0.7205190179366493, "ewc_loss": 0.043178990483284, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00016933874576352537, "grad_norm": 5.272451400756836, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8537066578865051, "num_tokens": 216159000.0, "step": 5664 }, { "epoch": 0.7206462282152398, "ewc_loss": 0.043249040842056274, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017003926041070372, "grad_norm": 5.280331611633301, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8523292541503906, "num_tokens": 216204712.0, "step": 5665 }, { "epoch": 0.7207734384938304, "ewc_loss": 0.043283406645059586, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001703829038888216, "grad_norm": 5.388280391693115, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8522555828094482, "num_tokens": 216235001.0, "step": 5666 }, { "epoch": 0.7209006487724208, "ewc_loss": 0.04329223930835724, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017047120491042733, "grad_norm": 5.324126720428467, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8533594012260437, "num_tokens": 216271383.0, "step": 5667 }, { "epoch": 0.7210278590510113, "ewc_loss": 0.04325631260871887, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017011196177918464, "grad_norm": 5.341136455535889, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8609004020690918, "num_tokens": 216307923.0, "step": 5668 }, { "epoch": 0.7211550693296018, "ewc_loss": 0.04317941144108772, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017056365322787315, "grad_norm": 5.2823615074157715, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8788096904754639, "num_tokens": 216345147.0, "step": 5669 }, { "epoch": 0.7212822796081924, "ewc_loss": 0.043139491230249405, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017016445053741336, "grad_norm": 5.302070617675781, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8525948524475098, "num_tokens": 216386991.0, "step": 5670 }, { "epoch": 0.7214094898867829, "ewc_loss": 0.04327448457479477, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017029367154464126, "grad_norm": 5.300215244293213, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8623948693275452, "num_tokens": 216425534.0, "step": 5671 }, { "epoch": 0.7215367001653734, "ewc_loss": 0.04326334595680237, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017018230573739856, "grad_norm": 5.320643424987793, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8597733974456787, "num_tokens": 216466686.0, "step": 5672 }, { "epoch": 0.7216639104439638, "ewc_loss": 0.04327556490898132, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017030445451382548, "grad_norm": 5.298666477203369, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8719431757926941, "num_tokens": 216505245.0, "step": 5673 }, { "epoch": 0.7217911207225544, "ewc_loss": 0.043237339705228806, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00016992221935652196, "grad_norm": 5.355827331542969, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8565985560417175, "num_tokens": 216538024.0, "step": 5674 }, { "epoch": 0.7219183310011449, "ewc_loss": 0.0431586354970932, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017035589553415775, "grad_norm": 5.37782621383667, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8584589958190918, "num_tokens": 216573722.0, "step": 5675 }, { "epoch": 0.7220455412797354, "ewc_loss": 0.043200790882110596, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00016955674800556153, "grad_norm": 5.1981377601623535, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8655242919921875, "num_tokens": 216610362.0, "step": 5676 }, { "epoch": 0.7221727515583259, "ewc_loss": 0.043236397206783295, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00016991277516353875, "grad_norm": 5.304446220397949, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8525241613388062, "num_tokens": 216648732.0, "step": 5677 }, { "epoch": 0.7222999618369165, "ewc_loss": 0.04331562668085098, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017070509784389287, "grad_norm": 5.308168888092041, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8683623671531677, "num_tokens": 216680324.0, "step": 5678 }, { "epoch": 0.7224271721155069, "ewc_loss": 0.04329206049442291, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017046941502485424, "grad_norm": 5.341585636138916, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.862485408782959, "num_tokens": 216712773.0, "step": 5679 }, { "epoch": 0.7225543823940974, "ewc_loss": 0.0432668961584568, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017021778330672532, "grad_norm": 5.241025924682617, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8524662852287292, "num_tokens": 216752538.0, "step": 5680 }, { "epoch": 0.722681592672688, "ewc_loss": 0.04330509901046753, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017059982928913087, "grad_norm": 5.289097785949707, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8631907105445862, "num_tokens": 216791795.0, "step": 5681 }, { "epoch": 0.7228088029512785, "ewc_loss": 0.0433337464928627, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017088626918848604, "grad_norm": 5.219400882720947, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8569599986076355, "num_tokens": 216831729.0, "step": 5682 }, { "epoch": 0.722936013229869, "ewc_loss": 0.043360352516174316, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017115238006226718, "grad_norm": 5.375487804412842, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8600452542304993, "num_tokens": 216869494.0, "step": 5683 }, { "epoch": 0.7230632235084595, "ewc_loss": 0.043383583426475525, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017138464318122715, "grad_norm": 5.295973300933838, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8414608836174011, "num_tokens": 216906672.0, "step": 5684 }, { "epoch": 0.72319043378705, "ewc_loss": 0.04331390932202339, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017068791203200817, "grad_norm": 5.323339939117432, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8661818504333496, "num_tokens": 216941587.0, "step": 5685 }, { "epoch": 0.7233176440656405, "ewc_loss": 0.04324069619178772, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017117647803388536, "grad_norm": 5.294339656829834, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8531008958816528, "num_tokens": 216981812.0, "step": 5686 }, { "epoch": 0.723444854344231, "ewc_loss": 0.043227966874837875, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017104919243138283, "grad_norm": 5.2575578689575195, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8703826069831848, "num_tokens": 217024137.0, "step": 5687 }, { "epoch": 0.7235720646228215, "ewc_loss": 0.04323218762874603, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017109140753746033, "grad_norm": 5.287068843841553, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8728333115577698, "num_tokens": 217063753.0, "step": 5688 }, { "epoch": 0.7236992749014121, "ewc_loss": 0.04324612021446228, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017123075667768717, "grad_norm": 5.285096645355225, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8582541942596436, "num_tokens": 217105611.0, "step": 5689 }, { "epoch": 0.7238264851800026, "ewc_loss": 0.04327487200498581, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.000171518258866854, "grad_norm": 5.345475196838379, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8404848575592041, "num_tokens": 217145989.0, "step": 5690 }, { "epoch": 0.723953695458593, "ewc_loss": 0.04323512315750122, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017112075875047594, "grad_norm": 5.290063858032227, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8780807256698608, "num_tokens": 217183726.0, "step": 5691 }, { "epoch": 0.7240809057371835, "ewc_loss": 0.043241191655397415, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017118144023697823, "grad_norm": 5.282118797302246, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8581357002258301, "num_tokens": 217226644.0, "step": 5692 }, { "epoch": 0.7242081160157741, "ewc_loss": 0.043193232268095016, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017070185276679695, "grad_norm": 5.378574848175049, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8447836637496948, "num_tokens": 217263327.0, "step": 5693 }, { "epoch": 0.7243353262943646, "ewc_loss": 0.04320728778839111, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017084239516407251, "grad_norm": 5.287705421447754, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.86482834815979, "num_tokens": 217300595.0, "step": 5694 }, { "epoch": 0.7244625365729551, "ewc_loss": 0.04319270700216293, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.0001706965995253995, "grad_norm": 5.342841625213623, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8465808629989624, "num_tokens": 217336385.0, "step": 5695 }, { "epoch": 0.7245897468515456, "ewc_loss": 0.043176278471946716, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017053230840247124, "grad_norm": 5.263081073760986, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8674387335777283, "num_tokens": 217374309.0, "step": 5696 }, { "epoch": 0.7247169571301361, "ewc_loss": 0.04315539076924324, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017032343021128327, "grad_norm": 5.350582599639893, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8503562211990356, "num_tokens": 217408386.0, "step": 5697 }, { "epoch": 0.7248441674087266, "ewc_loss": 0.04321166127920151, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017088616732507944, "grad_norm": 5.348971366882324, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.8467019200325012, "num_tokens": 217442855.0, "step": 5698 }, { "epoch": 0.7249713776873171, "ewc_loss": 0.04315735399723053, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017034307529684156, "grad_norm": 5.284384250640869, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.864608883857727, "num_tokens": 217476892.0, "step": 5699 }, { "epoch": 0.7250985879659076, "ewc_loss": 0.04314238578081131, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017019339429680258, "grad_norm": 5.307903289794922, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8584181070327759, "num_tokens": 217513569.0, "step": 5700 }, { "epoch": 0.7252257982444982, "ewc_loss": 0.04319780692458153, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017074760398827493, "grad_norm": 5.391555309295654, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8553904294967651, "num_tokens": 217545585.0, "step": 5701 }, { "epoch": 0.7253530085230887, "ewc_loss": 0.043167855590581894, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017044808191712946, "grad_norm": 5.289263725280762, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8631869554519653, "num_tokens": 217583156.0, "step": 5702 }, { "epoch": 0.7254802188016791, "ewc_loss": 0.04319264367222786, "ewc_loss_diag": 2.6106834411621094e-05, "ewc_loss_parallel": 0.00017069595924112946, "grad_norm": 5.277092456817627, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8603360652923584, "num_tokens": 217619630.0, "step": 5703 }, { "epoch": 0.7256074290802697, "ewc_loss": 0.04328475520014763, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017039637896232307, "grad_norm": 5.302511215209961, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8708375692367554, "num_tokens": 217653616.0, "step": 5704 }, { "epoch": 0.7257346393588602, "ewc_loss": 0.0433565229177475, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017111405031755567, "grad_norm": 5.2715678215026855, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8557223677635193, "num_tokens": 217692758.0, "step": 5705 }, { "epoch": 0.7258618496374507, "ewc_loss": 0.04335659369826317, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017111476336140186, "grad_norm": 5.26568603515625, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.8442484140396118, "num_tokens": 217734175.0, "step": 5706 }, { "epoch": 0.7259890599160412, "ewc_loss": 0.04338637366890907, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017141256830655038, "grad_norm": 5.339938640594482, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.838170051574707, "num_tokens": 217773372.0, "step": 5707 }, { "epoch": 0.7261162701946318, "ewc_loss": 0.04341007396578789, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017164956079795957, "grad_norm": 5.321661949157715, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.868701696395874, "num_tokens": 217809174.0, "step": 5708 }, { "epoch": 0.7262434804732223, "ewc_loss": 0.043344367295503616, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017099249816965312, "grad_norm": 5.2617034912109375, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8533363342285156, "num_tokens": 217845882.0, "step": 5709 }, { "epoch": 0.7263706907518127, "ewc_loss": 0.04338737577199936, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017142260912805796, "grad_norm": 5.299510478973389, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8555707335472107, "num_tokens": 217885798.0, "step": 5710 }, { "epoch": 0.7264979010304032, "ewc_loss": 0.04337335377931595, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017128234321717173, "grad_norm": 5.328970432281494, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8559799194335938, "num_tokens": 217924967.0, "step": 5711 }, { "epoch": 0.7266251113089938, "ewc_loss": 0.04335916042327881, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017114044749177992, "grad_norm": 5.271597385406494, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8711962699890137, "num_tokens": 217967381.0, "step": 5712 }, { "epoch": 0.7267523215875843, "ewc_loss": 0.04332629591226578, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017081177793443203, "grad_norm": 5.304647922515869, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8658688068389893, "num_tokens": 218001097.0, "step": 5713 }, { "epoch": 0.7268795318661748, "ewc_loss": 0.043340571224689484, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017095451767090708, "grad_norm": 5.259533882141113, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8617405295372009, "num_tokens": 218040705.0, "step": 5714 }, { "epoch": 0.7270067421447653, "ewc_loss": 0.04339275509119034, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.000171476393006742, "grad_norm": 5.346419334411621, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8599303364753723, "num_tokens": 218077549.0, "step": 5715 }, { "epoch": 0.7271339524233558, "ewc_loss": 0.043379589915275574, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017134474182967097, "grad_norm": 5.2400288581848145, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8593864440917969, "num_tokens": 218116251.0, "step": 5716 }, { "epoch": 0.7272611627019463, "ewc_loss": 0.04335766285657883, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017112542991526425, "grad_norm": 5.325772762298584, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.8324525356292725, "num_tokens": 218155798.0, "step": 5717 }, { "epoch": 0.7273883729805368, "ewc_loss": 0.04342612251639366, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017181005387101322, "grad_norm": 5.285851955413818, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8649932146072388, "num_tokens": 218191277.0, "step": 5718 }, { "epoch": 0.7275155832591274, "ewc_loss": 0.04342717304825783, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001718205603538081, "grad_norm": 5.305066108703613, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8660439848899841, "num_tokens": 218233165.0, "step": 5719 }, { "epoch": 0.7276427935377179, "ewc_loss": 0.04343508183956146, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001718996645649895, "grad_norm": 5.331236362457275, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8396974802017212, "num_tokens": 218271263.0, "step": 5720 }, { "epoch": 0.7277700038163084, "ewc_loss": 0.04340677708387375, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017161660070996732, "grad_norm": 5.331120014190674, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8610950708389282, "num_tokens": 218309081.0, "step": 5721 }, { "epoch": 0.7278972140948988, "ewc_loss": 0.04340190812945366, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001715679100016132, "grad_norm": 5.31065034866333, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8543891310691833, "num_tokens": 218348314.0, "step": 5722 }, { "epoch": 0.7280244243734894, "ewc_loss": 0.0433754026889801, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017130287596955895, "grad_norm": 5.285799503326416, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8637465834617615, "num_tokens": 218387556.0, "step": 5723 }, { "epoch": 0.7281516346520799, "ewc_loss": 0.04342115670442581, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017176037363242358, "grad_norm": 5.338372230529785, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8521366119384766, "num_tokens": 218423130.0, "step": 5724 }, { "epoch": 0.7282788449306704, "ewc_loss": 0.043393462896347046, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017148343613371253, "grad_norm": 5.348592758178711, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8417220115661621, "num_tokens": 218464079.0, "step": 5725 }, { "epoch": 0.7284060552092609, "ewc_loss": 0.043390631675720215, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001714551617624238, "grad_norm": 5.303455829620361, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8679149746894836, "num_tokens": 218501307.0, "step": 5726 }, { "epoch": 0.7285332654878515, "ewc_loss": 0.043346062302589417, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017100945115089417, "grad_norm": 5.312347888946533, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8686022758483887, "num_tokens": 218537710.0, "step": 5727 }, { "epoch": 0.7286604757664419, "ewc_loss": 0.043388884514570236, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017143767036031932, "grad_norm": 5.298792839050293, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8560178875923157, "num_tokens": 218576026.0, "step": 5728 }, { "epoch": 0.7287876860450324, "ewc_loss": 0.043355103582143784, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.000171099862200208, "grad_norm": 5.301116466522217, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8565453290939331, "num_tokens": 218612491.0, "step": 5729 }, { "epoch": 0.7289148963236229, "ewc_loss": 0.0433816984295845, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017136581300292164, "grad_norm": 5.3205437660217285, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8677239418029785, "num_tokens": 218642541.0, "step": 5730 }, { "epoch": 0.7290421066022135, "ewc_loss": 0.04339699447154999, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017151878273580223, "grad_norm": 5.339155197143555, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8454822897911072, "num_tokens": 218678514.0, "step": 5731 }, { "epoch": 0.729169316880804, "ewc_loss": 0.04342377930879593, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017178661073558033, "grad_norm": 5.296243667602539, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8634355664253235, "num_tokens": 218715415.0, "step": 5732 }, { "epoch": 0.7292965271593945, "ewc_loss": 0.0434473417699337, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017202224989887327, "grad_norm": 5.327989101409912, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8672717213630676, "num_tokens": 218754422.0, "step": 5733 }, { "epoch": 0.7294237374379849, "ewc_loss": 0.04344088211655617, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017195765394717455, "grad_norm": 5.47411584854126, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8503705859184265, "num_tokens": 218787141.0, "step": 5734 }, { "epoch": 0.7295509477165755, "ewc_loss": 0.043438129127025604, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001719301362754777, "grad_norm": 5.2582926750183105, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8741223812103271, "num_tokens": 218824764.0, "step": 5735 }, { "epoch": 0.729678157995166, "ewc_loss": 0.043366625905036926, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.000171215069713071, "grad_norm": 5.340662479400635, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.857958197593689, "num_tokens": 218866142.0, "step": 5736 }, { "epoch": 0.7298053682737565, "ewc_loss": 0.043443284928798676, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017198167915921658, "grad_norm": 5.2284674644470215, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8537979125976562, "num_tokens": 218909208.0, "step": 5737 }, { "epoch": 0.7299325785523471, "ewc_loss": 0.04342871531844139, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017183598538395017, "grad_norm": 5.370253562927246, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8674414157867432, "num_tokens": 218942469.0, "step": 5738 }, { "epoch": 0.7300597888309376, "ewc_loss": 0.04348739609122276, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017242279136553407, "grad_norm": 5.322129249572754, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8522478938102722, "num_tokens": 218982934.0, "step": 5739 }, { "epoch": 0.730186999109528, "ewc_loss": 0.04346098378300667, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017215865955222398, "grad_norm": 5.323296546936035, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8564732074737549, "num_tokens": 219027055.0, "step": 5740 }, { "epoch": 0.7303142093881185, "ewc_loss": 0.0435054749250412, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017260355525650084, "grad_norm": 5.34174108505249, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8602854013442993, "num_tokens": 219069639.0, "step": 5741 }, { "epoch": 0.7304414196667091, "ewc_loss": 0.04342063516378403, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017175517859868705, "grad_norm": 5.306422710418701, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8536461591720581, "num_tokens": 219111774.0, "step": 5742 }, { "epoch": 0.7305686299452996, "ewc_loss": 0.04345054179430008, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017205422045663, "grad_norm": 5.308782577514648, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8470566868782043, "num_tokens": 219156962.0, "step": 5743 }, { "epoch": 0.7306958402238901, "ewc_loss": 0.04355078190565109, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017183592717628926, "grad_norm": 5.4049201011657715, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8682292699813843, "num_tokens": 219193352.0, "step": 5744 }, { "epoch": 0.7308230505024806, "ewc_loss": 0.043517760932445526, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017150571511592716, "grad_norm": 5.4427595138549805, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.845419704914093, "num_tokens": 219230041.0, "step": 5745 }, { "epoch": 0.7309502607810711, "ewc_loss": 0.0435311459004879, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001716395781841129, "grad_norm": 5.4192609786987305, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8368407487869263, "num_tokens": 219262748.0, "step": 5746 }, { "epoch": 0.7310774710596616, "ewc_loss": 0.04345003515481949, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001708284835331142, "grad_norm": 5.32794713973999, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.871408224105835, "num_tokens": 219296556.0, "step": 5747 }, { "epoch": 0.7312046813382521, "ewc_loss": 0.043459612876176834, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017092424968723208, "grad_norm": 5.339409828186035, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8461490869522095, "num_tokens": 219330801.0, "step": 5748 }, { "epoch": 0.7313318916168426, "ewc_loss": 0.04345196485519409, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017084776482079178, "grad_norm": 5.260159969329834, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8591442108154297, "num_tokens": 219367470.0, "step": 5749 }, { "epoch": 0.7314591018954332, "ewc_loss": 0.043600164353847504, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017110905901063234, "grad_norm": 5.3354811668396, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8467367887496948, "num_tokens": 219405291.0, "step": 5750 }, { "epoch": 0.7315863121740237, "ewc_loss": 0.04366366192698479, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017174403183162212, "grad_norm": 5.260254383087158, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8612120151519775, "num_tokens": 219443365.0, "step": 5751 }, { "epoch": 0.7317135224526141, "ewc_loss": 0.043677181005477905, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017187921912409365, "grad_norm": 5.370626449584961, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8615380525588989, "num_tokens": 219477524.0, "step": 5752 }, { "epoch": 0.7318407327312046, "ewc_loss": 0.04372674226760864, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.0001723748428048566, "grad_norm": 5.324278354644775, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8498531579971313, "num_tokens": 219514576.0, "step": 5753 }, { "epoch": 0.7319679430097952, "ewc_loss": 0.04369175061583519, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.0001720249274512753, "grad_norm": 5.3014235496521, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8601551055908203, "num_tokens": 219558253.0, "step": 5754 }, { "epoch": 0.7320951532883857, "ewc_loss": 0.043752528727054596, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017263268819078803, "grad_norm": 5.307522773742676, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8549069166183472, "num_tokens": 219601156.0, "step": 5755 }, { "epoch": 0.7322223635669762, "ewc_loss": 0.0437580831348896, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017268824740312994, "grad_norm": 5.429958343505859, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8496952056884766, "num_tokens": 219638241.0, "step": 5756 }, { "epoch": 0.7323495738455668, "ewc_loss": 0.0437546968460083, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017265441420022398, "grad_norm": 5.269836902618408, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.860795259475708, "num_tokens": 219678168.0, "step": 5757 }, { "epoch": 0.7324767841241572, "ewc_loss": 0.04369674623012543, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017207488417625427, "grad_norm": 5.425578594207764, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8559517860412598, "num_tokens": 219718115.0, "step": 5758 }, { "epoch": 0.7326039944027477, "ewc_loss": 0.04387007653713226, "ewc_loss_diag": 2.658367156982422e-05, "ewc_loss_parallel": 0.00017258746083825827, "grad_norm": 5.327715873718262, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8587707877159119, "num_tokens": 219754221.0, "step": 5759 }, { "epoch": 0.7327312046813382, "ewc_loss": 0.04358401894569397, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017216829292010516, "grad_norm": 5.430694103240967, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8650195598602295, "num_tokens": 219786880.0, "step": 5760 }, { "epoch": 0.7328584149599288, "ewc_loss": 0.04366553574800491, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.0001717627892503515, "grad_norm": 5.254907131195068, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8628303408622742, "num_tokens": 219830585.0, "step": 5761 }, { "epoch": 0.7329856252385193, "ewc_loss": 0.04350172355771065, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017134535301011056, "grad_norm": 5.353662490844727, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8631293177604675, "num_tokens": 219867249.0, "step": 5762 }, { "epoch": 0.7331128355171098, "ewc_loss": 0.043480969965457916, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017235854465980083, "grad_norm": 5.328463077545166, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8507952690124512, "num_tokens": 219906046.0, "step": 5763 }, { "epoch": 0.7332400457957003, "ewc_loss": 0.04336246848106384, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017117353854700923, "grad_norm": 5.3467583656311035, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8499557971954346, "num_tokens": 219947466.0, "step": 5764 }, { "epoch": 0.7333672560742908, "ewc_loss": 0.04342329502105713, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017178179405163974, "grad_norm": 5.309940338134766, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8612728118896484, "num_tokens": 219985698.0, "step": 5765 }, { "epoch": 0.7334944663528813, "ewc_loss": 0.04342372715473175, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017178610141854733, "grad_norm": 5.3414106369018555, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.858943521976471, "num_tokens": 220026158.0, "step": 5766 }, { "epoch": 0.7336216766314718, "ewc_loss": 0.043402425944805145, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017157307593151927, "grad_norm": 5.271428108215332, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.85733562707901, "num_tokens": 220071849.0, "step": 5767 }, { "epoch": 0.7337488869100623, "ewc_loss": 0.04341009631752968, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017164979362860322, "grad_norm": 5.348781585693359, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8648794889450073, "num_tokens": 220110255.0, "step": 5768 }, { "epoch": 0.7338760971886529, "ewc_loss": 0.04344861954450607, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.000172035041032359, "grad_norm": 5.359545707702637, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8433037996292114, "num_tokens": 220144962.0, "step": 5769 }, { "epoch": 0.7340033074672434, "ewc_loss": 0.04344530403614044, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001720018481137231, "grad_norm": 5.376286029815674, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8496600389480591, "num_tokens": 220182118.0, "step": 5770 }, { "epoch": 0.7341305177458338, "ewc_loss": 0.043396737426519394, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001715162070468068, "grad_norm": 5.3009748458862305, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8498596549034119, "num_tokens": 220218954.0, "step": 5771 }, { "epoch": 0.7342577280244243, "ewc_loss": 0.04342412203550339, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017179005953948945, "grad_norm": 5.339503288269043, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8579692840576172, "num_tokens": 220259116.0, "step": 5772 }, { "epoch": 0.7343849383030149, "ewc_loss": 0.043392281979322433, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017147164908237755, "grad_norm": 5.352842807769775, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.855425238609314, "num_tokens": 220301742.0, "step": 5773 }, { "epoch": 0.7345121485816054, "ewc_loss": 0.04341360554099083, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017168489284813404, "grad_norm": 5.2694807052612305, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8624929189682007, "num_tokens": 220342572.0, "step": 5774 }, { "epoch": 0.7346393588601959, "ewc_loss": 0.04340549558401108, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001716037659207359, "grad_norm": 5.341131687164307, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8644793629646301, "num_tokens": 220376373.0, "step": 5775 }, { "epoch": 0.7347665691387865, "ewc_loss": 0.043421801179647446, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001717668492347002, "grad_norm": 5.257737636566162, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8755241632461548, "num_tokens": 220413290.0, "step": 5776 }, { "epoch": 0.7348937794173769, "ewc_loss": 0.04344164580106735, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017196530825458467, "grad_norm": 5.359438896179199, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8549685478210449, "num_tokens": 220453219.0, "step": 5777 }, { "epoch": 0.7350209896959674, "ewc_loss": 0.04344112426042557, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017196006956510246, "grad_norm": 5.300332546234131, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8597537875175476, "num_tokens": 220488369.0, "step": 5778 }, { "epoch": 0.7351481999745579, "ewc_loss": 0.04341541975736618, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017170303908642381, "grad_norm": 5.292661666870117, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8629327416419983, "num_tokens": 220527798.0, "step": 5779 }, { "epoch": 0.7352754102531485, "ewc_loss": 0.043507784605026245, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017262667824979872, "grad_norm": 5.340935707092285, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8768777847290039, "num_tokens": 220565398.0, "step": 5780 }, { "epoch": 0.735402620531739, "ewc_loss": 0.043497614562511444, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017252496036235243, "grad_norm": 5.2843217849731445, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8686727285385132, "num_tokens": 220603903.0, "step": 5781 }, { "epoch": 0.7355298308103295, "ewc_loss": 0.043472111225128174, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017226995259989053, "grad_norm": 5.319276809692383, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8418064713478088, "num_tokens": 220644168.0, "step": 5782 }, { "epoch": 0.7356570410889199, "ewc_loss": 0.043507665395736694, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017262547044083476, "grad_norm": 5.344909191131592, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8719370365142822, "num_tokens": 220679709.0, "step": 5783 }, { "epoch": 0.7357842513675105, "ewc_loss": 0.04348978400230408, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017244668561033905, "grad_norm": 5.293294906616211, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8524425625801086, "num_tokens": 220721184.0, "step": 5784 }, { "epoch": 0.735911461646101, "ewc_loss": 0.043489523231983185, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017244408081751317, "grad_norm": 5.354361057281494, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8631699085235596, "num_tokens": 220755027.0, "step": 5785 }, { "epoch": 0.7360386719246915, "ewc_loss": 0.043551601469516754, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001730648218654096, "grad_norm": 5.368910312652588, "learning_rate": 1e-06, "loss": 0.5329, "mean_token_accuracy": 0.834805965423584, "num_tokens": 220793597.0, "step": 5786 }, { "epoch": 0.736165882203282, "ewc_loss": 0.043459758162498474, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017214639228768647, "grad_norm": 5.3056793212890625, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8577183485031128, "num_tokens": 220833549.0, "step": 5787 }, { "epoch": 0.7362930924818726, "ewc_loss": 0.04360561445355415, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017238427244592458, "grad_norm": 5.367736339569092, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8655707240104675, "num_tokens": 220865551.0, "step": 5788 }, { "epoch": 0.736420302760463, "ewc_loss": 0.04348067194223404, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.000172355561517179, "grad_norm": 5.252452850341797, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8711608052253723, "num_tokens": 220911626.0, "step": 5789 }, { "epoch": 0.7365475130390535, "ewc_loss": 0.04350313916802406, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017258021398447454, "grad_norm": 5.4254937171936035, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8604670166969299, "num_tokens": 220949839.0, "step": 5790 }, { "epoch": 0.736674723317644, "ewc_loss": 0.043588489294052124, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017343372746836394, "grad_norm": 5.344031810760498, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8424501419067383, "num_tokens": 220984069.0, "step": 5791 }, { "epoch": 0.7368019335962346, "ewc_loss": 0.043479204177856445, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001723408786347136, "grad_norm": 5.302455425262451, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8793869614601135, "num_tokens": 221020999.0, "step": 5792 }, { "epoch": 0.7369291438748251, "ewc_loss": 0.043529096990823746, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017283979104831815, "grad_norm": 5.416556358337402, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8497027158737183, "num_tokens": 221051082.0, "step": 5793 }, { "epoch": 0.7370563541534156, "ewc_loss": 0.0435294434428215, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017284328350797296, "grad_norm": 5.333152770996094, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8411304354667664, "num_tokens": 221095444.0, "step": 5794 }, { "epoch": 0.737183564432006, "ewc_loss": 0.04360683262348175, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017239646695088595, "grad_norm": 5.3532938957214355, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8616000413894653, "num_tokens": 221136433.0, "step": 5795 }, { "epoch": 0.7373107747105966, "ewc_loss": 0.04356151819229126, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017316400771960616, "grad_norm": 5.3783793449401855, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8517202138900757, "num_tokens": 221171278.0, "step": 5796 }, { "epoch": 0.7374379849891871, "ewc_loss": 0.043626029044389725, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001725884067127481, "grad_norm": 5.345006942749023, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8584551811218262, "num_tokens": 221209380.0, "step": 5797 }, { "epoch": 0.7375651952677776, "ewc_loss": 0.0436243861913681, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017257196304854006, "grad_norm": 5.334991931915283, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8574380278587341, "num_tokens": 221253076.0, "step": 5798 }, { "epoch": 0.7376924055463682, "ewc_loss": 0.04352923482656479, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017284118803218007, "grad_norm": 5.387129783630371, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.854362428188324, "num_tokens": 221290348.0, "step": 5799 }, { "epoch": 0.7378196158249587, "ewc_loss": 0.04349137097597122, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017246253264602274, "grad_norm": 5.301153659820557, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8653228878974915, "num_tokens": 221329635.0, "step": 5800 }, { "epoch": 0.7379468261035491, "ewc_loss": 0.0435105599462986, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001726544287521392, "grad_norm": 5.343048095703125, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8663583993911743, "num_tokens": 221368521.0, "step": 5801 }, { "epoch": 0.7380740363821396, "ewc_loss": 0.043764878064394, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.0001727562048472464, "grad_norm": 5.347996711730957, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8707469701766968, "num_tokens": 221407090.0, "step": 5802 }, { "epoch": 0.7382012466607302, "ewc_loss": 0.043593212962150574, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017226027557626367, "grad_norm": 5.288631916046143, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8682390451431274, "num_tokens": 221450340.0, "step": 5803 }, { "epoch": 0.7383284569393207, "ewc_loss": 0.04360412433743477, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017236937128473073, "grad_norm": 5.3866472244262695, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8602876663208008, "num_tokens": 221491996.0, "step": 5804 }, { "epoch": 0.7384556672179112, "ewc_loss": 0.04362610727548599, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001725892216200009, "grad_norm": 5.346292972564697, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8677435517311096, "num_tokens": 221531466.0, "step": 5805 }, { "epoch": 0.7385828774965018, "ewc_loss": 0.04341770336031914, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017172587104141712, "grad_norm": 5.36710786819458, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8404008150100708, "num_tokens": 221567003.0, "step": 5806 }, { "epoch": 0.7387100877750922, "ewc_loss": 0.04349007457494736, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017244959599338472, "grad_norm": 5.350073337554932, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8711243867874146, "num_tokens": 221607396.0, "step": 5807 }, { "epoch": 0.7388372980536827, "ewc_loss": 0.04346352070569992, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017218403809238225, "grad_norm": 5.366916179656982, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8518598079681396, "num_tokens": 221644776.0, "step": 5808 }, { "epoch": 0.7389645083322732, "ewc_loss": 0.04342644661664963, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017181331350002438, "grad_norm": 5.350644588470459, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8711652755737305, "num_tokens": 221685643.0, "step": 5809 }, { "epoch": 0.7390917186108638, "ewc_loss": 0.0434957854449749, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001725066831568256, "grad_norm": 5.345078945159912, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8522958159446716, "num_tokens": 221724673.0, "step": 5810 }, { "epoch": 0.7392189288894543, "ewc_loss": 0.043445736169815063, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017200618458446115, "grad_norm": 5.325092792510986, "learning_rate": 1e-06, "loss": 0.5444, "mean_token_accuracy": 0.8356449604034424, "num_tokens": 221766287.0, "step": 5811 }, { "epoch": 0.7393461391680448, "ewc_loss": 0.04347926378250122, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017234146071132272, "grad_norm": 5.337121486663818, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8575485944747925, "num_tokens": 221805357.0, "step": 5812 }, { "epoch": 0.7394733494466353, "ewc_loss": 0.04348741099238396, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017242293688468635, "grad_norm": 5.431175231933594, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8567047715187073, "num_tokens": 221838513.0, "step": 5813 }, { "epoch": 0.7396005597252258, "ewc_loss": 0.04351097345352173, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001726585760479793, "grad_norm": 5.303292274475098, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8481655120849609, "num_tokens": 221882808.0, "step": 5814 }, { "epoch": 0.7397277700038163, "ewc_loss": 0.04349274933338165, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017247629875782877, "grad_norm": 5.360666275024414, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8725498914718628, "num_tokens": 221918505.0, "step": 5815 }, { "epoch": 0.7398549802824068, "ewc_loss": 0.043642349541187286, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001727516355458647, "grad_norm": 5.397480487823486, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8600620031356812, "num_tokens": 221955983.0, "step": 5816 }, { "epoch": 0.7399821905609973, "ewc_loss": 0.0434943363070488, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017249217489734292, "grad_norm": 5.350926876068115, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8585214018821716, "num_tokens": 221997363.0, "step": 5817 }, { "epoch": 0.7401094008395879, "ewc_loss": 0.04361206665635109, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017244879563804716, "grad_norm": 5.351129531860352, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8443676233291626, "num_tokens": 222039302.0, "step": 5818 }, { "epoch": 0.7402366111181784, "ewc_loss": 0.043641336262226105, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017274146375712007, "grad_norm": 5.382843494415283, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8577712774276733, "num_tokens": 222075779.0, "step": 5819 }, { "epoch": 0.7403638213967688, "ewc_loss": 0.04366721957921982, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017300034232903272, "grad_norm": 5.332341194152832, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8785193562507629, "num_tokens": 222115120.0, "step": 5820 }, { "epoch": 0.7404910316753593, "ewc_loss": 0.043611086905002594, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017243900219909847, "grad_norm": 5.3177008628845215, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.850655198097229, "num_tokens": 222156570.0, "step": 5821 }, { "epoch": 0.7406182419539499, "ewc_loss": 0.04368183761835098, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001731464872136712, "grad_norm": 5.371840953826904, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8590227365493774, "num_tokens": 222189487.0, "step": 5822 }, { "epoch": 0.7407454522325404, "ewc_loss": 0.04366195946931839, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001729476934997365, "grad_norm": 5.4049530029296875, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8574732542037964, "num_tokens": 222223163.0, "step": 5823 }, { "epoch": 0.7408726625111309, "ewc_loss": 0.04366826266050339, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017301076150033623, "grad_norm": 5.317004680633545, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8713125586509705, "num_tokens": 222259042.0, "step": 5824 }, { "epoch": 0.7409998727897215, "ewc_loss": 0.04355474188923836, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017309623945038766, "grad_norm": 5.336732387542725, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.8397918939590454, "num_tokens": 222300497.0, "step": 5825 }, { "epoch": 0.7411270830683119, "ewc_loss": 0.0437319315969944, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017364743689540774, "grad_norm": 5.38918924331665, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8556797504425049, "num_tokens": 222333630.0, "step": 5826 }, { "epoch": 0.7412542933469024, "ewc_loss": 0.043713293969631195, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017346105596516281, "grad_norm": 5.3469157218933105, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.854377031326294, "num_tokens": 222371715.0, "step": 5827 }, { "epoch": 0.7413815036254929, "ewc_loss": 0.04371964558959007, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017352458962704986, "grad_norm": 5.304870128631592, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8598164319992065, "num_tokens": 222412044.0, "step": 5828 }, { "epoch": 0.7415087139040835, "ewc_loss": 0.043754275888204575, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001738708815537393, "grad_norm": 5.326321125030518, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8616981506347656, "num_tokens": 222452668.0, "step": 5829 }, { "epoch": 0.741635924182674, "ewc_loss": 0.04377857223153114, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017411384033039212, "grad_norm": 5.388466835021973, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8608759045600891, "num_tokens": 222485631.0, "step": 5830 }, { "epoch": 0.7417631344612645, "ewc_loss": 0.04379623383283615, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017429047147743404, "grad_norm": 5.319869041442871, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8738066554069519, "num_tokens": 222522967.0, "step": 5831 }, { "epoch": 0.7418903447398549, "ewc_loss": 0.04376204311847687, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017394853057339787, "grad_norm": 5.3471360206604, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8483154773712158, "num_tokens": 222563258.0, "step": 5832 }, { "epoch": 0.7420175550184455, "ewc_loss": 0.0438346341252327, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017467446741648018, "grad_norm": 5.432305812835693, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8523449897766113, "num_tokens": 222599248.0, "step": 5833 }, { "epoch": 0.742144765297036, "ewc_loss": 0.043768078088760376, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001740089210215956, "grad_norm": 5.303246021270752, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8657780885696411, "num_tokens": 222642575.0, "step": 5834 }, { "epoch": 0.7422719755756265, "ewc_loss": 0.04379204288125038, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017424854740966111, "grad_norm": 5.351680278778076, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.8360975384712219, "num_tokens": 222687330.0, "step": 5835 }, { "epoch": 0.742399185854217, "ewc_loss": 0.04377450421452522, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017407316772732884, "grad_norm": 5.44079065322876, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8339471817016602, "num_tokens": 222720020.0, "step": 5836 }, { "epoch": 0.7425263961328076, "ewc_loss": 0.04381438344717026, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.000174471948412247, "grad_norm": 5.434568881988525, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8718351125717163, "num_tokens": 222758634.0, "step": 5837 }, { "epoch": 0.742653606411398, "ewc_loss": 0.04375652223825455, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017389336426276714, "grad_norm": 5.37813138961792, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8654321432113647, "num_tokens": 222794823.0, "step": 5838 }, { "epoch": 0.7427808166899885, "ewc_loss": 0.044094108045101166, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017360711353830993, "grad_norm": 5.329777240753174, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8584060668945312, "num_tokens": 222833836.0, "step": 5839 }, { "epoch": 0.742908026968579, "ewc_loss": 0.04402162507176399, "ewc_loss_diag": 2.658367156982422e-05, "ewc_loss_parallel": 0.00017410297004971653, "grad_norm": 5.36532735824585, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8603647947311401, "num_tokens": 222875712.0, "step": 5840 }, { "epoch": 0.7430352372471696, "ewc_loss": 0.0437496080994606, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001738241990096867, "grad_norm": 5.380305290222168, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8477522134780884, "num_tokens": 222915494.0, "step": 5841 }, { "epoch": 0.7431624475257601, "ewc_loss": 0.04389600455760956, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017406746337655932, "grad_norm": 5.384721755981445, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8751064538955688, "num_tokens": 222951675.0, "step": 5842 }, { "epoch": 0.7432896578043506, "ewc_loss": 0.04369952529668808, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017332335119135678, "grad_norm": 5.317256927490234, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8549209237098694, "num_tokens": 222993245.0, "step": 5843 }, { "epoch": 0.743416868082941, "ewc_loss": 0.04370935261249542, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001734216493787244, "grad_norm": 5.3081440925598145, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8650739192962646, "num_tokens": 223029876.0, "step": 5844 }, { "epoch": 0.7435440783615316, "ewc_loss": 0.04373998939990997, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001737280108500272, "grad_norm": 5.347294807434082, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8642975687980652, "num_tokens": 223072550.0, "step": 5845 }, { "epoch": 0.7436712886401221, "ewc_loss": 0.0437297597527504, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.000173625725437887, "grad_norm": 5.350668430328369, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.857353687286377, "num_tokens": 223112892.0, "step": 5846 }, { "epoch": 0.7437984989187126, "ewc_loss": 0.04377148300409317, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017404297250322998, "grad_norm": 5.311345100402832, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8640979528427124, "num_tokens": 223157093.0, "step": 5847 }, { "epoch": 0.7439257091973032, "ewc_loss": 0.043733417987823486, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017366230895277113, "grad_norm": 5.3861470222473145, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8495447635650635, "num_tokens": 223190756.0, "step": 5848 }, { "epoch": 0.7440529194758937, "ewc_loss": 0.04377954453229904, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017412359011359513, "grad_norm": 5.396280765533447, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8490622639656067, "num_tokens": 223226013.0, "step": 5849 }, { "epoch": 0.7441801297544841, "ewc_loss": 0.04371549189090729, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017348302935715765, "grad_norm": 5.352588653564453, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8708447217941284, "num_tokens": 223262858.0, "step": 5850 }, { "epoch": 0.7443073400330746, "ewc_loss": 0.043700363487005234, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017333176219835877, "grad_norm": 5.321907043457031, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8626725077629089, "num_tokens": 223310528.0, "step": 5851 }, { "epoch": 0.7444345503116652, "ewc_loss": 0.043677397072315216, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017310208932030946, "grad_norm": 5.367408752441406, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8607364892959595, "num_tokens": 223345021.0, "step": 5852 }, { "epoch": 0.7445617605902557, "ewc_loss": 0.04371010139584541, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.000173429143615067, "grad_norm": 5.327630043029785, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8689000010490417, "num_tokens": 223382055.0, "step": 5853 }, { "epoch": 0.7446889708688462, "ewc_loss": 0.043730415403842926, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.000173632288351655, "grad_norm": 5.379898548126221, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8492213487625122, "num_tokens": 223418441.0, "step": 5854 }, { "epoch": 0.7448161811474368, "ewc_loss": 0.04378320276737213, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017416014452464879, "grad_norm": 5.514174461364746, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8573520183563232, "num_tokens": 223458150.0, "step": 5855 }, { "epoch": 0.7449433914260272, "ewc_loss": 0.043753936886787415, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017386752006132156, "grad_norm": 5.297566890716553, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8554055690765381, "num_tokens": 223496916.0, "step": 5856 }, { "epoch": 0.7450706017046177, "ewc_loss": 0.043655864894390106, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001728867500787601, "grad_norm": 5.382899284362793, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8649915456771851, "num_tokens": 223534808.0, "step": 5857 }, { "epoch": 0.7451978119832082, "ewc_loss": 0.04377865791320801, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017411471344530582, "grad_norm": 5.3858256340026855, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8679724931716919, "num_tokens": 223571255.0, "step": 5858 }, { "epoch": 0.7453250222617988, "ewc_loss": 0.04359915852546692, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017354039300698787, "grad_norm": 5.3287672996521, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8623846769332886, "num_tokens": 223616608.0, "step": 5859 }, { "epoch": 0.7454522325403893, "ewc_loss": 0.04368888959288597, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001732170203467831, "grad_norm": 5.354536056518555, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8588800430297852, "num_tokens": 223656188.0, "step": 5860 }, { "epoch": 0.7455794428189798, "ewc_loss": 0.043628886342048645, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001738376886351034, "grad_norm": 5.382932186126709, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8543576002120972, "num_tokens": 223693787.0, "step": 5861 }, { "epoch": 0.7457066530975703, "ewc_loss": 0.04371492564678192, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017347739776596427, "grad_norm": 5.355869770050049, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8711445927619934, "num_tokens": 223729350.0, "step": 5862 }, { "epoch": 0.7458338633761608, "ewc_loss": 0.043540097773075104, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001729498035274446, "grad_norm": 5.345451354980469, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.8476746082305908, "num_tokens": 223770846.0, "step": 5863 }, { "epoch": 0.7459610736547513, "ewc_loss": 0.043602585792541504, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017357469187118113, "grad_norm": 5.31899356842041, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8588215708732605, "num_tokens": 223814194.0, "step": 5864 }, { "epoch": 0.7460882839333418, "ewc_loss": 0.04363350197672844, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017388384731020778, "grad_norm": 5.416762351989746, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8524379730224609, "num_tokens": 223849141.0, "step": 5865 }, { "epoch": 0.7462154942119323, "ewc_loss": 0.04374763369560242, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017380448116455227, "grad_norm": 5.292428970336914, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.881239652633667, "num_tokens": 223888654.0, "step": 5866 }, { "epoch": 0.7463427044905229, "ewc_loss": 0.043706439435482025, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017339250189252198, "grad_norm": 5.3730149269104, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8621218204498291, "num_tokens": 223924509.0, "step": 5867 }, { "epoch": 0.7464699147691134, "ewc_loss": 0.04376165568828583, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017394470341969281, "grad_norm": 5.3430023193359375, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8446266055107117, "num_tokens": 223966191.0, "step": 5868 }, { "epoch": 0.7465971250477038, "ewc_loss": 0.04370886832475662, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017341678903903812, "grad_norm": 5.281925678253174, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8617292046546936, "num_tokens": 224001415.0, "step": 5869 }, { "epoch": 0.7467243353262943, "ewc_loss": 0.04380108416080475, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017433898756280541, "grad_norm": 5.349571228027344, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8658502101898193, "num_tokens": 224042820.0, "step": 5870 }, { "epoch": 0.7468515456048849, "ewc_loss": 0.04377845302224159, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017411266162525862, "grad_norm": 5.341549396514893, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8645301461219788, "num_tokens": 224078411.0, "step": 5871 }, { "epoch": 0.7469787558834754, "ewc_loss": 0.043774522840976715, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017407332779839635, "grad_norm": 5.337069988250732, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.86579430103302, "num_tokens": 224117136.0, "step": 5872 }, { "epoch": 0.7471059661620659, "ewc_loss": 0.04377599060535431, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017408803978469223, "grad_norm": 5.3382039070129395, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8530794382095337, "num_tokens": 224155879.0, "step": 5873 }, { "epoch": 0.7472331764406565, "ewc_loss": 0.04377807304263115, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001741088490234688, "grad_norm": 5.335572719573975, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8706681728363037, "num_tokens": 224195186.0, "step": 5874 }, { "epoch": 0.7473603867192469, "ewc_loss": 0.043804511427879333, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017437325732316822, "grad_norm": 5.334712505340576, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8557448983192444, "num_tokens": 224235722.0, "step": 5875 }, { "epoch": 0.7474875969978374, "ewc_loss": 0.043769098818302155, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017401909281034023, "grad_norm": 5.37943172454834, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8677375316619873, "num_tokens": 224268461.0, "step": 5876 }, { "epoch": 0.7476148072764279, "ewc_loss": 0.043837420642375946, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001747023343341425, "grad_norm": 5.395961284637451, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8723928928375244, "num_tokens": 224302921.0, "step": 5877 }, { "epoch": 0.7477420175550185, "ewc_loss": 0.04380303621292114, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017435848712921143, "grad_norm": 5.288426399230957, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8602628707885742, "num_tokens": 224344081.0, "step": 5878 }, { "epoch": 0.747869227833609, "ewc_loss": 0.043681271374225616, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017436152847949415, "grad_norm": 5.379327297210693, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8470678329467773, "num_tokens": 224384464.0, "step": 5879 }, { "epoch": 0.7479964381121995, "ewc_loss": 0.043738435953855515, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017493318591732532, "grad_norm": 5.317054271697998, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8672791123390198, "num_tokens": 224425542.0, "step": 5880 }, { "epoch": 0.7481236483907899, "ewc_loss": 0.043654702603816986, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017409585416316986, "grad_norm": 5.338139057159424, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8763924837112427, "num_tokens": 224465342.0, "step": 5881 }, { "epoch": 0.7482508586693805, "ewc_loss": 0.04381413012742996, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001744694309309125, "grad_norm": 5.374416351318359, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.863350510597229, "num_tokens": 224504787.0, "step": 5882 }, { "epoch": 0.748378068947971, "ewc_loss": 0.04380505904555321, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017437871429137886, "grad_norm": 5.352777004241943, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8604809045791626, "num_tokens": 224548534.0, "step": 5883 }, { "epoch": 0.7485052792265615, "ewc_loss": 0.043625738471746445, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017380621284246445, "grad_norm": 5.329251289367676, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8511477708816528, "num_tokens": 224590251.0, "step": 5884 }, { "epoch": 0.748632489505152, "ewc_loss": 0.04369356855750084, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001744845212670043, "grad_norm": 5.3580169677734375, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8694746494293213, "num_tokens": 224633745.0, "step": 5885 }, { "epoch": 0.7487596997837426, "ewc_loss": 0.04365932196378708, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017414205649401993, "grad_norm": 5.381134986877441, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8436176776885986, "num_tokens": 224671319.0, "step": 5886 }, { "epoch": 0.748886910062333, "ewc_loss": 0.04369974881410599, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017454629414714873, "grad_norm": 5.3568339347839355, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8597369194030762, "num_tokens": 224712670.0, "step": 5887 }, { "epoch": 0.7490141203409235, "ewc_loss": 0.043655261397361755, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017410144209861755, "grad_norm": 5.382236003875732, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8566467761993408, "num_tokens": 224747840.0, "step": 5888 }, { "epoch": 0.749141330619514, "ewc_loss": 0.043713368475437164, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017468248552177101, "grad_norm": 5.37818717956543, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8703452348709106, "num_tokens": 224783274.0, "step": 5889 }, { "epoch": 0.7492685408981046, "ewc_loss": 0.04370982199907303, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001746470370562747, "grad_norm": 5.352313995361328, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.855548620223999, "num_tokens": 224820456.0, "step": 5890 }, { "epoch": 0.7493957511766951, "ewc_loss": 0.04367389529943466, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001742877793731168, "grad_norm": 5.363062858581543, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8633875846862793, "num_tokens": 224858569.0, "step": 5891 }, { "epoch": 0.7495229614552856, "ewc_loss": 0.04372330382466316, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017478186055086553, "grad_norm": 5.358098983764648, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8579710721969604, "num_tokens": 224901033.0, "step": 5892 }, { "epoch": 0.749650171733876, "ewc_loss": 0.04368956387042999, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017444445984438062, "grad_norm": 5.3833112716674805, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8739141821861267, "num_tokens": 224934345.0, "step": 5893 }, { "epoch": 0.7497773820124666, "ewc_loss": 0.04372694715857506, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017481829854659736, "grad_norm": 5.390567779541016, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8521076440811157, "num_tokens": 224972479.0, "step": 5894 }, { "epoch": 0.7499045922910571, "ewc_loss": 0.04372671619057655, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017481598479207605, "grad_norm": 5.35319709777832, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8757424354553223, "num_tokens": 225014213.0, "step": 5895 }, { "epoch": 0.7500318025696476, "ewc_loss": 0.04373042285442352, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001748530485201627, "grad_norm": 5.3621063232421875, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8670089840888977, "num_tokens": 225049929.0, "step": 5896 }, { "epoch": 0.7501590128482382, "ewc_loss": 0.04374070465564728, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017495588690508157, "grad_norm": 5.426296234130859, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8450751304626465, "num_tokens": 225086409.0, "step": 5897 }, { "epoch": 0.7502862231268287, "ewc_loss": 0.0437585674226284, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017513451166450977, "grad_norm": 5.395678520202637, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.842131495475769, "num_tokens": 225124443.0, "step": 5898 }, { "epoch": 0.7504134334054191, "ewc_loss": 0.04373233765363693, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017487221339251846, "grad_norm": 5.364790439605713, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.84868323802948, "num_tokens": 225166665.0, "step": 5899 }, { "epoch": 0.7505406436840096, "ewc_loss": 0.04369335621595383, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017448239668738097, "grad_norm": 5.364707946777344, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8353459239006042, "num_tokens": 225203841.0, "step": 5900 }, { "epoch": 0.7506678539626002, "ewc_loss": 0.043724872171878815, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017479753296356648, "grad_norm": 5.442560195922852, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8429192900657654, "num_tokens": 225234844.0, "step": 5901 }, { "epoch": 0.7507950642411907, "ewc_loss": 0.043762728571891785, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017517608648631722, "grad_norm": 5.431029796600342, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8418567180633545, "num_tokens": 225269828.0, "step": 5902 }, { "epoch": 0.7509222745197812, "ewc_loss": 0.043728455901145935, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001748334034346044, "grad_norm": 5.329214572906494, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8635814189910889, "num_tokens": 225312361.0, "step": 5903 }, { "epoch": 0.7510494847983717, "ewc_loss": 0.04374245926737785, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017497342196293175, "grad_norm": 5.3739471435546875, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8489633798599243, "num_tokens": 225353708.0, "step": 5904 }, { "epoch": 0.7511766950769622, "ewc_loss": 0.04371919110417366, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017474073683843017, "grad_norm": 5.296419143676758, "learning_rate": 1e-06, "loss": 0.3928, "mean_token_accuracy": 0.8779915571212769, "num_tokens": 225399730.0, "step": 5905 }, { "epoch": 0.7513039053555527, "ewc_loss": 0.04377349466085434, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017528377065900713, "grad_norm": 5.384571075439453, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8535126447677612, "num_tokens": 225441836.0, "step": 5906 }, { "epoch": 0.7514311156341432, "ewc_loss": 0.043781816959381104, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017536697851028293, "grad_norm": 5.408867835998535, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8667723536491394, "num_tokens": 225471004.0, "step": 5907 }, { "epoch": 0.7515583259127337, "ewc_loss": 0.04390615224838257, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017538966494612396, "grad_norm": 5.472219944000244, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.841488778591156, "num_tokens": 225505789.0, "step": 5908 }, { "epoch": 0.7516855361913243, "ewc_loss": 0.04415694624185562, "ewc_loss_diag": 2.658367156982422e-05, "ewc_loss_parallel": 0.00017545618175063282, "grad_norm": 9.019861221313477, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8614222407341003, "num_tokens": 225541138.0, "step": 5909 }, { "epoch": 0.7518127464699148, "ewc_loss": 0.047705940902233124, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00021338752412702888, "grad_norm": 5.9232940673828125, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8625804781913757, "num_tokens": 225581695.0, "step": 5910 }, { "epoch": 0.7519399567485053, "ewc_loss": 0.042855154722929, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00016487967513967305, "grad_norm": 5.168241500854492, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8651912212371826, "num_tokens": 225619532.0, "step": 5911 }, { "epoch": 0.7520671670270958, "ewc_loss": 0.044389113783836365, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00018021925643552095, "grad_norm": 5.61051607131958, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8515043258666992, "num_tokens": 225657403.0, "step": 5912 }, { "epoch": 0.7521943773056863, "ewc_loss": 0.044022317975759506, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001765513006830588, "grad_norm": 5.380157470703125, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8550023436546326, "num_tokens": 225693402.0, "step": 5913 }, { "epoch": 0.7523215875842768, "ewc_loss": 0.04390529915690422, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001753811229718849, "grad_norm": 5.499664783477783, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8578079342842102, "num_tokens": 225727208.0, "step": 5914 }, { "epoch": 0.7524487978628673, "ewc_loss": 0.04400525614619255, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001763806794770062, "grad_norm": 5.372666835784912, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8634760975837708, "num_tokens": 225762117.0, "step": 5915 }, { "epoch": 0.7525760081414579, "ewc_loss": 0.043879084289073944, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.00017511895566713065, "grad_norm": 5.513871192932129, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8441885709762573, "num_tokens": 225794737.0, "step": 5916 }, { "epoch": 0.7527032184200484, "ewc_loss": 0.044025324285030365, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001765813649399206, "grad_norm": 5.372437000274658, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8587111234664917, "num_tokens": 225829074.0, "step": 5917 }, { "epoch": 0.7528304286986388, "ewc_loss": 0.0439131073653698, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001754591939970851, "grad_norm": 5.372063159942627, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8721477389335632, "num_tokens": 225871344.0, "step": 5918 }, { "epoch": 0.7529576389772293, "ewc_loss": 0.04402969405055046, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001766250643413514, "grad_norm": 5.422471523284912, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8545867204666138, "num_tokens": 225911321.0, "step": 5919 }, { "epoch": 0.7530848492558199, "ewc_loss": 0.04395364969968796, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001758646103553474, "grad_norm": 5.405383110046387, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8678299188613892, "num_tokens": 225944613.0, "step": 5920 }, { "epoch": 0.7532120595344104, "ewc_loss": 0.044281721115112305, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017548323376104236, "grad_norm": 9.03585147857666, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8638625741004944, "num_tokens": 225989501.0, "step": 5921 }, { "epoch": 0.7533392698130009, "ewc_loss": 0.04814312607049942, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.0002165386831620708, "grad_norm": 5.99903678894043, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8620703220367432, "num_tokens": 226024984.0, "step": 5922 }, { "epoch": 0.7534664800915915, "ewc_loss": 0.04324638843536377, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00016512992442585528, "grad_norm": 5.11566686630249, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8720167875289917, "num_tokens": 226063255.0, "step": 5923 }, { "epoch": 0.7535936903701819, "ewc_loss": 0.0448903851211071, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00018156986334361136, "grad_norm": 5.6558942794799805, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.847614586353302, "num_tokens": 226105538.0, "step": 5924 }, { "epoch": 0.7537209006487724, "ewc_loss": 0.044450148940086365, "ewc_loss_diag": 2.658367156982422e-05, "ewc_loss_parallel": 0.00017838820349425077, "grad_norm": 5.363369941711426, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8623281717300415, "num_tokens": 226137449.0, "step": 5925 }, { "epoch": 0.7538481109273629, "ewc_loss": 0.0443585105240345, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017625112377572805, "grad_norm": 5.552022457122803, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8534897565841675, "num_tokens": 226171078.0, "step": 5926 }, { "epoch": 0.7539753212059535, "ewc_loss": 0.04451422393321991, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.0001778082805685699, "grad_norm": 5.3812785148620605, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8614753484725952, "num_tokens": 226208060.0, "step": 5927 }, { "epoch": 0.754102531484544, "ewc_loss": 0.04434064030647278, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017607241170480847, "grad_norm": 5.419219970703125, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.869819164276123, "num_tokens": 226245330.0, "step": 5928 }, { "epoch": 0.7542297417631345, "ewc_loss": 0.04445180296897888, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.0001771840179571882, "grad_norm": 5.4402174949646, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8805149793624878, "num_tokens": 226281974.0, "step": 5929 }, { "epoch": 0.7543569520417249, "ewc_loss": 0.0439433753490448, "ewc_loss_diag": 2.6345252990722656e-05, "ewc_loss_parallel": 0.0001757618592819199, "grad_norm": 5.371913433074951, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8499222993850708, "num_tokens": 226323302.0, "step": 5930 }, { "epoch": 0.7544841623203155, "ewc_loss": 0.04433802515268326, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017604629101697356, "grad_norm": 5.480063438415527, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8460922837257385, "num_tokens": 226361346.0, "step": 5931 }, { "epoch": 0.754611372598906, "ewc_loss": 0.044323816895484924, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017590417701285332, "grad_norm": 5.394452095031738, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8551624417304993, "num_tokens": 226397744.0, "step": 5932 }, { "epoch": 0.7547385828774965, "ewc_loss": 0.04426414519548416, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017530747572891414, "grad_norm": 5.420022010803223, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8668187856674194, "num_tokens": 226436745.0, "step": 5933 }, { "epoch": 0.754865793156087, "ewc_loss": 0.04404148459434509, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017552226199768484, "grad_norm": 5.3766398429870605, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8667858839035034, "num_tokens": 226470302.0, "step": 5934 }, { "epoch": 0.7549930034346776, "ewc_loss": 0.044056862592697144, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017567606118973345, "grad_norm": 5.456581115722656, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8613103628158569, "num_tokens": 226500090.0, "step": 5935 }, { "epoch": 0.755120213713268, "ewc_loss": 0.044068120419979095, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017578862025402486, "grad_norm": 5.459712028503418, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8599702715873718, "num_tokens": 226533359.0, "step": 5936 }, { "epoch": 0.7552474239918585, "ewc_loss": 0.04431027173995972, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017576874233782291, "grad_norm": 5.439211368560791, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8510197997093201, "num_tokens": 226567060.0, "step": 5937 }, { "epoch": 0.755374634270449, "ewc_loss": 0.04425875097513199, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017525351722724736, "grad_norm": 5.347075462341309, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8571565747261047, "num_tokens": 226604270.0, "step": 5938 }, { "epoch": 0.7555018445490396, "ewc_loss": 0.04430286958813667, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017569470219314098, "grad_norm": 5.37606954574585, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8574299812316895, "num_tokens": 226641310.0, "step": 5939 }, { "epoch": 0.7556290548276301, "ewc_loss": 0.04530338570475578, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00017593424126971513, "grad_norm": 35.664798736572266, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8481277227401733, "num_tokens": 226678305.0, "step": 5940 }, { "epoch": 0.7557562651062206, "ewc_loss": 0.06147448346018791, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00034741085255518556, "grad_norm": 8.2728271484375, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8667006492614746, "num_tokens": 226715628.0, "step": 5941 }, { "epoch": 0.755883475384811, "ewc_loss": 0.046770866960287094, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.0001979332882910967, "grad_norm": 4.866841793060303, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8705940246582031, "num_tokens": 226755346.0, "step": 5942 }, { "epoch": 0.7560106856634016, "ewc_loss": 0.050545480102300644, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00023812081781215966, "grad_norm": 7.2821221351623535, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.84807288646698, "num_tokens": 226792243.0, "step": 5943 }, { "epoch": 0.7561378959419921, "ewc_loss": 0.05556853115558624, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00028835132252424955, "grad_norm": 7.253495693206787, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.873332142829895, "num_tokens": 226829033.0, "step": 5944 }, { "epoch": 0.7562651062205826, "ewc_loss": 0.048222459852695465, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00021244920208118856, "grad_norm": 5.589329719543457, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8739926218986511, "num_tokens": 226870645.0, "step": 5945 }, { "epoch": 0.7563923164991732, "ewc_loss": 0.04803045466542244, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.0002129705680999905, "grad_norm": 6.442723751068115, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.864483654499054, "num_tokens": 226902395.0, "step": 5946 }, { "epoch": 0.7565195267777637, "ewc_loss": 0.049603939056396484, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00022870537941344082, "grad_norm": 6.143797874450684, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8661952018737793, "num_tokens": 226942620.0, "step": 5947 }, { "epoch": 0.7566467370563541, "ewc_loss": 0.04670718312263489, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00019973785674665123, "grad_norm": 5.855533123016357, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8664894700050354, "num_tokens": 226976472.0, "step": 5948 }, { "epoch": 0.7567739473349446, "ewc_loss": 0.04687415808439255, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.0002014075726037845, "grad_norm": 5.927578926086426, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8497565984725952, "num_tokens": 227013626.0, "step": 5949 }, { "epoch": 0.7569011576135352, "ewc_loss": 0.04651903361082077, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00019785636686719954, "grad_norm": 5.7391133308410645, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8471077680587769, "num_tokens": 227049336.0, "step": 5950 }, { "epoch": 0.7570283678921257, "ewc_loss": 0.04577957093715668, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00019046170928049833, "grad_norm": 5.6911420822143555, "learning_rate": 1e-06, "loss": 0.3915, "mean_token_accuracy": 0.882996678352356, "num_tokens": 227087371.0, "step": 5951 }, { "epoch": 0.7571555781707162, "ewc_loss": 0.04546631500124931, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00018977058061864227, "grad_norm": 5.618076801300049, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.861721396446228, "num_tokens": 227129188.0, "step": 5952 }, { "epoch": 0.7572827884493067, "ewc_loss": 0.045088447630405426, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.0001859919138951227, "grad_norm": 5.66559362411499, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8535673022270203, "num_tokens": 227161807.0, "step": 5953 }, { "epoch": 0.7574099987278972, "ewc_loss": 0.04493232071399689, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.000184430624358356, "grad_norm": 5.537432670593262, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8512386679649353, "num_tokens": 227198431.0, "step": 5954 }, { "epoch": 0.7575372090064877, "ewc_loss": 0.04453083500266075, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001828571839723736, "grad_norm": 5.540848731994629, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8790184259414673, "num_tokens": 227236046.0, "step": 5955 }, { "epoch": 0.7576644192850782, "ewc_loss": 0.044339194893836975, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00018094076949637383, "grad_norm": 5.498627662658691, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.8749610185623169, "num_tokens": 227272925.0, "step": 5956 }, { "epoch": 0.7577916295636687, "ewc_loss": 0.04428640753030777, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00018041292787529528, "grad_norm": 5.487307548522949, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8652478456497192, "num_tokens": 227312311.0, "step": 5957 }, { "epoch": 0.7579188398422593, "ewc_loss": 0.04414328932762146, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001789817470125854, "grad_norm": 5.431413650512695, "learning_rate": 1e-06, "loss": 0.3952, "mean_token_accuracy": 0.8798950910568237, "num_tokens": 227349632.0, "step": 5958 }, { "epoch": 0.7580460501208498, "ewc_loss": 0.04406866431236267, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001782354956958443, "grad_norm": 5.457040309906006, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8590602874755859, "num_tokens": 227392982.0, "step": 5959 }, { "epoch": 0.7581732603994403, "ewc_loss": 0.044038861989974976, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017793744336813688, "grad_norm": 5.4812140464782715, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8571279048919678, "num_tokens": 227431878.0, "step": 5960 }, { "epoch": 0.7583004706780307, "ewc_loss": 0.04398633539676666, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017741219198796898, "grad_norm": 5.433164596557617, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8502941131591797, "num_tokens": 227470401.0, "step": 5961 }, { "epoch": 0.7584276809566213, "ewc_loss": 0.04392373561859131, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017678616859484464, "grad_norm": 5.428037643432617, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8652978539466858, "num_tokens": 227507189.0, "step": 5962 }, { "epoch": 0.7585548912352118, "ewc_loss": 0.04391690343618393, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017671787645667791, "grad_norm": 5.383473873138428, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8640229105949402, "num_tokens": 227548070.0, "step": 5963 }, { "epoch": 0.7586821015138023, "ewc_loss": 0.043882180005311966, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001763706241035834, "grad_norm": 5.39296293258667, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8649587631225586, "num_tokens": 227584360.0, "step": 5964 }, { "epoch": 0.7588093117923929, "ewc_loss": 0.043906114995479584, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017660995945334435, "grad_norm": 5.436702251434326, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8659406900405884, "num_tokens": 227618961.0, "step": 5965 }, { "epoch": 0.7589365220709834, "ewc_loss": 0.04388781636953354, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017642701277509332, "grad_norm": 5.405133247375488, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8615624308586121, "num_tokens": 227658304.0, "step": 5966 }, { "epoch": 0.7590637323495738, "ewc_loss": 0.04388967156410217, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001764455228112638, "grad_norm": 5.397352695465088, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8581905961036682, "num_tokens": 227701526.0, "step": 5967 }, { "epoch": 0.7591909426281643, "ewc_loss": 0.04388780891895294, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001764268963597715, "grad_norm": 5.3400139808654785, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8651460409164429, "num_tokens": 227744930.0, "step": 5968 }, { "epoch": 0.7593181529067549, "ewc_loss": 0.043876536190509796, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017631416267249733, "grad_norm": 5.423717021942139, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8715227842330933, "num_tokens": 227780607.0, "step": 5969 }, { "epoch": 0.7594453631853454, "ewc_loss": 0.04391907528042793, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001767395733622834, "grad_norm": 5.358969688415527, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.857428789138794, "num_tokens": 227825742.0, "step": 5970 }, { "epoch": 0.7595725734639359, "ewc_loss": 0.04387080669403076, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017625688633415848, "grad_norm": 5.425203800201416, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8406152725219727, "num_tokens": 227865278.0, "step": 5971 }, { "epoch": 0.7596997837425264, "ewc_loss": 0.04393213987350464, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017687020590528846, "grad_norm": 5.393163681030273, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8568609356880188, "num_tokens": 227906912.0, "step": 5972 }, { "epoch": 0.7598269940211169, "ewc_loss": 0.04387431591749191, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017629201465751976, "grad_norm": 5.35884428024292, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8577263355255127, "num_tokens": 227947067.0, "step": 5973 }, { "epoch": 0.7599542042997074, "ewc_loss": 0.04392837733030319, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001768325746525079, "grad_norm": 5.412703514099121, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8422710299491882, "num_tokens": 227988563.0, "step": 5974 }, { "epoch": 0.7600814145782979, "ewc_loss": 0.04393633082509041, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017691212997306138, "grad_norm": 5.408658504486084, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8540072441101074, "num_tokens": 228028556.0, "step": 5975 }, { "epoch": 0.7602086248568884, "ewc_loss": 0.04392106086015701, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001767594221746549, "grad_norm": 5.370810031890869, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8750705718994141, "num_tokens": 228066265.0, "step": 5976 }, { "epoch": 0.760335835135479, "ewc_loss": 0.04389573633670807, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017650617519393563, "grad_norm": 5.393884181976318, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.857398509979248, "num_tokens": 228103732.0, "step": 5977 }, { "epoch": 0.7604630454140695, "ewc_loss": 0.04390573501586914, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001766061905073002, "grad_norm": 5.340404510498047, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8576627969741821, "num_tokens": 228149609.0, "step": 5978 }, { "epoch": 0.7605902556926599, "ewc_loss": 0.04394001513719559, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017694896087050438, "grad_norm": 5.42614221572876, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8637727499008179, "num_tokens": 228188462.0, "step": 5979 }, { "epoch": 0.7607174659712505, "ewc_loss": 0.04419826716184616, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017709008534438908, "grad_norm": 5.377688407897949, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8740683197975159, "num_tokens": 228229252.0, "step": 5980 }, { "epoch": 0.760844676249841, "ewc_loss": 0.04419146850705147, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017702209879644215, "grad_norm": 5.424798011779785, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8568211197853088, "num_tokens": 228267397.0, "step": 5981 }, { "epoch": 0.7609718865284315, "ewc_loss": 0.043974436819553375, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017729320097714663, "grad_norm": 5.371132850646973, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8678514957427979, "num_tokens": 228307561.0, "step": 5982 }, { "epoch": 0.761099096807022, "ewc_loss": 0.04418395459651947, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.0001769469672581181, "grad_norm": 5.426375389099121, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8618342876434326, "num_tokens": 228347667.0, "step": 5983 }, { "epoch": 0.7612263070856126, "ewc_loss": 0.043967120349407196, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.0001772200339473784, "grad_norm": 5.417262554168701, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8575104475021362, "num_tokens": 228386820.0, "step": 5984 }, { "epoch": 0.761353517364203, "ewc_loss": 0.04390868544578552, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017663570179138333, "grad_norm": 5.369978427886963, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8697509765625, "num_tokens": 228427304.0, "step": 5985 }, { "epoch": 0.7614807276427935, "ewc_loss": 0.04395926743745804, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017714148270897567, "grad_norm": 5.420481204986572, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8507725596427917, "num_tokens": 228470779.0, "step": 5986 }, { "epoch": 0.761607937921384, "ewc_loss": 0.04395586997270584, "ewc_loss_diag": 2.6226043701171875e-05, "ewc_loss_parallel": 0.00017710751853883266, "grad_norm": 5.4141716957092285, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8762072324752808, "num_tokens": 228500454.0, "step": 5987 }, { "epoch": 0.7617351481999746, "ewc_loss": 0.04422065615653992, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.0001773139665601775, "grad_norm": 5.418532371520996, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8673453330993652, "num_tokens": 228537374.0, "step": 5988 }, { "epoch": 0.7618623584785651, "ewc_loss": 0.04421476647257805, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017725508951116353, "grad_norm": 5.35638427734375, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8608438968658447, "num_tokens": 228577215.0, "step": 5989 }, { "epoch": 0.7619895687571556, "ewc_loss": 0.044250134378671646, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017760875925887376, "grad_norm": 5.481637954711914, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8646657466888428, "num_tokens": 228611693.0, "step": 5990 }, { "epoch": 0.762116779035746, "ewc_loss": 0.04427182674407959, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017782569921109825, "grad_norm": 5.430298328399658, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8601731061935425, "num_tokens": 228644231.0, "step": 5991 }, { "epoch": 0.7622439893143366, "ewc_loss": 0.044189453125, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017700194439385086, "grad_norm": 5.347533702850342, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8715860843658447, "num_tokens": 228684776.0, "step": 5992 }, { "epoch": 0.7623711995929271, "ewc_loss": 0.04421914741396904, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017729889077600092, "grad_norm": 5.397524356842041, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8668322563171387, "num_tokens": 228724214.0, "step": 5993 }, { "epoch": 0.7624984098715176, "ewc_loss": 0.04427418112754822, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017784922965802252, "grad_norm": 5.381252288818359, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8603923320770264, "num_tokens": 228765098.0, "step": 5994 }, { "epoch": 0.7626256201501082, "ewc_loss": 0.0441989004611969, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017709641542751342, "grad_norm": 5.388706684112549, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8614016771316528, "num_tokens": 228802423.0, "step": 5995 }, { "epoch": 0.7627528304286987, "ewc_loss": 0.04424645006656647, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017757191380951554, "grad_norm": 5.3940253257751465, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8511521220207214, "num_tokens": 228840218.0, "step": 5996 }, { "epoch": 0.7628800407072891, "ewc_loss": 0.04447629302740097, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.0001774289266904816, "grad_norm": 5.357305526733398, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8660330772399902, "num_tokens": 228882810.0, "step": 5997 }, { "epoch": 0.7630072509858796, "ewc_loss": 0.0443742536008358, "ewc_loss_diag": 2.658367156982422e-05, "ewc_loss_parallel": 0.0001776292483555153, "grad_norm": 5.423760414123535, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8674445152282715, "num_tokens": 228917664.0, "step": 5998 }, { "epoch": 0.7631344612644702, "ewc_loss": 0.04448753967881203, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017754141299519688, "grad_norm": 10.39104175567627, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8676134347915649, "num_tokens": 228951017.0, "step": 5999 }, { "epoch": 0.7632616715430607, "ewc_loss": 0.05029598996043205, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.0002356259064981714, "grad_norm": 6.26575231552124, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8624069690704346, "num_tokens": 228990949.0, "step": 6000 }, { "epoch": 0.7633888818216512, "ewc_loss": 0.04323519021272659, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.0001650179037824273, "grad_norm": 4.9493584632873535, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8660388588905334, "num_tokens": 229026604.0, "step": 6001 }, { "epoch": 0.7635160921002417, "ewc_loss": 0.04561851918697357, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00018885120516642928, "grad_norm": 5.906740188598633, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8557184934616089, "num_tokens": 229061810.0, "step": 6002 }, { "epoch": 0.7636433023788322, "ewc_loss": 0.045333512127399445, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00018600111070554703, "grad_norm": 5.292059898376465, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8381826877593994, "num_tokens": 229107765.0, "step": 6003 }, { "epoch": 0.7637705126574227, "ewc_loss": 0.04491811990737915, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00018184723739977926, "grad_norm": 5.657134532928467, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8483541011810303, "num_tokens": 229147822.0, "step": 6004 }, { "epoch": 0.7638977229360132, "ewc_loss": 0.04520750790834427, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.0001847410894697532, "grad_norm": 5.474246501922607, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8701834678649902, "num_tokens": 229184421.0, "step": 6005 }, { "epoch": 0.7640249332146037, "ewc_loss": 0.044785015285015106, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.0001805161446100101, "grad_norm": 5.513121128082275, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8615598678588867, "num_tokens": 229221737.0, "step": 6006 }, { "epoch": 0.7641521434931943, "ewc_loss": 0.04496035724878311, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00018226957763545215, "grad_norm": 5.498192310333252, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.855167031288147, "num_tokens": 229254727.0, "step": 6007 }, { "epoch": 0.7642793537717848, "ewc_loss": 0.04471973329782486, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017986336024478078, "grad_norm": 5.487563610076904, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8708357214927673, "num_tokens": 229292648.0, "step": 6008 }, { "epoch": 0.7644065640503753, "ewc_loss": 0.044744379818439484, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00018010979692917317, "grad_norm": 5.385447978973389, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8655160665512085, "num_tokens": 229335708.0, "step": 6009 }, { "epoch": 0.7645337743289657, "ewc_loss": 0.04463627189397812, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017902874969877303, "grad_norm": 5.447958946228027, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8694843053817749, "num_tokens": 229373791.0, "step": 6010 }, { "epoch": 0.7646609846075563, "ewc_loss": 0.044699523597955704, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.000179661248694174, "grad_norm": 5.4113569259643555, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8705077171325684, "num_tokens": 229409891.0, "step": 6011 }, { "epoch": 0.7647881948861468, "ewc_loss": 0.04434772580862045, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.00017858468345366418, "grad_norm": 5.399815559387207, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8633086681365967, "num_tokens": 229449859.0, "step": 6012 }, { "epoch": 0.7649154051647373, "ewc_loss": 0.044544778764247894, "ewc_loss_diag": 2.658367156982422e-05, "ewc_loss_parallel": 0.00017933449998963624, "grad_norm": 5.467471122741699, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8686901330947876, "num_tokens": 229484224.0, "step": 6013 }, { "epoch": 0.7650426154433279, "ewc_loss": 0.04448702931404114, "ewc_loss_diag": 2.658367156982422e-05, "ewc_loss_parallel": 0.0001787570072337985, "grad_norm": 5.454843997955322, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8563119173049927, "num_tokens": 229521104.0, "step": 6014 }, { "epoch": 0.7651698257219184, "ewc_loss": 0.04460529237985611, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017871895397547632, "grad_norm": 5.454653263092041, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8454166054725647, "num_tokens": 229558918.0, "step": 6015 }, { "epoch": 0.7652970360005088, "ewc_loss": 0.04432845115661621, "ewc_loss_diag": 2.6464462280273438e-05, "ewc_loss_parallel": 0.0001783919578883797, "grad_norm": 5.470131874084473, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8469700813293457, "num_tokens": 229596931.0, "step": 6016 }, { "epoch": 0.7654242462790993, "ewc_loss": 0.04450325667858124, "ewc_loss_diag": 2.658367156982422e-05, "ewc_loss_parallel": 0.0001789192610885948, "grad_norm": 5.454064846038818, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8457543253898621, "num_tokens": 229633903.0, "step": 6017 }, { "epoch": 0.7655514565576899, "ewc_loss": 0.044549670070409775, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.000178162707015872, "grad_norm": 5.450572967529297, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8579185605049133, "num_tokens": 229674224.0, "step": 6018 }, { "epoch": 0.7656786668362804, "ewc_loss": 0.04454897344112396, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017815576575230807, "grad_norm": 5.508044719696045, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8594463467597961, "num_tokens": 229710547.0, "step": 6019 }, { "epoch": 0.7658058771148709, "ewc_loss": 0.04453567415475845, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.0001780227612471208, "grad_norm": 5.3888349533081055, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.875901460647583, "num_tokens": 229752020.0, "step": 6020 }, { "epoch": 0.7659330873934614, "ewc_loss": 0.04449052736163139, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017757128807716072, "grad_norm": 5.525230407714844, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8586064577102661, "num_tokens": 229785710.0, "step": 6021 }, { "epoch": 0.7660602976720519, "ewc_loss": 0.04455041140317917, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017817011394072324, "grad_norm": 5.44395112991333, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8677533268928528, "num_tokens": 229824118.0, "step": 6022 }, { "epoch": 0.7661875079506424, "ewc_loss": 0.04446067661046982, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017727275553625077, "grad_norm": 5.466490745544434, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8731924295425415, "num_tokens": 229854231.0, "step": 6023 }, { "epoch": 0.7663147182292329, "ewc_loss": 0.04449281096458435, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017759413458406925, "grad_norm": 5.433872222900391, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8518388867378235, "num_tokens": 229888102.0, "step": 6024 }, { "epoch": 0.7664419285078234, "ewc_loss": 0.04447273537516594, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017739336180966347, "grad_norm": 5.443609237670898, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8654928207397461, "num_tokens": 229927634.0, "step": 6025 }, { "epoch": 0.766569138786414, "ewc_loss": 0.04446987062692642, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017736470908857882, "grad_norm": 5.356213569641113, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8692235946655273, "num_tokens": 229966526.0, "step": 6026 }, { "epoch": 0.7666963490650045, "ewc_loss": 0.04448432847857475, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.00017750929691828787, "grad_norm": 5.46946382522583, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8707218170166016, "num_tokens": 230002761.0, "step": 6027 }, { "epoch": 0.7668235593435949, "ewc_loss": 0.044639356434345245, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017783888324629515, "grad_norm": 5.439681053161621, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8661693334579468, "num_tokens": 230041303.0, "step": 6028 }, { "epoch": 0.7669507696221854, "ewc_loss": 0.044456325471401215, "ewc_loss_diag": 2.6702880859375e-05, "ewc_loss_parallel": 0.0001772292744135484, "grad_norm": 5.370547294616699, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8656575679779053, "num_tokens": 230081117.0, "step": 6029 }, { "epoch": 0.767077979900776, "ewc_loss": 0.04471622407436371, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017738682799972594, "grad_norm": 5.481484889984131, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8555341958999634, "num_tokens": 230119411.0, "step": 6030 }, { "epoch": 0.7672051901793665, "ewc_loss": 0.04462330415844917, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001776783465174958, "grad_norm": 5.407960414886475, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8682456612586975, "num_tokens": 230161111.0, "step": 6031 }, { "epoch": 0.767332400457957, "ewc_loss": 0.044565677642822266, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017710207612253726, "grad_norm": 5.446821212768555, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8566292524337769, "num_tokens": 230196999.0, "step": 6032 }, { "epoch": 0.7674596107365476, "ewc_loss": 0.044611550867557526, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017756079614628106, "grad_norm": 5.451138019561768, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8625222444534302, "num_tokens": 230231495.0, "step": 6033 }, { "epoch": 0.767586821015138, "ewc_loss": 0.04457869380712509, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017723222845233977, "grad_norm": 5.473782062530518, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8543375730514526, "num_tokens": 230267732.0, "step": 6034 }, { "epoch": 0.7677140312937285, "ewc_loss": 0.044544100761413574, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017688634397927672, "grad_norm": 5.440272331237793, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8604896664619446, "num_tokens": 230297559.0, "step": 6035 }, { "epoch": 0.767841241572319, "ewc_loss": 0.04459548369050026, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001774001430021599, "grad_norm": 5.45125150680542, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8491160869598389, "num_tokens": 230336396.0, "step": 6036 }, { "epoch": 0.7679684518509096, "ewc_loss": 0.044546034187078476, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017690565437078476, "grad_norm": 5.441118240356445, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8611928224563599, "num_tokens": 230368152.0, "step": 6037 }, { "epoch": 0.7680956621295001, "ewc_loss": 0.04459039494395256, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001773492549546063, "grad_norm": 5.379783630371094, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8674084544181824, "num_tokens": 230408081.0, "step": 6038 }, { "epoch": 0.7682228724080906, "ewc_loss": 0.04459580034017563, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001774033298715949, "grad_norm": 5.520410537719727, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8544490337371826, "num_tokens": 230441691.0, "step": 6039 }, { "epoch": 0.768350082686681, "ewc_loss": 0.044619105756282806, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017763639334589243, "grad_norm": 5.350102424621582, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8593626022338867, "num_tokens": 230480089.0, "step": 6040 }, { "epoch": 0.7684772929652716, "ewc_loss": 0.04462561756372452, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017770146951079369, "grad_norm": 5.42490291595459, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.855918288230896, "num_tokens": 230518655.0, "step": 6041 }, { "epoch": 0.7686045032438621, "ewc_loss": 0.04469241574406624, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017836946062743664, "grad_norm": 5.433741569519043, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8646745681762695, "num_tokens": 230555717.0, "step": 6042 }, { "epoch": 0.7687317135224526, "ewc_loss": 0.04466778412461281, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001781231549102813, "grad_norm": 5.391921043395996, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8695150017738342, "num_tokens": 230596699.0, "step": 6043 }, { "epoch": 0.7688589238010431, "ewc_loss": 0.04461544752120972, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017759978072717786, "grad_norm": 5.3487138748168945, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8696234226226807, "num_tokens": 230633284.0, "step": 6044 }, { "epoch": 0.7689861340796337, "ewc_loss": 0.044672757387161255, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017817289335653186, "grad_norm": 5.426449775695801, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8648780584335327, "num_tokens": 230673133.0, "step": 6045 }, { "epoch": 0.7691133443582241, "ewc_loss": 0.04469538852572441, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001783991901902482, "grad_norm": 5.3878068923950195, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.862163245677948, "num_tokens": 230719003.0, "step": 6046 }, { "epoch": 0.7692405546368146, "ewc_loss": 0.044638097286224365, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017782626673579216, "grad_norm": 5.386951923370361, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8708288669586182, "num_tokens": 230754805.0, "step": 6047 }, { "epoch": 0.7693677649154052, "ewc_loss": 0.04466357082128525, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017808101256377995, "grad_norm": 5.443264484405518, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8479865789413452, "num_tokens": 230793249.0, "step": 6048 }, { "epoch": 0.7694949751939957, "ewc_loss": 0.04464457184076309, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017789103731047362, "grad_norm": 5.408101558685303, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.87800133228302, "num_tokens": 230826025.0, "step": 6049 }, { "epoch": 0.7696221854725862, "ewc_loss": 0.044629331678152084, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017773863510228693, "grad_norm": 5.391885280609131, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8730236291885376, "num_tokens": 230866615.0, "step": 6050 }, { "epoch": 0.7697493957511767, "ewc_loss": 0.044722363352775574, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017744823708198965, "grad_norm": 5.460483551025391, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8456043601036072, "num_tokens": 230901587.0, "step": 6051 }, { "epoch": 0.7698766060297672, "ewc_loss": 0.04463794827461243, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001778247969923541, "grad_norm": 5.342308521270752, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8670564889907837, "num_tokens": 230944267.0, "step": 6052 }, { "epoch": 0.7700038163083577, "ewc_loss": 0.044612444937229156, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017756977467797697, "grad_norm": 5.471498489379883, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8529666662216187, "num_tokens": 230980022.0, "step": 6053 }, { "epoch": 0.7701310265869482, "ewc_loss": 0.04465656727552414, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017801098874770105, "grad_norm": 5.375787734985352, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8680803775787354, "num_tokens": 231018110.0, "step": 6054 }, { "epoch": 0.7702582368655387, "ewc_loss": 0.044580310583114624, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017724843928590417, "grad_norm": 5.900983810424805, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8666321635246277, "num_tokens": 231056531.0, "step": 6055 }, { "epoch": 0.7703854471441293, "ewc_loss": 0.0448233038187027, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001796783326426521, "grad_norm": 5.309347152709961, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8667192459106445, "num_tokens": 231096270.0, "step": 6056 }, { "epoch": 0.7705126574227198, "ewc_loss": 0.04447963461279869, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017624165047891438, "grad_norm": 5.469968795776367, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8553699254989624, "num_tokens": 231136300.0, "step": 6057 }, { "epoch": 0.7706398677013102, "ewc_loss": 0.04468381404876709, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017828342970460653, "grad_norm": 5.391237735748291, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8485825657844543, "num_tokens": 231169005.0, "step": 6058 }, { "epoch": 0.7707670779799007, "ewc_loss": 0.04461188241839409, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017756412853486836, "grad_norm": 5.445553302764893, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8681677579879761, "num_tokens": 231205274.0, "step": 6059 }, { "epoch": 0.7708942882584913, "ewc_loss": 0.04461494833230972, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017759480397216976, "grad_norm": 5.428703784942627, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8668302893638611, "num_tokens": 231239022.0, "step": 6060 }, { "epoch": 0.7710214985370818, "ewc_loss": 0.04466778784990311, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017812319856602699, "grad_norm": 5.393129825592041, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8434464335441589, "num_tokens": 231282952.0, "step": 6061 }, { "epoch": 0.7711487088156723, "ewc_loss": 0.04467139020562172, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001781592145562172, "grad_norm": 5.428452491760254, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8595830202102661, "num_tokens": 231322980.0, "step": 6062 }, { "epoch": 0.7712759190942629, "ewc_loss": 0.04466977342963219, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017814304737839848, "grad_norm": 5.471130847930908, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8538913726806641, "num_tokens": 231358382.0, "step": 6063 }, { "epoch": 0.7714031293728534, "ewc_loss": 0.04462188482284546, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017766417295206338, "grad_norm": 5.3451337814331055, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8694890141487122, "num_tokens": 231401915.0, "step": 6064 }, { "epoch": 0.7715303396514438, "ewc_loss": 0.044617511332035065, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017762044444680214, "grad_norm": 5.483945369720459, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8390681147575378, "num_tokens": 231433241.0, "step": 6065 }, { "epoch": 0.7716575499300343, "ewc_loss": 0.04478992521762848, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.0001781238679541275, "grad_norm": 5.366729259490967, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8566620349884033, "num_tokens": 231478318.0, "step": 6066 }, { "epoch": 0.7717847602086249, "ewc_loss": 0.04461561143398285, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001776014396455139, "grad_norm": 5.415024280548096, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8532522320747375, "num_tokens": 231518584.0, "step": 6067 }, { "epoch": 0.7719119704872154, "ewc_loss": 0.04465765878558159, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017802190268412232, "grad_norm": 5.4157915115356445, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8608651161193848, "num_tokens": 231556689.0, "step": 6068 }, { "epoch": 0.7720391807658059, "ewc_loss": 0.04463762789964676, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001778215664671734, "grad_norm": 5.411563873291016, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8605012893676758, "num_tokens": 231593936.0, "step": 6069 }, { "epoch": 0.7721663910443964, "ewc_loss": 0.04466376453638077, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001780829334165901, "grad_norm": 5.387026309967041, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8802981972694397, "num_tokens": 231634785.0, "step": 6070 }, { "epoch": 0.7722936013229869, "ewc_loss": 0.04467279836535454, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017817330081015825, "grad_norm": 5.408692836761475, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8571714162826538, "num_tokens": 231675798.0, "step": 6071 }, { "epoch": 0.7724208116015774, "ewc_loss": 0.04472839832305908, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017872927128337324, "grad_norm": 5.437085151672363, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8583348393440247, "num_tokens": 231711611.0, "step": 6072 }, { "epoch": 0.7725480218801679, "ewc_loss": 0.044705405831336975, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017849936557468027, "grad_norm": 5.448127746582031, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8508433103561401, "num_tokens": 231746997.0, "step": 6073 }, { "epoch": 0.7726752321587584, "ewc_loss": 0.04470586031675339, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001785038912203163, "grad_norm": 5.43234395980835, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8412728905677795, "num_tokens": 231787696.0, "step": 6074 }, { "epoch": 0.772802442437349, "ewc_loss": 0.04481250047683716, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017834959726314992, "grad_norm": 5.475065231323242, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8529902696609497, "num_tokens": 231826127.0, "step": 6075 }, { "epoch": 0.7729296527159395, "ewc_loss": 0.04468866437673569, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017833197489380836, "grad_norm": 5.463896751403809, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8552391529083252, "num_tokens": 231857538.0, "step": 6076 }, { "epoch": 0.7730568629945299, "ewc_loss": 0.044809356331825256, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.0001783181942300871, "grad_norm": 5.3974809646606445, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8595410585403442, "num_tokens": 231893913.0, "step": 6077 }, { "epoch": 0.7731840732731204, "ewc_loss": 0.04467080160975456, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017815333558246493, "grad_norm": 5.390204429626465, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8633565902709961, "num_tokens": 231934274.0, "step": 6078 }, { "epoch": 0.773311283551711, "ewc_loss": 0.04469608515501022, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017840614600572735, "grad_norm": 5.3644232749938965, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8549302220344543, "num_tokens": 231983197.0, "step": 6079 }, { "epoch": 0.7734384938303015, "ewc_loss": 0.04469135031104088, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001783588231774047, "grad_norm": 5.450717926025391, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8562653064727783, "num_tokens": 232021265.0, "step": 6080 }, { "epoch": 0.773565704108892, "ewc_loss": 0.04483191668987274, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.0001785437489161268, "grad_norm": 5.366186618804932, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8589126467704773, "num_tokens": 232060992.0, "step": 6081 }, { "epoch": 0.7736929143874826, "ewc_loss": 0.04472194239497185, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017866473353933543, "grad_norm": 5.449746608734131, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8573398590087891, "num_tokens": 232096758.0, "step": 6082 }, { "epoch": 0.773820124666073, "ewc_loss": 0.04483805596828461, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017860517255030572, "grad_norm": 5.39227819442749, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8573665618896484, "num_tokens": 232133773.0, "step": 6083 }, { "epoch": 0.7739473349446635, "ewc_loss": 0.04498114436864853, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017881534586194903, "grad_norm": 5.4176177978515625, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8585608005523682, "num_tokens": 232173182.0, "step": 6084 }, { "epoch": 0.774074545223254, "ewc_loss": 0.044838711619377136, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017861173546407372, "grad_norm": 5.4147748947143555, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8684800267219543, "num_tokens": 232206478.0, "step": 6085 }, { "epoch": 0.7742017555018446, "ewc_loss": 0.044842369854450226, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017864828987512738, "grad_norm": 5.526488304138184, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8549115657806396, "num_tokens": 232242231.0, "step": 6086 }, { "epoch": 0.7743289657804351, "ewc_loss": 0.04476670175790787, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001791123067960143, "grad_norm": 5.456665515899658, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8636986017227173, "num_tokens": 232270636.0, "step": 6087 }, { "epoch": 0.7744561760590256, "ewc_loss": 0.04469811171293259, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017842641682364047, "grad_norm": 5.519876480102539, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.865932047367096, "num_tokens": 232309974.0, "step": 6088 }, { "epoch": 0.774583386337616, "ewc_loss": 0.0446675568819046, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017812085570767522, "grad_norm": 5.447566509246826, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8698530793190002, "num_tokens": 232344118.0, "step": 6089 }, { "epoch": 0.7747105966162066, "ewc_loss": 0.04464246332645416, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017786996613722295, "grad_norm": 5.4882707595825195, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8505831956863403, "num_tokens": 232385183.0, "step": 6090 }, { "epoch": 0.7748378068947971, "ewc_loss": 0.04460059478878975, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017745126388035715, "grad_norm": 5.390986919403076, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8614943027496338, "num_tokens": 232421670.0, "step": 6091 }, { "epoch": 0.7749650171733876, "ewc_loss": 0.04472426325082779, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.0001774672418832779, "grad_norm": 5.573211193084717, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8496794700622559, "num_tokens": 232459563.0, "step": 6092 }, { "epoch": 0.7750922274519781, "ewc_loss": 0.04463512450456619, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001777965808287263, "grad_norm": 5.365767955780029, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8600544333457947, "num_tokens": 232496357.0, "step": 6093 }, { "epoch": 0.7752194377305687, "ewc_loss": 0.04460115730762482, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017745686636772007, "grad_norm": 5.672420501708984, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8611338138580322, "num_tokens": 232530436.0, "step": 6094 }, { "epoch": 0.7753466480091591, "ewc_loss": 0.04483014717698097, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017852608289103955, "grad_norm": 5.391371726989746, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8545116186141968, "num_tokens": 232568585.0, "step": 6095 }, { "epoch": 0.7754738582877496, "ewc_loss": 0.04456242173910141, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017706950893625617, "grad_norm": 5.629908084869385, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8678675889968872, "num_tokens": 232602506.0, "step": 6096 }, { "epoch": 0.7756010685663401, "ewc_loss": 0.04489975422620773, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001780014717951417, "grad_norm": 5.360581874847412, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8686577677726746, "num_tokens": 232642819.0, "step": 6097 }, { "epoch": 0.7757282788449307, "ewc_loss": 0.04478178173303604, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017682171892374754, "grad_norm": 5.525411128997803, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8472837209701538, "num_tokens": 232679505.0, "step": 6098 }, { "epoch": 0.7758554891235212, "ewc_loss": 0.044766414910554886, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017788875265978277, "grad_norm": 5.417880535125732, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8658685684204102, "num_tokens": 232715701.0, "step": 6099 }, { "epoch": 0.7759826994021117, "ewc_loss": 0.044722385704517365, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017744844080880284, "grad_norm": 5.478199481964111, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8618758320808411, "num_tokens": 232750786.0, "step": 6100 }, { "epoch": 0.7761099096807021, "ewc_loss": 0.044767022132873535, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017789483536034822, "grad_norm": 5.448680400848389, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8499864935874939, "num_tokens": 232791370.0, "step": 6101 }, { "epoch": 0.7762371199592927, "ewc_loss": 0.04469829052686691, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017720750474836677, "grad_norm": 5.361760139465332, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8632358312606812, "num_tokens": 232830895.0, "step": 6102 }, { "epoch": 0.7763643302378832, "ewc_loss": 0.04490838944911957, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017808782286010683, "grad_norm": 5.525545597076416, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8596347570419312, "num_tokens": 232869940.0, "step": 6103 }, { "epoch": 0.7764915405164737, "ewc_loss": 0.04475129395723343, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017773752915672958, "grad_norm": 5.3994574546813965, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8553557395935059, "num_tokens": 232910743.0, "step": 6104 }, { "epoch": 0.7766187507950643, "ewc_loss": 0.044724561274051666, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017747019592206925, "grad_norm": 5.449905872344971, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8596438765525818, "num_tokens": 232948428.0, "step": 6105 }, { "epoch": 0.7767459610736548, "ewc_loss": 0.04462239891290665, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.000177669309778139, "grad_norm": 5.424288272857666, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8667254447937012, "num_tokens": 232985142.0, "step": 6106 }, { "epoch": 0.7768731713522452, "ewc_loss": 0.04460415989160538, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017748693062458187, "grad_norm": 5.4326677322387695, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8708046078681946, "num_tokens": 233020810.0, "step": 6107 }, { "epoch": 0.7770003816308357, "ewc_loss": 0.04470314830541611, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001784768101060763, "grad_norm": 5.4441237449646, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8528039455413818, "num_tokens": 233055674.0, "step": 6108 }, { "epoch": 0.7771275919094263, "ewc_loss": 0.044625572860240936, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017770103295333683, "grad_norm": 5.491677284240723, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8559309840202332, "num_tokens": 233084506.0, "step": 6109 }, { "epoch": 0.7772548021880168, "ewc_loss": 0.04482370615005493, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.0001784616761142388, "grad_norm": 5.428729057312012, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8775063753128052, "num_tokens": 233120172.0, "step": 6110 }, { "epoch": 0.7773820124666073, "ewc_loss": 0.04474146291613579, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.0001776392455212772, "grad_norm": 5.417459487915039, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8498564958572388, "num_tokens": 233160075.0, "step": 6111 }, { "epoch": 0.7775092227451978, "ewc_loss": 0.04478895291686058, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017811414727475494, "grad_norm": 5.408236026763916, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8531856536865234, "num_tokens": 233201714.0, "step": 6112 }, { "epoch": 0.7776364330237884, "ewc_loss": 0.0447772778570652, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017799738270696253, "grad_norm": 5.445962429046631, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8663325309753418, "num_tokens": 233237642.0, "step": 6113 }, { "epoch": 0.7777636433023788, "ewc_loss": 0.04479248821735382, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017814949387684464, "grad_norm": 5.361150741577148, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8657559156417847, "num_tokens": 233279712.0, "step": 6114 }, { "epoch": 0.7778908535809693, "ewc_loss": 0.04481060802936554, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.0001783306652214378, "grad_norm": 5.37802267074585, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8656017780303955, "num_tokens": 233322441.0, "step": 6115 }, { "epoch": 0.7780180638595598, "ewc_loss": 0.044825710356235504, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017848171410150826, "grad_norm": 5.393791675567627, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8550916910171509, "num_tokens": 233371053.0, "step": 6116 }, { "epoch": 0.7781452741381504, "ewc_loss": 0.04496107995510101, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017861471860669553, "grad_norm": 5.519099235534668, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8624172210693359, "num_tokens": 233405372.0, "step": 6117 }, { "epoch": 0.7782724844167409, "ewc_loss": 0.044707041233778, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017851572192739695, "grad_norm": 5.398387432098389, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.867201030254364, "num_tokens": 233447209.0, "step": 6118 }, { "epoch": 0.7783996946953314, "ewc_loss": 0.044816866517066956, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.0001783932966645807, "grad_norm": 5.420360088348389, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8639262318611145, "num_tokens": 233484621.0, "step": 6119 }, { "epoch": 0.7785269049739219, "ewc_loss": 0.04472193494439125, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001786646607797593, "grad_norm": 5.498672008514404, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8436692357063293, "num_tokens": 233520756.0, "step": 6120 }, { "epoch": 0.7786541152525124, "ewc_loss": 0.04469939321279526, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017843926616478711, "grad_norm": 5.418628692626953, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.853091835975647, "num_tokens": 233559120.0, "step": 6121 }, { "epoch": 0.7787813255311029, "ewc_loss": 0.044686950743198395, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001783148036338389, "grad_norm": 5.5167436599731445, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.871253252029419, "num_tokens": 233590513.0, "step": 6122 }, { "epoch": 0.7789085358096934, "ewc_loss": 0.044689178466796875, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017833708261605352, "grad_norm": 5.406770706176758, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8768196105957031, "num_tokens": 233625986.0, "step": 6123 }, { "epoch": 0.779035746088284, "ewc_loss": 0.0446980819106102, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017842611123342067, "grad_norm": 5.503349304199219, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8686708807945251, "num_tokens": 233655097.0, "step": 6124 }, { "epoch": 0.7791629563668745, "ewc_loss": 0.04471375048160553, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017858283536043018, "grad_norm": 5.488882064819336, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8513622283935547, "num_tokens": 233687815.0, "step": 6125 }, { "epoch": 0.7792901666454649, "ewc_loss": 0.0446523055434227, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017796836618799716, "grad_norm": 5.465414524078369, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8738256692886353, "num_tokens": 233722611.0, "step": 6126 }, { "epoch": 0.7794173769240554, "ewc_loss": 0.044677868485450745, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017822398513089865, "grad_norm": 5.435763359069824, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8486611843109131, "num_tokens": 233765345.0, "step": 6127 }, { "epoch": 0.779544587202646, "ewc_loss": 0.04467617720365524, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001782070758054033, "grad_norm": 5.427339553833008, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8627474308013916, "num_tokens": 233804358.0, "step": 6128 }, { "epoch": 0.7796717974812365, "ewc_loss": 0.044712018221616745, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017856548947747797, "grad_norm": 5.458593845367432, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8498623967170715, "num_tokens": 233845117.0, "step": 6129 }, { "epoch": 0.779799007759827, "ewc_loss": 0.04467448592185974, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.00017819018103182316, "grad_norm": 5.461221694946289, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8537471294403076, "num_tokens": 233884091.0, "step": 6130 }, { "epoch": 0.7799262180384176, "ewc_loss": 0.044924430549144745, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017824821406975389, "grad_norm": 5.428884506225586, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.858879804611206, "num_tokens": 233922590.0, "step": 6131 }, { "epoch": 0.780053428317008, "ewc_loss": 0.044945865869522095, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017846256378106773, "grad_norm": 5.4552836418151855, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.863262414932251, "num_tokens": 233954138.0, "step": 6132 }, { "epoch": 0.7801806385955985, "ewc_loss": 0.044934988021850586, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017835380276665092, "grad_norm": 5.427530288696289, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8703664541244507, "num_tokens": 233988035.0, "step": 6133 }, { "epoch": 0.780307848874189, "ewc_loss": 0.04493039473891258, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017830786237027496, "grad_norm": 5.416545391082764, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8507306575775146, "num_tokens": 234023747.0, "step": 6134 }, { "epoch": 0.7804350591527796, "ewc_loss": 0.04499382525682449, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017894215125124902, "grad_norm": 5.418851852416992, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8692261576652527, "num_tokens": 234064182.0, "step": 6135 }, { "epoch": 0.7805622694313701, "ewc_loss": 0.04471093416213989, "ewc_loss_diag": 2.682209014892578e-05, "ewc_loss_parallel": 0.0001785546774044633, "grad_norm": 5.436394691467285, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8443731665611267, "num_tokens": 234106533.0, "step": 6136 }, { "epoch": 0.7806894797099606, "ewc_loss": 0.045004069805145264, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001790445821825415, "grad_norm": 5.427013397216797, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8727670907974243, "num_tokens": 234145492.0, "step": 6137 }, { "epoch": 0.780816689988551, "ewc_loss": 0.04493594914674759, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017836337792687118, "grad_norm": 5.46093225479126, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8633172512054443, "num_tokens": 234181709.0, "step": 6138 }, { "epoch": 0.7809439002671416, "ewc_loss": 0.04494375362992287, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017844144895207137, "grad_norm": 5.410955905914307, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8612459301948547, "num_tokens": 234217854.0, "step": 6139 }, { "epoch": 0.7810711105457321, "ewc_loss": 0.04490828886628151, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017808680422604084, "grad_norm": 5.461134910583496, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8612947463989258, "num_tokens": 234255769.0, "step": 6140 }, { "epoch": 0.7811983208243226, "ewc_loss": 0.044966407120227814, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017866797861643136, "grad_norm": 5.477619171142578, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8465059995651245, "num_tokens": 234290828.0, "step": 6141 }, { "epoch": 0.7813255311029131, "ewc_loss": 0.044917032122612, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017817421758081764, "grad_norm": 5.457819938659668, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8606874346733093, "num_tokens": 234326336.0, "step": 6142 }, { "epoch": 0.7814527413815037, "ewc_loss": 0.04498109221458435, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017881485109683126, "grad_norm": 5.456597805023193, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8563873767852783, "num_tokens": 234366210.0, "step": 6143 }, { "epoch": 0.7815799516600941, "ewc_loss": 0.044924333691596985, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017824722453951836, "grad_norm": 5.479698657989502, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8517747521400452, "num_tokens": 234398975.0, "step": 6144 }, { "epoch": 0.7817071619386846, "ewc_loss": 0.04497269541025162, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001787308428902179, "grad_norm": 5.407377243041992, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8542318940162659, "num_tokens": 234437746.0, "step": 6145 }, { "epoch": 0.7818343722172751, "ewc_loss": 0.0448184460401535, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017840907094068825, "grad_norm": 5.430233478546143, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8650494813919067, "num_tokens": 234478402.0, "step": 6146 }, { "epoch": 0.7819615824958657, "ewc_loss": 0.044953711330890656, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017854102770797908, "grad_norm": 5.479975700378418, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8451140522956848, "num_tokens": 234509699.0, "step": 6147 }, { "epoch": 0.7820887927744562, "ewc_loss": 0.04500657320022583, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017906961147673428, "grad_norm": 5.426193714141846, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8552356958389282, "num_tokens": 234551515.0, "step": 6148 }, { "epoch": 0.7822160030530467, "ewc_loss": 0.04493935406208038, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017839747306425124, "grad_norm": 5.457814693450928, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8674068450927734, "num_tokens": 234582811.0, "step": 6149 }, { "epoch": 0.7823432133316371, "ewc_loss": 0.04487675428390503, "ewc_loss_diag": 2.6941299438476562e-05, "ewc_loss_parallel": 0.00017899215163197368, "grad_norm": 5.420642375946045, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8532185554504395, "num_tokens": 234625299.0, "step": 6150 }, { "epoch": 0.7824704236102277, "ewc_loss": 0.04497494176030159, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017875332559924573, "grad_norm": 5.42265510559082, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8659282922744751, "num_tokens": 234665641.0, "step": 6151 }, { "epoch": 0.7825976338888182, "ewc_loss": 0.04500027745962143, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017900668899528682, "grad_norm": 5.459433078765869, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8590810298919678, "num_tokens": 234704668.0, "step": 6152 }, { "epoch": 0.7827248441674087, "ewc_loss": 0.04496724158525467, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017867630231194198, "grad_norm": 5.476631164550781, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8641533255577087, "num_tokens": 234738080.0, "step": 6153 }, { "epoch": 0.7828520544459993, "ewc_loss": 0.044948697090148926, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017849089636001736, "grad_norm": 5.404587745666504, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8582547903060913, "num_tokens": 234781077.0, "step": 6154 }, { "epoch": 0.7829792647245898, "ewc_loss": 0.04497316852211952, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017873558681458235, "grad_norm": 5.566082000732422, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8453536629676819, "num_tokens": 234809374.0, "step": 6155 }, { "epoch": 0.7831064750031802, "ewc_loss": 0.04497464746236801, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017875040066428483, "grad_norm": 5.393527030944824, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8487865924835205, "num_tokens": 234845585.0, "step": 6156 }, { "epoch": 0.7832336852817707, "ewc_loss": 0.044915687292814255, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017816078616306186, "grad_norm": 5.485871315002441, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8593825101852417, "num_tokens": 234883891.0, "step": 6157 }, { "epoch": 0.7833608955603613, "ewc_loss": 0.04493914544582367, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017839533393271267, "grad_norm": 5.415535926818848, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.859659731388092, "num_tokens": 234919445.0, "step": 6158 }, { "epoch": 0.7834881058389518, "ewc_loss": 0.044922828674316406, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017823220696300268, "grad_norm": 5.456328868865967, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8632247447967529, "num_tokens": 234951465.0, "step": 6159 }, { "epoch": 0.7836153161175423, "ewc_loss": 0.04492395371198654, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017824347014538944, "grad_norm": 5.431694030761719, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8526772260665894, "num_tokens": 234989970.0, "step": 6160 }, { "epoch": 0.7837425263961328, "ewc_loss": 0.044969357550144196, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017869746079668403, "grad_norm": 5.448204040527344, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8607502579689026, "num_tokens": 235029673.0, "step": 6161 }, { "epoch": 0.7838697366747234, "ewc_loss": 0.04496999830007553, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017870389274321496, "grad_norm": 5.473039150238037, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8589488863945007, "num_tokens": 235065389.0, "step": 6162 }, { "epoch": 0.7839969469533138, "ewc_loss": 0.045002296566963196, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001790268870536238, "grad_norm": 5.551872253417969, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8720786571502686, "num_tokens": 235099795.0, "step": 6163 }, { "epoch": 0.7841241572319043, "ewc_loss": 0.0449666865170002, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001786707725841552, "grad_norm": 5.387329578399658, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8613674640655518, "num_tokens": 235137475.0, "step": 6164 }, { "epoch": 0.7842513675104948, "ewc_loss": 0.044936925172805786, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017837315681390464, "grad_norm": 5.474049091339111, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8560191988945007, "num_tokens": 235178454.0, "step": 6165 }, { "epoch": 0.7843785777890854, "ewc_loss": 0.0450177937746048, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017918185039889067, "grad_norm": 5.435554027557373, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8779104351997375, "num_tokens": 235216139.0, "step": 6166 }, { "epoch": 0.7845057880676759, "ewc_loss": 0.04499095678329468, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017891349853016436, "grad_norm": 5.447107791900635, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.875397801399231, "num_tokens": 235253571.0, "step": 6167 }, { "epoch": 0.7846329983462664, "ewc_loss": 0.04496267810463905, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017863069660961628, "grad_norm": 5.424464702606201, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8434352874755859, "num_tokens": 235293950.0, "step": 6168 }, { "epoch": 0.7847602086248568, "ewc_loss": 0.045088037848472595, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017866355483420193, "grad_norm": 5.367325782775879, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8646497130393982, "num_tokens": 235336419.0, "step": 6169 }, { "epoch": 0.7848874189034474, "ewc_loss": 0.044950030744075775, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017850422591436654, "grad_norm": 5.510835647583008, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8647815585136414, "num_tokens": 235376039.0, "step": 6170 }, { "epoch": 0.7850146291820379, "ewc_loss": 0.04500141739845276, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001790180685929954, "grad_norm": 5.427159786224365, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8485520482063293, "num_tokens": 235412736.0, "step": 6171 }, { "epoch": 0.7851418394606284, "ewc_loss": 0.044911690056324005, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017812082660384476, "grad_norm": 5.574836254119873, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8610743880271912, "num_tokens": 235457318.0, "step": 6172 }, { "epoch": 0.785269049739219, "ewc_loss": 0.045038722455501556, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001781704486347735, "grad_norm": 5.437020778656006, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8492391705513, "num_tokens": 235491471.0, "step": 6173 }, { "epoch": 0.7853962600178095, "ewc_loss": 0.04487597942352295, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001777637080522254, "grad_norm": 5.43704891204834, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8709433078765869, "num_tokens": 235531324.0, "step": 6174 }, { "epoch": 0.7855234702963999, "ewc_loss": 0.044900327920913696, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.000178007161593996, "grad_norm": 5.416413307189941, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8486030697822571, "num_tokens": 235573233.0, "step": 6175 }, { "epoch": 0.7856506805749904, "ewc_loss": 0.04489096254110336, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001779135491233319, "grad_norm": 5.379428386688232, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8623473644256592, "num_tokens": 235614400.0, "step": 6176 }, { "epoch": 0.785777890853581, "ewc_loss": 0.04491950571537018, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001781989703886211, "grad_norm": 5.413689613342285, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8683264255523682, "num_tokens": 235652247.0, "step": 6177 }, { "epoch": 0.7859051011321715, "ewc_loss": 0.04494089260697365, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001784128398867324, "grad_norm": 5.4915008544921875, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8563663959503174, "num_tokens": 235681810.0, "step": 6178 }, { "epoch": 0.786032311410762, "ewc_loss": 0.04498109221458435, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017881480744108558, "grad_norm": 5.441693305969238, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8693567514419556, "num_tokens": 235718422.0, "step": 6179 }, { "epoch": 0.7861595216893525, "ewc_loss": 0.04497164115309715, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001787203218555078, "grad_norm": 5.424744606018066, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8569815754890442, "num_tokens": 235758886.0, "step": 6180 }, { "epoch": 0.786286731967943, "ewc_loss": 0.044974081218242645, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001787446963135153, "grad_norm": 5.445983409881592, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.863275945186615, "num_tokens": 235792270.0, "step": 6181 }, { "epoch": 0.7864139422465335, "ewc_loss": 0.0450400710105896, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017940464022103697, "grad_norm": 5.742368698120117, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8503783345222473, "num_tokens": 235826678.0, "step": 6182 }, { "epoch": 0.786541152525124, "ewc_loss": 0.045037999749183655, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017938391829375178, "grad_norm": 5.366292476654053, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.871675431728363, "num_tokens": 235860437.0, "step": 6183 }, { "epoch": 0.7866683628037145, "ewc_loss": 0.044922180473804474, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017822573136072606, "grad_norm": 5.400650978088379, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8659855723381042, "num_tokens": 235900865.0, "step": 6184 }, { "epoch": 0.7867955730823051, "ewc_loss": 0.045009858906269073, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001791024988051504, "grad_norm": 5.4087066650390625, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8467131853103638, "num_tokens": 235937419.0, "step": 6185 }, { "epoch": 0.7869227833608956, "ewc_loss": 0.045045528560876846, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017945919535122812, "grad_norm": 5.4752092361450195, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8510688543319702, "num_tokens": 235976425.0, "step": 6186 }, { "epoch": 0.787049993639486, "ewc_loss": 0.04502548277378082, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001792587136151269, "grad_norm": 5.520096302032471, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8548393845558167, "num_tokens": 236012968.0, "step": 6187 }, { "epoch": 0.7871772039180766, "ewc_loss": 0.045138828456401825, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001791715039871633, "grad_norm": 5.41659688949585, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8615846633911133, "num_tokens": 236052462.0, "step": 6188 }, { "epoch": 0.7873044141966671, "ewc_loss": 0.04513929411768913, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017917614604812115, "grad_norm": 5.462727069854736, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8760401010513306, "num_tokens": 236087604.0, "step": 6189 }, { "epoch": 0.7874316244752576, "ewc_loss": 0.045145511627197266, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017923832638189197, "grad_norm": 5.4001264572143555, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8595870137214661, "num_tokens": 236126344.0, "step": 6190 }, { "epoch": 0.7875588347538481, "ewc_loss": 0.04517141729593277, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017949739412870258, "grad_norm": 5.44697380065918, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8681437969207764, "num_tokens": 236161913.0, "step": 6191 }, { "epoch": 0.7876860450324387, "ewc_loss": 0.045304685831069946, "ewc_loss_diag": 2.7298927307128906e-05, "ewc_loss_parallel": 0.0001796093420125544, "grad_norm": 5.44842004776001, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.863385796546936, "num_tokens": 236200612.0, "step": 6192 }, { "epoch": 0.7878132553110291, "ewc_loss": 0.04516519606113434, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001794351846911013, "grad_norm": 5.424887180328369, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8662558794021606, "num_tokens": 236238136.0, "step": 6193 }, { "epoch": 0.7879404655896196, "ewc_loss": 0.04528306797146797, "ewc_loss_diag": 2.7298927307128906e-05, "ewc_loss_parallel": 0.00017939318786375225, "grad_norm": 5.397828578948975, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8577950596809387, "num_tokens": 236281303.0, "step": 6194 }, { "epoch": 0.7880676758682101, "ewc_loss": 0.04540733993053436, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001794151758076623, "grad_norm": 5.428805351257324, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8566866517066956, "num_tokens": 236322355.0, "step": 6195 }, { "epoch": 0.7881948861468007, "ewc_loss": 0.04513663798570633, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017914957425091416, "grad_norm": 5.452791690826416, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8590339422225952, "num_tokens": 236354263.0, "step": 6196 }, { "epoch": 0.7883220964253912, "ewc_loss": 0.04516344517469406, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001794176787370816, "grad_norm": 5.404329299926758, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8626291751861572, "num_tokens": 236391581.0, "step": 6197 }, { "epoch": 0.7884493067039817, "ewc_loss": 0.04539020359516144, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00017924385610967875, "grad_norm": 5.445096969604492, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8682399988174438, "num_tokens": 236426003.0, "step": 6198 }, { "epoch": 0.7885765169825721, "ewc_loss": 0.04518688842654228, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017965209553949535, "grad_norm": 5.458836078643799, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8702796101570129, "num_tokens": 236464874.0, "step": 6199 }, { "epoch": 0.7887037272611627, "ewc_loss": 0.04505617916584015, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017956567171495408, "grad_norm": 5.4229536056518555, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8721529245376587, "num_tokens": 236499826.0, "step": 6200 }, { "epoch": 0.7888309375397532, "ewc_loss": 0.04511871188879013, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001789703092072159, "grad_norm": 5.390308856964111, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8727478384971619, "num_tokens": 236538615.0, "step": 6201 }, { "epoch": 0.7889581478183437, "ewc_loss": 0.0452161580324173, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001799447782104835, "grad_norm": 5.480463981628418, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8586609363555908, "num_tokens": 236577860.0, "step": 6202 }, { "epoch": 0.7890853580969343, "ewc_loss": 0.045203231275081635, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017981554265134037, "grad_norm": 5.413987159729004, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8754004240036011, "num_tokens": 236619041.0, "step": 6203 }, { "epoch": 0.7892125683755248, "ewc_loss": 0.04515508562326431, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001793340634321794, "grad_norm": 5.447114944458008, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.86424320936203, "num_tokens": 236655788.0, "step": 6204 }, { "epoch": 0.7893397786541152, "ewc_loss": 0.04514914005994797, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017927458975464106, "grad_norm": 5.4518866539001465, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8726049065589905, "num_tokens": 236691532.0, "step": 6205 }, { "epoch": 0.7894669889327057, "ewc_loss": 0.04515150561928749, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017929825116880238, "grad_norm": 5.426209449768066, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8674573302268982, "num_tokens": 236733792.0, "step": 6206 }, { "epoch": 0.7895941992112963, "ewc_loss": 0.044992513954639435, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017892903997562826, "grad_norm": 5.374035835266113, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8730727434158325, "num_tokens": 236775454.0, "step": 6207 }, { "epoch": 0.7897214094898868, "ewc_loss": 0.045037806034088135, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017938193923328072, "grad_norm": 5.410731315612793, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8708337545394897, "num_tokens": 236811572.0, "step": 6208 }, { "epoch": 0.7898486197684773, "ewc_loss": 0.04503517597913742, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017935567302629352, "grad_norm": 5.436344623565674, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.8457558155059814, "num_tokens": 236851018.0, "step": 6209 }, { "epoch": 0.7899758300470678, "ewc_loss": 0.04502575099468231, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017926139116752893, "grad_norm": 5.383708477020264, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8653001189231873, "num_tokens": 236891871.0, "step": 6210 }, { "epoch": 0.7901030403256584, "ewc_loss": 0.045015208423137665, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.0001791560061974451, "grad_norm": 5.432862758636475, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8644140958786011, "num_tokens": 236926771.0, "step": 6211 }, { "epoch": 0.7902302506042488, "ewc_loss": 0.04518323391675949, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001796155411284417, "grad_norm": 5.42703104019165, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8485652208328247, "num_tokens": 236964768.0, "step": 6212 }, { "epoch": 0.7903574608828393, "ewc_loss": 0.04505230486392975, "ewc_loss_diag": 2.7060508728027344e-05, "ewc_loss_parallel": 0.00017952696362044662, "grad_norm": 5.428555011749268, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8489269614219666, "num_tokens": 237005075.0, "step": 6213 }, { "epoch": 0.7904846711614298, "ewc_loss": 0.04517408832907677, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001795240823412314, "grad_norm": 5.497128486633301, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.863038957118988, "num_tokens": 237035517.0, "step": 6214 }, { "epoch": 0.7906118814400204, "ewc_loss": 0.04519763961434364, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001797595905372873, "grad_norm": 5.363211631774902, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.877722978591919, "num_tokens": 237076319.0, "step": 6215 }, { "epoch": 0.7907390917186109, "ewc_loss": 0.045193012803792953, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017971332999877632, "grad_norm": 5.436171054840088, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8530533313751221, "num_tokens": 237117935.0, "step": 6216 }, { "epoch": 0.7908663019972014, "ewc_loss": 0.045264821499586105, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018043142335955054, "grad_norm": 5.440084457397461, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8704007267951965, "num_tokens": 237155789.0, "step": 6217 }, { "epoch": 0.7909935122757918, "ewc_loss": 0.045194052159786224, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017972372006624937, "grad_norm": 5.3764753341674805, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8667248487472534, "num_tokens": 237198870.0, "step": 6218 }, { "epoch": 0.7911207225543824, "ewc_loss": 0.045169033110141754, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017947355809155852, "grad_norm": 5.408320903778076, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8662357330322266, "num_tokens": 237243084.0, "step": 6219 }, { "epoch": 0.7912479328329729, "ewc_loss": 0.04526921361684799, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001804753119358793, "grad_norm": 5.505760669708252, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8528751134872437, "num_tokens": 237277245.0, "step": 6220 }, { "epoch": 0.7913751431115634, "ewc_loss": 0.04521577060222626, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017994089284911752, "grad_norm": 5.403131484985352, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8662220239639282, "num_tokens": 237317841.0, "step": 6221 }, { "epoch": 0.791502353390154, "ewc_loss": 0.045192621648311615, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017970941553357989, "grad_norm": 5.529728889465332, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8515141010284424, "num_tokens": 237349327.0, "step": 6222 }, { "epoch": 0.7916295636687445, "ewc_loss": 0.04518527165055275, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001796359138097614, "grad_norm": 5.368272304534912, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8467329144477844, "num_tokens": 237396179.0, "step": 6223 }, { "epoch": 0.7917567739473349, "ewc_loss": 0.045223113149404526, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018001433636527508, "grad_norm": 5.454405307769775, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.858117938041687, "num_tokens": 237437766.0, "step": 6224 }, { "epoch": 0.7918839842259254, "ewc_loss": 0.04524527117609978, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018023591837845743, "grad_norm": 5.386597156524658, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.865364670753479, "num_tokens": 237479239.0, "step": 6225 }, { "epoch": 0.792011194504516, "ewc_loss": 0.04520653933286667, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017984860460273921, "grad_norm": 5.4973649978637695, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8748953342437744, "num_tokens": 237514641.0, "step": 6226 }, { "epoch": 0.7921384047831065, "ewc_loss": 0.045239221304655075, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018017541151493788, "grad_norm": 5.391831398010254, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8718324303627014, "num_tokens": 237555319.0, "step": 6227 }, { "epoch": 0.792265615061697, "ewc_loss": 0.04521011561155319, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017988434410654008, "grad_norm": 5.451481342315674, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8637806177139282, "num_tokens": 237597786.0, "step": 6228 }, { "epoch": 0.7923928253402875, "ewc_loss": 0.04525717347860336, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.000180354923941195, "grad_norm": 5.410696029663086, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8729472160339355, "num_tokens": 237636584.0, "step": 6229 }, { "epoch": 0.792520035618878, "ewc_loss": 0.04520432651042938, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001798264420358464, "grad_norm": 5.479427814483643, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8538805246353149, "num_tokens": 237676000.0, "step": 6230 }, { "epoch": 0.7926472458974685, "ewc_loss": 0.04525116831064224, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018029488273896277, "grad_norm": 5.434219837188721, "learning_rate": 1e-06, "loss": 0.3759, "mean_token_accuracy": 0.8869735598564148, "num_tokens": 237713003.0, "step": 6231 }, { "epoch": 0.792774456176059, "ewc_loss": 0.045230377465486526, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001800869795260951, "grad_norm": 5.604521751403809, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8634095191955566, "num_tokens": 237742054.0, "step": 6232 }, { "epoch": 0.7929016664546495, "ewc_loss": 0.04526398330926895, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018042305600829422, "grad_norm": 5.435535907745361, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8529121279716492, "num_tokens": 237783147.0, "step": 6233 }, { "epoch": 0.7930288767332401, "ewc_loss": 0.04516056180000305, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017938883684109896, "grad_norm": 5.535168170928955, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8557699918746948, "num_tokens": 237814322.0, "step": 6234 }, { "epoch": 0.7931560870118306, "ewc_loss": 0.045205794274806976, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017984113947022706, "grad_norm": 5.416965484619141, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8668180704116821, "num_tokens": 237849278.0, "step": 6235 }, { "epoch": 0.793283297290421, "ewc_loss": 0.0451989583671093, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017977278912439942, "grad_norm": 5.4424920082092285, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8652950525283813, "num_tokens": 237893434.0, "step": 6236 }, { "epoch": 0.7934105075690115, "ewc_loss": 0.04526664316654205, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018044965690933168, "grad_norm": 5.428584575653076, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8692274689674377, "num_tokens": 237932135.0, "step": 6237 }, { "epoch": 0.7935377178476021, "ewc_loss": 0.04524990916252136, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018028230988420546, "grad_norm": 5.455125331878662, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8565004467964172, "num_tokens": 237966320.0, "step": 6238 }, { "epoch": 0.7936649281261926, "ewc_loss": 0.0452958345413208, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001807415537768975, "grad_norm": 5.433120250701904, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8664355874061584, "num_tokens": 238006365.0, "step": 6239 }, { "epoch": 0.7937921384047831, "ewc_loss": 0.04524478316307068, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018023101438302547, "grad_norm": 5.476863384246826, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8589321374893188, "num_tokens": 238039307.0, "step": 6240 }, { "epoch": 0.7939193486833737, "ewc_loss": 0.04531557857990265, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001809390087146312, "grad_norm": 5.458220481872559, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8467913866043091, "num_tokens": 238078015.0, "step": 6241 }, { "epoch": 0.7940465589619641, "ewc_loss": 0.04525136575102806, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018029686179943383, "grad_norm": 5.45469856262207, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8597358465194702, "num_tokens": 238116790.0, "step": 6242 }, { "epoch": 0.7941737692405546, "ewc_loss": 0.04531310498714447, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018091427045874298, "grad_norm": 5.450958251953125, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8675217628479004, "num_tokens": 238155330.0, "step": 6243 }, { "epoch": 0.7943009795191451, "ewc_loss": 0.04529880732297897, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018077126878779382, "grad_norm": 5.436720848083496, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.845329999923706, "num_tokens": 238194492.0, "step": 6244 }, { "epoch": 0.7944281897977357, "ewc_loss": 0.04554995149374008, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001808413362596184, "grad_norm": 5.4351325035095215, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8611665964126587, "num_tokens": 238230451.0, "step": 6245 }, { "epoch": 0.7945554000763262, "ewc_loss": 0.04535776749253273, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018136086873710155, "grad_norm": 5.481417655944824, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8692458868026733, "num_tokens": 238266331.0, "step": 6246 }, { "epoch": 0.7946826103549167, "ewc_loss": 0.04529627412557602, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.000180745919351466, "grad_norm": 5.516801357269287, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8504829406738281, "num_tokens": 238307753.0, "step": 6247 }, { "epoch": 0.7948098206335071, "ewc_loss": 0.0452965572476387, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018074875697493553, "grad_norm": 5.4401445388793945, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8622114062309265, "num_tokens": 238349644.0, "step": 6248 }, { "epoch": 0.7949370309120977, "ewc_loss": 0.0455203503370285, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018054532120004296, "grad_norm": 5.49078369140625, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8473264575004578, "num_tokens": 238386308.0, "step": 6249 }, { "epoch": 0.7950642411906882, "ewc_loss": 0.04528535157442093, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018063669267576188, "grad_norm": 5.438780784606934, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.8462854027748108, "num_tokens": 238427935.0, "step": 6250 }, { "epoch": 0.7951914514692787, "ewc_loss": 0.04548337683081627, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018017555703409016, "grad_norm": 5.470963954925537, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8680946826934814, "num_tokens": 238463639.0, "step": 6251 }, { "epoch": 0.7953186617478692, "ewc_loss": 0.045527130365371704, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018061307491734624, "grad_norm": 5.431087493896484, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.874677836894989, "num_tokens": 238498295.0, "step": 6252 }, { "epoch": 0.7954458720264598, "ewc_loss": 0.04529421776533127, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018072535749524832, "grad_norm": 5.4155964851379395, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8617085814476013, "num_tokens": 238538109.0, "step": 6253 }, { "epoch": 0.7955730823050502, "ewc_loss": 0.045569270849227905, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018103449838235974, "grad_norm": 9.08179759979248, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8474417924880981, "num_tokens": 238574415.0, "step": 6254 }, { "epoch": 0.7957002925836407, "ewc_loss": 0.049699634313583374, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00022233814524952322, "grad_norm": 6.008609771728516, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8634724617004395, "num_tokens": 238616368.0, "step": 6255 }, { "epoch": 0.7958275028622313, "ewc_loss": 0.04431670904159546, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00017095029761549085, "grad_norm": 5.273143768310547, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8658425807952881, "num_tokens": 238650634.0, "step": 6256 }, { "epoch": 0.7959547131408218, "ewc_loss": 0.04594302922487259, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018721350352279842, "grad_norm": 5.7215471267700195, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8559521436691284, "num_tokens": 238684383.0, "step": 6257 }, { "epoch": 0.7960819234194123, "ewc_loss": 0.04552497714757919, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018303295655641705, "grad_norm": 5.396331310272217, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.858059823513031, "num_tokens": 238722863.0, "step": 6258 }, { "epoch": 0.7962091336980028, "ewc_loss": 0.04542413353919983, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001820245524868369, "grad_norm": 5.608978271484375, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.862214207649231, "num_tokens": 238765027.0, "step": 6259 }, { "epoch": 0.7963363439765934, "ewc_loss": 0.04577690362930298, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001831108093028888, "grad_norm": 5.4408955574035645, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8658013343811035, "num_tokens": 238801950.0, "step": 6260 }, { "epoch": 0.7964635542551838, "ewc_loss": 0.0454057902097702, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018184112559538335, "grad_norm": 5.523230075836182, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8523939251899719, "num_tokens": 238840963.0, "step": 6261 }, { "epoch": 0.7965907645337743, "ewc_loss": 0.045456744730472565, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018235064635518938, "grad_norm": 5.549607276916504, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8471897840499878, "num_tokens": 238876167.0, "step": 6262 }, { "epoch": 0.7967179748123648, "ewc_loss": 0.04544106125831604, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018219382036477327, "grad_norm": 5.4741010665893555, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8666794300079346, "num_tokens": 238910293.0, "step": 6263 }, { "epoch": 0.7968451850909554, "ewc_loss": 0.04540909081697464, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018187412933912128, "grad_norm": 5.560557842254639, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8595724105834961, "num_tokens": 238947279.0, "step": 6264 }, { "epoch": 0.7969723953695459, "ewc_loss": 0.04541115090250969, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001818947057472542, "grad_norm": 5.4920759201049805, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8508991599082947, "num_tokens": 238984148.0, "step": 6265 }, { "epoch": 0.7970996056481364, "ewc_loss": 0.0453181229531765, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018096443091053516, "grad_norm": 5.487251281738281, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8488141298294067, "num_tokens": 239022370.0, "step": 6266 }, { "epoch": 0.7972268159267268, "ewc_loss": 0.045337527990341187, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001811584661481902, "grad_norm": 5.557663440704346, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8583647012710571, "num_tokens": 239064542.0, "step": 6267 }, { "epoch": 0.7973540262053174, "ewc_loss": 0.04533781483769417, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018116134742740542, "grad_norm": 5.530128479003906, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8592343330383301, "num_tokens": 239103193.0, "step": 6268 }, { "epoch": 0.7974812364839079, "ewc_loss": 0.04523566737771034, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001801398757379502, "grad_norm": 5.504278659820557, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8688819408416748, "num_tokens": 239137952.0, "step": 6269 }, { "epoch": 0.7976084467624984, "ewc_loss": 0.04526832699775696, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.0001804664934752509, "grad_norm": 5.486297130584717, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8512305021286011, "num_tokens": 239174600.0, "step": 6270 }, { "epoch": 0.797735657041089, "ewc_loss": 0.04547358304262161, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018007760809268802, "grad_norm": 5.465645790100098, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8659603595733643, "num_tokens": 239206157.0, "step": 6271 }, { "epoch": 0.7978628673196795, "ewc_loss": 0.04550496116280556, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018039140559267253, "grad_norm": 9.19333553314209, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.865424394607544, "num_tokens": 239243094.0, "step": 6272 }, { "epoch": 0.7979900775982699, "ewc_loss": 0.04974799603223801, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0002228217344963923, "grad_norm": 6.066112518310547, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8611968159675598, "num_tokens": 239281208.0, "step": 6273 }, { "epoch": 0.7981172878768604, "ewc_loss": 0.04445439204573631, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.000169885708601214, "grad_norm": 5.2243781089782715, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8570938110351562, "num_tokens": 239320159.0, "step": 6274 }, { "epoch": 0.798244498155451, "ewc_loss": 0.04618920013308525, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.000187233803444542, "grad_norm": 5.702796936035156, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8483786582946777, "num_tokens": 239360160.0, "step": 6275 }, { "epoch": 0.7983717084340415, "ewc_loss": 0.04577440768480301, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018308588187210262, "grad_norm": 5.433095455169678, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8635097146034241, "num_tokens": 239398101.0, "step": 6276 }, { "epoch": 0.798498918712632, "ewc_loss": 0.04567299783229828, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001820717880036682, "grad_norm": 5.557446479797363, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8766374588012695, "num_tokens": 239438560.0, "step": 6277 }, { "epoch": 0.7986261289912225, "ewc_loss": 0.04567651450634003, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001821069308789447, "grad_norm": 5.420475482940674, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8556530475616455, "num_tokens": 239475851.0, "step": 6278 }, { "epoch": 0.798753339269813, "ewc_loss": 0.0456947460770607, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018228923727292567, "grad_norm": 5.549553394317627, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8664063811302185, "num_tokens": 239519165.0, "step": 6279 }, { "epoch": 0.7988805495484035, "ewc_loss": 0.045496486127376556, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018274805916007608, "grad_norm": 5.461742401123047, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8485097885131836, "num_tokens": 239560602.0, "step": 6280 }, { "epoch": 0.799007759826994, "ewc_loss": 0.04539007693529129, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018168396491091698, "grad_norm": 5.571091175079346, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8615666627883911, "num_tokens": 239591933.0, "step": 6281 }, { "epoch": 0.7991349701055845, "ewc_loss": 0.045452624559402466, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018230943533126265, "grad_norm": 5.549594879150391, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.856304407119751, "num_tokens": 239625495.0, "step": 6282 }, { "epoch": 0.7992621803841751, "ewc_loss": 0.04534059762954712, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018118917068932205, "grad_norm": 5.500258445739746, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8767085671424866, "num_tokens": 239658249.0, "step": 6283 }, { "epoch": 0.7993893906627656, "ewc_loss": 0.04534223675727844, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018120557069778442, "grad_norm": 5.508588790893555, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8518857359886169, "num_tokens": 239690194.0, "step": 6284 }, { "epoch": 0.799516600941356, "ewc_loss": 0.04536479711532593, "ewc_loss_diag": 2.7179718017578125e-05, "ewc_loss_parallel": 0.00018143118359148502, "grad_norm": 5.5038743019104, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8599991798400879, "num_tokens": 239728315.0, "step": 6285 }, { "epoch": 0.7996438112199465, "ewc_loss": 0.04545135423541069, "ewc_loss_diag": 2.7298927307128906e-05, "ewc_loss_parallel": 0.00018107604410033673, "grad_norm": 5.49399471282959, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8656194806098938, "num_tokens": 239764505.0, "step": 6286 }, { "epoch": 0.7997710214985371, "ewc_loss": 0.0456806980073452, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001809280802262947, "grad_norm": 5.415008544921875, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8454500436782837, "num_tokens": 239809461.0, "step": 6287 }, { "epoch": 0.7998982317771276, "ewc_loss": 0.04573521763086319, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001814732822822407, "grad_norm": 5.541719913482666, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8548760414123535, "num_tokens": 239845232.0, "step": 6288 }, { "epoch": 0.8000254420557181, "ewc_loss": 0.04560138285160065, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018135563004761934, "grad_norm": 5.42090368270874, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8668792843818665, "num_tokens": 239885489.0, "step": 6289 }, { "epoch": 0.8001526523343087, "ewc_loss": 0.04558176547288895, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018115947023034096, "grad_norm": 5.510109901428223, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8480000495910645, "num_tokens": 239921159.0, "step": 6290 }, { "epoch": 0.8002798626128991, "ewc_loss": 0.045641861855983734, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018176039156969637, "grad_norm": 5.434602737426758, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8659806251525879, "num_tokens": 239960879.0, "step": 6291 }, { "epoch": 0.8004070728914896, "ewc_loss": 0.0455012246966362, "ewc_loss_diag": 2.7298927307128906e-05, "ewc_loss_parallel": 0.00018157472368329763, "grad_norm": 5.525777816772461, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8496506214141846, "num_tokens": 239994302.0, "step": 6292 }, { "epoch": 0.8005342831700801, "ewc_loss": 0.04560107737779617, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018135258869733661, "grad_norm": 5.447993278503418, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8683803677558899, "num_tokens": 240027759.0, "step": 6293 }, { "epoch": 0.8006614934486707, "ewc_loss": 0.04568227380514145, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018216452735941857, "grad_norm": 5.510685920715332, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8559456467628479, "num_tokens": 240065049.0, "step": 6294 }, { "epoch": 0.8007887037272612, "ewc_loss": 0.04560558497905731, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018139765597879887, "grad_norm": 5.493343830108643, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8461388349533081, "num_tokens": 240101453.0, "step": 6295 }, { "epoch": 0.8009159140058517, "ewc_loss": 0.04551301151514053, "ewc_loss_diag": 2.7298927307128906e-05, "ewc_loss_parallel": 0.0001816926378523931, "grad_norm": 5.601925373077393, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8402240872383118, "num_tokens": 240136831.0, "step": 6296 }, { "epoch": 0.8010431242844421, "ewc_loss": 0.04561518132686615, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018149361130781472, "grad_norm": 5.518702983856201, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8594555854797363, "num_tokens": 240173364.0, "step": 6297 }, { "epoch": 0.8011703345630327, "ewc_loss": 0.04551893472671509, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018053116218652576, "grad_norm": 5.52851676940918, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8392871022224426, "num_tokens": 240206718.0, "step": 6298 }, { "epoch": 0.8012975448416232, "ewc_loss": 0.04571294039487839, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001812504924600944, "grad_norm": 5.451400279998779, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8483052849769592, "num_tokens": 240245633.0, "step": 6299 }, { "epoch": 0.8014247551202137, "ewc_loss": 0.045663513243198395, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018075625121127814, "grad_norm": 5.454730033874512, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.873955249786377, "num_tokens": 240283887.0, "step": 6300 }, { "epoch": 0.8015519653988042, "ewc_loss": 0.04573989659547806, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018152008124161512, "grad_norm": 5.430882930755615, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8518949747085571, "num_tokens": 240322208.0, "step": 6301 }, { "epoch": 0.8016791756773948, "ewc_loss": 0.045715123414993286, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001812723494367674, "grad_norm": 5.484453201293945, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8644627332687378, "num_tokens": 240356154.0, "step": 6302 }, { "epoch": 0.8018063859559852, "ewc_loss": 0.04578134045004845, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018193449068348855, "grad_norm": 5.451797008514404, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8596495389938354, "num_tokens": 240389103.0, "step": 6303 }, { "epoch": 0.8019335962345757, "ewc_loss": 0.04571463167667389, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001812674308894202, "grad_norm": 5.396301746368408, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8778570890426636, "num_tokens": 240429545.0, "step": 6304 }, { "epoch": 0.8020608065131662, "ewc_loss": 0.045794859528541565, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018206966342404485, "grad_norm": 5.437434673309326, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8773494362831116, "num_tokens": 240468556.0, "step": 6305 }, { "epoch": 0.8021880167917568, "ewc_loss": 0.045866139233112335, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018156180158257484, "grad_norm": 5.431282043457031, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8624332547187805, "num_tokens": 240508534.0, "step": 6306 }, { "epoch": 0.8023152270703473, "ewc_loss": 0.0459166057407856, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018206644745077938, "grad_norm": 5.5191779136657715, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8602774143218994, "num_tokens": 240540833.0, "step": 6307 }, { "epoch": 0.8024424373489378, "ewc_loss": 0.04593926668167114, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018229307897854596, "grad_norm": 5.4780354499816895, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8524141311645508, "num_tokens": 240577768.0, "step": 6308 }, { "epoch": 0.8025696476275284, "ewc_loss": 0.045899420976638794, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001818946038838476, "grad_norm": 5.5018110275268555, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8564776182174683, "num_tokens": 240610607.0, "step": 6309 }, { "epoch": 0.8026968579061188, "ewc_loss": 0.04595497250556946, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018245013779960573, "grad_norm": 5.4976420402526855, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8578089475631714, "num_tokens": 240646056.0, "step": 6310 }, { "epoch": 0.8028240681847093, "ewc_loss": 0.04578958824276924, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018201698549091816, "grad_norm": 5.432727336883545, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8677077293395996, "num_tokens": 240682513.0, "step": 6311 }, { "epoch": 0.8029512784632998, "ewc_loss": 0.04589635133743286, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001818638847908005, "grad_norm": 5.527700901031494, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8604365587234497, "num_tokens": 240716311.0, "step": 6312 }, { "epoch": 0.8030784887418904, "ewc_loss": 0.04594990611076355, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018239946803078055, "grad_norm": 5.480258941650391, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8513734340667725, "num_tokens": 240758873.0, "step": 6313 }, { "epoch": 0.8032056990204809, "ewc_loss": 0.04579611495137215, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018208225083071738, "grad_norm": 5.484500408172607, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8655923008918762, "num_tokens": 240797163.0, "step": 6314 }, { "epoch": 0.8033329092990714, "ewc_loss": 0.045768409967422485, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018180516781285405, "grad_norm": 5.430842399597168, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8713036775588989, "num_tokens": 240835719.0, "step": 6315 }, { "epoch": 0.8034601195776618, "ewc_loss": 0.04592818021774292, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018218217883259058, "grad_norm": 5.479653358459473, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8531272411346436, "num_tokens": 240874359.0, "step": 6316 }, { "epoch": 0.8035873298562524, "ewc_loss": 0.04580166935920715, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001821377663873136, "grad_norm": 5.433834075927734, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8730639219284058, "num_tokens": 240909125.0, "step": 6317 }, { "epoch": 0.8037145401348429, "ewc_loss": 0.04590757563710213, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018197613826487213, "grad_norm": 5.43519926071167, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8641241788864136, "num_tokens": 240949391.0, "step": 6318 }, { "epoch": 0.8038417504134334, "ewc_loss": 0.04590389132499695, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018193927826359868, "grad_norm": 5.459053039550781, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8508611917495728, "num_tokens": 240987390.0, "step": 6319 }, { "epoch": 0.803968960692024, "ewc_loss": 0.04595918208360672, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001824922364903614, "grad_norm": 5.439648628234863, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8534818887710571, "num_tokens": 241025359.0, "step": 6320 }, { "epoch": 0.8040961709706145, "ewc_loss": 0.045886460691690445, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018176498997490853, "grad_norm": 5.4859514236450195, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8559461832046509, "num_tokens": 241060532.0, "step": 6321 }, { "epoch": 0.8042233812492049, "ewc_loss": 0.04583686962723732, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018248979176860303, "grad_norm": 5.395317077636719, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8664857745170593, "num_tokens": 241098547.0, "step": 6322 }, { "epoch": 0.8043505915277954, "ewc_loss": 0.045676976442337036, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001821115438360721, "grad_norm": 5.509463787078857, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8600873947143555, "num_tokens": 241131091.0, "step": 6323 }, { "epoch": 0.804477801806386, "ewc_loss": 0.04574817791581154, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001828235836001113, "grad_norm": 5.508488655090332, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8493841886520386, "num_tokens": 241164495.0, "step": 6324 }, { "epoch": 0.8046050120849765, "ewc_loss": 0.045800112187862396, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018212223949376494, "grad_norm": 5.420740604400635, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8548396825790405, "num_tokens": 241203777.0, "step": 6325 }, { "epoch": 0.804732222363567, "ewc_loss": 0.04582454264163971, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018236652249470353, "grad_norm": 5.653448581695557, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8541688919067383, "num_tokens": 241243219.0, "step": 6326 }, { "epoch": 0.8048594326421575, "ewc_loss": 0.04585665091872215, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018268759595230222, "grad_norm": 5.363006591796875, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8452707529067993, "num_tokens": 241284547.0, "step": 6327 }, { "epoch": 0.804986642920748, "ewc_loss": 0.04573944956064224, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018151557014789432, "grad_norm": 5.448568344116211, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.843509316444397, "num_tokens": 241331110.0, "step": 6328 }, { "epoch": 0.8051138531993385, "ewc_loss": 0.045852407813072205, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001826451625674963, "grad_norm": 5.418651103973389, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8608270883560181, "num_tokens": 241372720.0, "step": 6329 }, { "epoch": 0.805241063477929, "ewc_loss": 0.04583262652158737, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018244737293571234, "grad_norm": 5.439992904663086, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8619202375411987, "num_tokens": 241408173.0, "step": 6330 }, { "epoch": 0.8053682737565195, "ewc_loss": 0.04585990309715271, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018272011948283762, "grad_norm": 5.442673206329346, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8671094179153442, "num_tokens": 241445021.0, "step": 6331 }, { "epoch": 0.8054954840351101, "ewc_loss": 0.04590997099876404, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018322077812626958, "grad_norm": 5.528962135314941, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8646931648254395, "num_tokens": 241481874.0, "step": 6332 }, { "epoch": 0.8056226943137006, "ewc_loss": 0.045861728489398956, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018273835303261876, "grad_norm": 5.431679725646973, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8449510335922241, "num_tokens": 241523140.0, "step": 6333 }, { "epoch": 0.805749904592291, "ewc_loss": 0.045835502445697784, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018247611296828836, "grad_norm": 5.505832195281982, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8419076204299927, "num_tokens": 241563424.0, "step": 6334 }, { "epoch": 0.8058771148708815, "ewc_loss": 0.04585333168506622, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018265443213749677, "grad_norm": 5.450107097625732, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8730462789535522, "num_tokens": 241600004.0, "step": 6335 }, { "epoch": 0.8060043251494721, "ewc_loss": 0.045973341912031174, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018263381207361817, "grad_norm": 5.440789222717285, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8588296175003052, "num_tokens": 241642704.0, "step": 6336 }, { "epoch": 0.8061315354280626, "ewc_loss": 0.04597771167755127, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018267751147504896, "grad_norm": 5.5047383308410645, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8645862340927124, "num_tokens": 241679176.0, "step": 6337 }, { "epoch": 0.8062587457066531, "ewc_loss": 0.046023063361644745, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.000183131021913141, "grad_norm": 5.48298978805542, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8643559217453003, "num_tokens": 241721168.0, "step": 6338 }, { "epoch": 0.8063859559852437, "ewc_loss": 0.045969121158123016, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018259162607137114, "grad_norm": 5.4596099853515625, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8749639987945557, "num_tokens": 241759735.0, "step": 6339 }, { "epoch": 0.8065131662638341, "ewc_loss": 0.04593691974878311, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018226959218736738, "grad_norm": 5.4903435707092285, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.860938310623169, "num_tokens": 241799445.0, "step": 6340 }, { "epoch": 0.8066403765424246, "ewc_loss": 0.045961350202560425, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001825138897402212, "grad_norm": 5.479388236999512, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8652597665786743, "num_tokens": 241839333.0, "step": 6341 }, { "epoch": 0.8067675868210151, "ewc_loss": 0.04592544585466385, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001821548503357917, "grad_norm": 5.466882705688477, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.848612904548645, "num_tokens": 241880248.0, "step": 6342 }, { "epoch": 0.8068947970996057, "ewc_loss": 0.045940786600112915, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018230825662612915, "grad_norm": 5.432229518890381, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.87335205078125, "num_tokens": 241926717.0, "step": 6343 }, { "epoch": 0.8070220073781962, "ewc_loss": 0.045983582735061646, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018273622845299542, "grad_norm": 5.503417015075684, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8512781858444214, "num_tokens": 241966080.0, "step": 6344 }, { "epoch": 0.8071492176567867, "ewc_loss": 0.045981645584106445, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018271684530191123, "grad_norm": 5.4668402671813965, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8571537733078003, "num_tokens": 242011975.0, "step": 6345 }, { "epoch": 0.8072764279353771, "ewc_loss": 0.04590623080730438, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018196270684711635, "grad_norm": 5.510473251342773, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8730957508087158, "num_tokens": 242046320.0, "step": 6346 }, { "epoch": 0.8074036382139677, "ewc_loss": 0.04584546387195587, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018257573537994176, "grad_norm": 5.472275257110596, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8620158433914185, "num_tokens": 242085157.0, "step": 6347 }, { "epoch": 0.8075308484925582, "ewc_loss": 0.04564138129353523, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018175561854150146, "grad_norm": 5.5326313972473145, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8562088012695312, "num_tokens": 242116548.0, "step": 6348 }, { "epoch": 0.8076580587711487, "ewc_loss": 0.04570402204990387, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018238202028442174, "grad_norm": 5.475940227508545, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8662704229354858, "num_tokens": 242151883.0, "step": 6349 }, { "epoch": 0.8077852690497392, "ewc_loss": 0.045681100338697433, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001821527985157445, "grad_norm": 5.498301982879639, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8568661212921143, "num_tokens": 242190486.0, "step": 6350 }, { "epoch": 0.8079124793283298, "ewc_loss": 0.0457371324300766, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.000182713134563528, "grad_norm": 5.443392276763916, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8637602925300598, "num_tokens": 242232720.0, "step": 6351 }, { "epoch": 0.8080396896069202, "ewc_loss": 0.04576760530471802, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018301786622032523, "grad_norm": 5.478687763214111, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8616836071014404, "num_tokens": 242273926.0, "step": 6352 }, { "epoch": 0.8081668998855107, "ewc_loss": 0.045737773180007935, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001827195519581437, "grad_norm": 5.488550662994385, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8505662679672241, "num_tokens": 242313484.0, "step": 6353 }, { "epoch": 0.8082941101641012, "ewc_loss": 0.04573291540145874, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018267097766511142, "grad_norm": 5.456667423248291, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8549004197120667, "num_tokens": 242352101.0, "step": 6354 }, { "epoch": 0.8084213204426918, "ewc_loss": 0.04577047750353813, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018304656259715557, "grad_norm": 5.722054481506348, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8756486773490906, "num_tokens": 242392958.0, "step": 6355 }, { "epoch": 0.8085485307212823, "ewc_loss": 0.04580238461494446, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001833656569942832, "grad_norm": 5.437110900878906, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8612985610961914, "num_tokens": 242432327.0, "step": 6356 }, { "epoch": 0.8086757409998728, "ewc_loss": 0.04567503184080124, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018209213158115745, "grad_norm": 5.596312046051025, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8396319150924683, "num_tokens": 242473714.0, "step": 6357 }, { "epoch": 0.8088029512784632, "ewc_loss": 0.04572763293981552, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018261811055708677, "grad_norm": 5.461984157562256, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8673709630966187, "num_tokens": 242513179.0, "step": 6358 }, { "epoch": 0.8089301615570538, "ewc_loss": 0.04565420746803284, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018188389367423952, "grad_norm": 5.495626926422119, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8604936599731445, "num_tokens": 242549398.0, "step": 6359 }, { "epoch": 0.8090573718356443, "ewc_loss": 0.04568575322628021, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001821993209887296, "grad_norm": 5.463715553283691, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8556696176528931, "num_tokens": 242583705.0, "step": 6360 }, { "epoch": 0.8091845821142348, "ewc_loss": 0.045676663517951965, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.000182108415174298, "grad_norm": 5.434752464294434, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.865389347076416, "num_tokens": 242625900.0, "step": 6361 }, { "epoch": 0.8093117923928254, "ewc_loss": 0.045968927443027496, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001825896615628153, "grad_norm": 5.8381195068359375, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8497422337532043, "num_tokens": 242665766.0, "step": 6362 }, { "epoch": 0.8094390026714159, "ewc_loss": 0.04578994959592819, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018324126722291112, "grad_norm": 5.388957500457764, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8627198934555054, "num_tokens": 242707634.0, "step": 6363 }, { "epoch": 0.8095662129500064, "ewc_loss": 0.04563957452774048, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018173754506278783, "grad_norm": 5.490640163421631, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8562077283859253, "num_tokens": 242746791.0, "step": 6364 }, { "epoch": 0.8096934232285968, "ewc_loss": 0.04572660103440285, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018260780780110508, "grad_norm": 5.393937587738037, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.872891902923584, "num_tokens": 242784861.0, "step": 6365 }, { "epoch": 0.8098206335071874, "ewc_loss": 0.04573991149663925, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018274089961778373, "grad_norm": 5.515015602111816, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8751785755157471, "num_tokens": 242817612.0, "step": 6366 }, { "epoch": 0.8099478437857779, "ewc_loss": 0.04604058712720871, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.000183306256076321, "grad_norm": 5.418763637542725, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8668560981750488, "num_tokens": 242854951.0, "step": 6367 }, { "epoch": 0.8100750540643684, "ewc_loss": 0.046041443943977356, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018331484170630574, "grad_norm": 9.075210571289062, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8754177093505859, "num_tokens": 242894166.0, "step": 6368 }, { "epoch": 0.8102022643429589, "ewc_loss": 0.050310343503952026, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00022722450376022607, "grad_norm": 6.119025707244873, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8664154410362244, "num_tokens": 242936230.0, "step": 6369 }, { "epoch": 0.8103294746215495, "ewc_loss": 0.04482055455446243, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001735473342705518, "grad_norm": 5.299710273742676, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8711138963699341, "num_tokens": 242973148.0, "step": 6370 }, { "epoch": 0.8104566849001399, "ewc_loss": 0.0464395247399807, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018973703845404088, "grad_norm": 5.756743431091309, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.861924409866333, "num_tokens": 243013029.0, "step": 6371 }, { "epoch": 0.8105838951787304, "ewc_loss": 0.046023398637771606, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001855757727753371, "grad_norm": 5.4198198318481445, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8604439496994019, "num_tokens": 243054847.0, "step": 6372 }, { "epoch": 0.810711105457321, "ewc_loss": 0.04596412926912308, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018498308782000095, "grad_norm": 5.636094570159912, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8536373376846313, "num_tokens": 243096524.0, "step": 6373 }, { "epoch": 0.8108383157359115, "ewc_loss": 0.046001799404621124, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001853598078014329, "grad_norm": 5.5423126220703125, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8528740406036377, "num_tokens": 243132397.0, "step": 6374 }, { "epoch": 0.810965526014502, "ewc_loss": 0.04585288465023041, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018387063755653799, "grad_norm": 5.478607654571533, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8479334115982056, "num_tokens": 243171790.0, "step": 6375 }, { "epoch": 0.8110927362930925, "ewc_loss": 0.04600812867283821, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018542307952884585, "grad_norm": 5.592223167419434, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8589059114456177, "num_tokens": 243208217.0, "step": 6376 }, { "epoch": 0.811219946571683, "ewc_loss": 0.04586327075958252, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018397449457552284, "grad_norm": 5.5044145584106445, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8652080297470093, "num_tokens": 243245260.0, "step": 6377 }, { "epoch": 0.8113471568502735, "ewc_loss": 0.045908309519290924, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018442489090375602, "grad_norm": 5.5239057540893555, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.868375301361084, "num_tokens": 243283924.0, "step": 6378 }, { "epoch": 0.811474367128864, "ewc_loss": 0.04604329913854599, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018333336629439145, "grad_norm": 5.501002788543701, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8690370321273804, "num_tokens": 243321977.0, "step": 6379 }, { "epoch": 0.8116015774074545, "ewc_loss": 0.04598664492368698, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018398754764348269, "grad_norm": 5.536055088043213, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8578041791915894, "num_tokens": 243359349.0, "step": 6380 }, { "epoch": 0.8117287876860451, "ewc_loss": 0.04607127979397774, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018361318507231772, "grad_norm": 5.484955310821533, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8637682199478149, "num_tokens": 243395722.0, "step": 6381 }, { "epoch": 0.8118559979646356, "ewc_loss": 0.04610670357942581, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018396743689663708, "grad_norm": 5.631302356719971, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8496345281600952, "num_tokens": 243429022.0, "step": 6382 }, { "epoch": 0.811983208243226, "ewc_loss": 0.04607810080051422, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001836814044509083, "grad_norm": 5.543772220611572, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8406169414520264, "num_tokens": 243468474.0, "step": 6383 }, { "epoch": 0.8121104185218165, "ewc_loss": 0.04607503116130829, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001836507290136069, "grad_norm": 5.525596618652344, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8757461905479431, "num_tokens": 243503045.0, "step": 6384 }, { "epoch": 0.8122376288004071, "ewc_loss": 0.04600364342331886, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018293682660441846, "grad_norm": 5.494214057922363, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8498331308364868, "num_tokens": 243543865.0, "step": 6385 }, { "epoch": 0.8123648390789976, "ewc_loss": 0.045819610357284546, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001835379225667566, "grad_norm": 5.524566173553467, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.852446436882019, "num_tokens": 243581258.0, "step": 6386 }, { "epoch": 0.8124920493575881, "ewc_loss": 0.046058010309934616, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018348050070926547, "grad_norm": 5.442900657653809, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8541860580444336, "num_tokens": 243622777.0, "step": 6387 }, { "epoch": 0.8126192596361786, "ewc_loss": 0.04595629870891571, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001836840674513951, "grad_norm": 5.514744281768799, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8482831716537476, "num_tokens": 243664549.0, "step": 6388 }, { "epoch": 0.8127464699147691, "ewc_loss": 0.0458742119371891, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018408392497804016, "grad_norm": 5.431492805480957, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8548667430877686, "num_tokens": 243710408.0, "step": 6389 }, { "epoch": 0.8128736801933596, "ewc_loss": 0.04587263613939285, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018406817980576307, "grad_norm": 5.451842308044434, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.861889123916626, "num_tokens": 243752598.0, "step": 6390 }, { "epoch": 0.8130008904719501, "ewc_loss": 0.04586835205554962, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018402529531158507, "grad_norm": 5.4805402755737305, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8583595156669617, "num_tokens": 243789805.0, "step": 6391 }, { "epoch": 0.8131281007505406, "ewc_loss": 0.04603167250752449, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001844378130044788, "grad_norm": 5.458888053894043, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8702750205993652, "num_tokens": 243824124.0, "step": 6392 }, { "epoch": 0.8132553110291312, "ewc_loss": 0.04589555412530899, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001842973433667794, "grad_norm": 5.505454063415527, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8596498966217041, "num_tokens": 243863448.0, "step": 6393 }, { "epoch": 0.8133825213077217, "ewc_loss": 0.04593589901924133, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001847007661126554, "grad_norm": 5.4852166175842285, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8637927770614624, "num_tokens": 243902085.0, "step": 6394 }, { "epoch": 0.8135097315863121, "ewc_loss": 0.04599151760339737, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018403626745566726, "grad_norm": 5.44288444519043, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8546682000160217, "num_tokens": 243945349.0, "step": 6395 }, { "epoch": 0.8136369418649027, "ewc_loss": 0.04590248316526413, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.0001843666541390121, "grad_norm": 5.506083011627197, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8578330278396606, "num_tokens": 243982221.0, "step": 6396 }, { "epoch": 0.8137641521434932, "ewc_loss": 0.045934684574604034, "ewc_loss_diag": 2.7418136596679688e-05, "ewc_loss_parallel": 0.00018468861526343971, "grad_norm": 5.566564559936523, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.848692774772644, "num_tokens": 244012576.0, "step": 6397 }, { "epoch": 0.8138913624220837, "ewc_loss": 0.046028390526771545, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018440498388372362, "grad_norm": 5.461405277252197, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8675408363342285, "num_tokens": 244052089.0, "step": 6398 }, { "epoch": 0.8140185727006742, "ewc_loss": 0.04597214236855507, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018384250870440155, "grad_norm": 5.513609409332275, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8625020980834961, "num_tokens": 244088712.0, "step": 6399 }, { "epoch": 0.8141457829792648, "ewc_loss": 0.04600082337856293, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018412932695355266, "grad_norm": 5.4463114738464355, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8613560795783997, "num_tokens": 244127655.0, "step": 6400 }, { "epoch": 0.8142729932578552, "ewc_loss": 0.04600123316049576, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018413343059364706, "grad_norm": 5.507494926452637, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8655183911323547, "num_tokens": 244158588.0, "step": 6401 }, { "epoch": 0.8144002035364457, "ewc_loss": 0.04615813493728638, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018448174523655325, "grad_norm": 5.454065799713135, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8783656358718872, "num_tokens": 244193872.0, "step": 6402 }, { "epoch": 0.8145274138150362, "ewc_loss": 0.046008236706256866, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018420343985781074, "grad_norm": 5.467409610748291, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8446621298789978, "num_tokens": 244234055.0, "step": 6403 }, { "epoch": 0.8146546240936268, "ewc_loss": 0.04601585865020752, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001842797064455226, "grad_norm": 5.478204250335693, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8605165481567383, "num_tokens": 244275163.0, "step": 6404 }, { "epoch": 0.8147818343722173, "ewc_loss": 0.04600335657596588, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018415466183796525, "grad_norm": 5.508896827697754, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.857292890548706, "num_tokens": 244315241.0, "step": 6405 }, { "epoch": 0.8149090446508078, "ewc_loss": 0.046029023826122284, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018441132851876318, "grad_norm": 5.4769978523254395, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8749433159828186, "num_tokens": 244351947.0, "step": 6406 }, { "epoch": 0.8150362549293982, "ewc_loss": 0.04599260538816452, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018404712318442762, "grad_norm": 5.5271782875061035, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8576726913452148, "num_tokens": 244387706.0, "step": 6407 }, { "epoch": 0.8151634652079888, "ewc_loss": 0.04614522308111191, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018435261154081672, "grad_norm": 5.461381435394287, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8674834966659546, "num_tokens": 244428447.0, "step": 6408 }, { "epoch": 0.8152906754865793, "ewc_loss": 0.0461285226047039, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018418561376165599, "grad_norm": 5.497791290283203, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8459237813949585, "num_tokens": 244471191.0, "step": 6409 }, { "epoch": 0.8154178857651698, "ewc_loss": 0.04614255204796791, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018432590877637267, "grad_norm": 5.519031047821045, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8422325253486633, "num_tokens": 244509198.0, "step": 6410 }, { "epoch": 0.8155450960437604, "ewc_loss": 0.045977309346199036, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018389416800346226, "grad_norm": 5.476256847381592, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.850692093372345, "num_tokens": 244549677.0, "step": 6411 }, { "epoch": 0.8156723063223509, "ewc_loss": 0.046164073050022125, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018454113160260022, "grad_norm": 5.572027206420898, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8481158018112183, "num_tokens": 244583586.0, "step": 6412 }, { "epoch": 0.8157995166009414, "ewc_loss": 0.04600702226161957, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018419133266434073, "grad_norm": 5.434097766876221, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8690782785415649, "num_tokens": 244621220.0, "step": 6413 }, { "epoch": 0.8159267268795318, "ewc_loss": 0.04599609971046448, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001840821059886366, "grad_norm": 5.531219005584717, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8685734868049622, "num_tokens": 244655941.0, "step": 6414 }, { "epoch": 0.8160539371581224, "ewc_loss": 0.04614868015050888, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018438718689139932, "grad_norm": 5.469930648803711, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.877604603767395, "num_tokens": 244685807.0, "step": 6415 }, { "epoch": 0.8161811474367129, "ewc_loss": 0.046140603721141815, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001843064237618819, "grad_norm": 5.479847431182861, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8589789867401123, "num_tokens": 244726866.0, "step": 6416 }, { "epoch": 0.8163083577153034, "ewc_loss": 0.046171631664037704, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018461671425029635, "grad_norm": 5.443972587585449, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8517308235168457, "num_tokens": 244767273.0, "step": 6417 }, { "epoch": 0.8164355679938939, "ewc_loss": 0.04619012400507927, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001848016254371032, "grad_norm": 5.497717380523682, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8565047979354858, "num_tokens": 244806111.0, "step": 6418 }, { "epoch": 0.8165627782724845, "ewc_loss": 0.0461873784661293, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018477416597306728, "grad_norm": 5.475111961364746, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8709227442741394, "num_tokens": 244841706.0, "step": 6419 }, { "epoch": 0.8166899885510749, "ewc_loss": 0.046177178621292114, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018467220070306212, "grad_norm": 5.460424900054932, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8626111745834351, "num_tokens": 244883908.0, "step": 6420 }, { "epoch": 0.8168171988296654, "ewc_loss": 0.04617089405655861, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018460932187736034, "grad_norm": 5.514827728271484, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8523628115653992, "num_tokens": 244919090.0, "step": 6421 }, { "epoch": 0.8169444091082559, "ewc_loss": 0.04623093456029892, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018520976300351322, "grad_norm": 5.505793571472168, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8747912645339966, "num_tokens": 244961141.0, "step": 6422 }, { "epoch": 0.8170716193868465, "ewc_loss": 0.0461784191429615, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018468458438292146, "grad_norm": 5.462092876434326, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.844215989112854, "num_tokens": 245004503.0, "step": 6423 }, { "epoch": 0.817198829665437, "ewc_loss": 0.04622085019946098, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018510888912715018, "grad_norm": 5.508551120758057, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8580532670021057, "num_tokens": 245045042.0, "step": 6424 }, { "epoch": 0.8173260399440275, "ewc_loss": 0.04622088372707367, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001851092529250309, "grad_norm": 5.513216018676758, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8590015769004822, "num_tokens": 245085416.0, "step": 6425 }, { "epoch": 0.8174532502226179, "ewc_loss": 0.04620300978422165, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018493046809453517, "grad_norm": 5.547550678253174, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8491579294204712, "num_tokens": 245116137.0, "step": 6426 }, { "epoch": 0.8175804605012085, "ewc_loss": 0.0462031252682209, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018493164679966867, "grad_norm": 5.4953789710998535, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8721562623977661, "num_tokens": 245159181.0, "step": 6427 }, { "epoch": 0.817707670779799, "ewc_loss": 0.04618651047348976, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018476549303159118, "grad_norm": 5.438839435577393, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8531904816627502, "num_tokens": 245205199.0, "step": 6428 }, { "epoch": 0.8178348810583895, "ewc_loss": 0.04620914161205292, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001849917898653075, "grad_norm": 5.53206729888916, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8652266263961792, "num_tokens": 245242455.0, "step": 6429 }, { "epoch": 0.8179620913369801, "ewc_loss": 0.04622263461351395, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001851267588790506, "grad_norm": 5.4712443351745605, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8805979490280151, "num_tokens": 245280675.0, "step": 6430 }, { "epoch": 0.8180893016155706, "ewc_loss": 0.04616105183959007, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001845109072746709, "grad_norm": 5.511831760406494, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8754009008407593, "num_tokens": 245316947.0, "step": 6431 }, { "epoch": 0.818216511894161, "ewc_loss": 0.04622705653309822, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018517095304559916, "grad_norm": 5.523597240447998, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8643051385879517, "num_tokens": 245355145.0, "step": 6432 }, { "epoch": 0.8183437221727515, "ewc_loss": 0.04609279707074165, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018504906620364636, "grad_norm": 5.5415120124816895, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8723949193954468, "num_tokens": 245390282.0, "step": 6433 }, { "epoch": 0.8184709324513421, "ewc_loss": 0.04616507887840271, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018455115787219256, "grad_norm": 5.476696014404297, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8693102598190308, "num_tokens": 245430262.0, "step": 6434 }, { "epoch": 0.8185981427299326, "ewc_loss": 0.04606353119015694, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.00018475639808457345, "grad_norm": 5.522244453430176, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8580447435379028, "num_tokens": 245470874.0, "step": 6435 }, { "epoch": 0.8187253530085231, "ewc_loss": 0.04602767527103424, "ewc_loss_diag": 2.753734588623047e-05, "ewc_loss_parallel": 0.0001843978388933465, "grad_norm": 5.5108232498168945, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8657175898551941, "num_tokens": 245506626.0, "step": 6436 }, { "epoch": 0.8188525632871136, "ewc_loss": 0.04615738242864609, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018447422189638019, "grad_norm": 5.5379486083984375, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8449323773384094, "num_tokens": 245546794.0, "step": 6437 }, { "epoch": 0.8189797735657041, "ewc_loss": 0.046164125204086304, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001845416409196332, "grad_norm": 6.3713836669921875, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.863694429397583, "num_tokens": 245589544.0, "step": 6438 }, { "epoch": 0.8191069838442946, "ewc_loss": 0.046443767845630646, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018733806791715324, "grad_norm": 5.4557576179504395, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8754287958145142, "num_tokens": 245628889.0, "step": 6439 }, { "epoch": 0.8192341941228851, "ewc_loss": 0.04596788436174393, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018257925694342703, "grad_norm": 5.560920238494873, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8567065000534058, "num_tokens": 245674615.0, "step": 6440 }, { "epoch": 0.8193614044014756, "ewc_loss": 0.0461122989654541, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001840233599068597, "grad_norm": 5.444089889526367, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8607276678085327, "num_tokens": 245713966.0, "step": 6441 }, { "epoch": 0.8194886146800662, "ewc_loss": 0.04607934504747391, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018369384633842856, "grad_norm": 5.5739312171936035, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8646915555000305, "num_tokens": 245750035.0, "step": 6442 }, { "epoch": 0.8196158249586567, "ewc_loss": 0.046171918511390686, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018461958097759634, "grad_norm": 6.18544340133667, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8758119940757751, "num_tokens": 245788506.0, "step": 6443 }, { "epoch": 0.8197430352372471, "ewc_loss": 0.04623868688941002, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001852872665040195, "grad_norm": 5.421906471252441, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8636454343795776, "num_tokens": 245826547.0, "step": 6444 }, { "epoch": 0.8198702455158376, "ewc_loss": 0.04603063315153122, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018320672097615898, "grad_norm": 5.485773086547852, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8506355285644531, "num_tokens": 245874363.0, "step": 6445 }, { "epoch": 0.8199974557944282, "ewc_loss": 0.04625140503048897, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018419373373035342, "grad_norm": 6.389678955078125, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8427249193191528, "num_tokens": 245917813.0, "step": 6446 }, { "epoch": 0.8201246660730187, "ewc_loss": 0.04655276983976364, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001872073917184025, "grad_norm": 5.446191310882568, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8571882843971252, "num_tokens": 245952177.0, "step": 6447 }, { "epoch": 0.8202518763516092, "ewc_loss": 0.04602641239762306, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018194381846114993, "grad_norm": 5.560603618621826, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8631696701049805, "num_tokens": 245992897.0, "step": 6448 }, { "epoch": 0.8203790866301998, "ewc_loss": 0.046265609562397, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018433578952681273, "grad_norm": 5.611438274383545, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8682699203491211, "num_tokens": 246029657.0, "step": 6449 }, { "epoch": 0.8205062969087902, "ewc_loss": 0.04616633057594299, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018334299966227263, "grad_norm": 5.504128932952881, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8514760136604309, "num_tokens": 246072425.0, "step": 6450 }, { "epoch": 0.8206335071873807, "ewc_loss": 0.04616294801235199, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018330915190745145, "grad_norm": 5.421322345733643, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8785233497619629, "num_tokens": 246113131.0, "step": 6451 }, { "epoch": 0.8207607174659712, "ewc_loss": 0.04609536752104759, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018385406292509288, "grad_norm": 5.518710136413574, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8642091155052185, "num_tokens": 246154548.0, "step": 6452 }, { "epoch": 0.8208879277445618, "ewc_loss": 0.046289484947919846, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018457452824804932, "grad_norm": 5.572073936462402, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8581440448760986, "num_tokens": 246189518.0, "step": 6453 }, { "epoch": 0.8210151380231523, "ewc_loss": 0.04628074914216995, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018448718765284866, "grad_norm": 5.5742363929748535, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8568035960197449, "num_tokens": 246235248.0, "step": 6454 }, { "epoch": 0.8211423483017428, "ewc_loss": 0.04627620056271553, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018444169836584479, "grad_norm": 5.497901916503906, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8528750538825989, "num_tokens": 246277016.0, "step": 6455 }, { "epoch": 0.8212695585803332, "ewc_loss": 0.046270016580820084, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018437985272612423, "grad_norm": 5.499509334564209, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8571884036064148, "num_tokens": 246312352.0, "step": 6456 }, { "epoch": 0.8213967688589238, "ewc_loss": 0.0462479405105114, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001841591001721099, "grad_norm": 5.460386276245117, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8472684621810913, "num_tokens": 246355079.0, "step": 6457 }, { "epoch": 0.8215239791375143, "ewc_loss": 0.04636891931295395, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018536887364462018, "grad_norm": 5.632690906524658, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8474687337875366, "num_tokens": 246391073.0, "step": 6458 }, { "epoch": 0.8216511894161048, "ewc_loss": 0.046286776661872864, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001845474325818941, "grad_norm": 5.424933433532715, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8678010702133179, "num_tokens": 246435407.0, "step": 6459 }, { "epoch": 0.8217783996946953, "ewc_loss": 0.04634552448987961, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001851349079515785, "grad_norm": 5.638130187988281, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8472643494606018, "num_tokens": 246475303.0, "step": 6460 }, { "epoch": 0.8219056099732859, "ewc_loss": 0.046354398131370544, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018522364553064108, "grad_norm": 5.442699432373047, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8581774830818176, "num_tokens": 246515237.0, "step": 6461 }, { "epoch": 0.8220328202518764, "ewc_loss": 0.046374619007110596, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018542588804848492, "grad_norm": 5.616370677947998, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8549220561981201, "num_tokens": 246554958.0, "step": 6462 }, { "epoch": 0.8221600305304668, "ewc_loss": 0.04631096124649048, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018478928541298956, "grad_norm": 5.492051601409912, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8609516024589539, "num_tokens": 246592327.0, "step": 6463 }, { "epoch": 0.8222872408090574, "ewc_loss": 0.04633799195289612, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018505961634218693, "grad_norm": 5.5802321434021, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8657664656639099, "num_tokens": 246626965.0, "step": 6464 }, { "epoch": 0.8224144510876479, "ewc_loss": 0.04620923101902008, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018499269208405167, "grad_norm": 5.468249320983887, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8568950295448303, "num_tokens": 246662049.0, "step": 6465 }, { "epoch": 0.8225416613662384, "ewc_loss": 0.0461462028324604, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018436241953168064, "grad_norm": 5.5143914222717285, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8621942400932312, "num_tokens": 246696989.0, "step": 6466 }, { "epoch": 0.8226688716448289, "ewc_loss": 0.046202875673770905, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018492915842216462, "grad_norm": 5.51338005065918, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8646513223648071, "num_tokens": 246738418.0, "step": 6467 }, { "epoch": 0.8227960819234195, "ewc_loss": 0.0461454764008522, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001843551581259817, "grad_norm": 5.482110500335693, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8610982298851013, "num_tokens": 246776779.0, "step": 6468 }, { "epoch": 0.8229232922020099, "ewc_loss": 0.046246856451034546, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018536894640419632, "grad_norm": 5.49444055557251, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8658530116081238, "num_tokens": 246814029.0, "step": 6469 }, { "epoch": 0.8230505024806004, "ewc_loss": 0.046224888414144516, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001851492706919089, "grad_norm": 5.50921106338501, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8518065810203552, "num_tokens": 246856390.0, "step": 6470 }, { "epoch": 0.8231777127591909, "ewc_loss": 0.046225257217884064, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018515298143029213, "grad_norm": 5.487027168273926, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8615267276763916, "num_tokens": 246895563.0, "step": 6471 }, { "epoch": 0.8233049230377815, "ewc_loss": 0.04623585194349289, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001852589048212394, "grad_norm": 5.523495197296143, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8576124906539917, "num_tokens": 246935956.0, "step": 6472 }, { "epoch": 0.823432133316372, "ewc_loss": 0.0463886633515358, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018556629947852343, "grad_norm": 5.446572780609131, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8570876717567444, "num_tokens": 246984765.0, "step": 6473 }, { "epoch": 0.8235593435949625, "ewc_loss": 0.046367935836315155, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018535906565375626, "grad_norm": 5.58864164352417, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.863457441329956, "num_tokens": 247017086.0, "step": 6474 }, { "epoch": 0.8236865538735529, "ewc_loss": 0.04639513045549393, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001856309681897983, "grad_norm": 5.472766399383545, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8656488060951233, "num_tokens": 247056195.0, "step": 6475 }, { "epoch": 0.8238137641521435, "ewc_loss": 0.046419717371463776, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018587683734949678, "grad_norm": 5.522364616394043, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8744229078292847, "num_tokens": 247094063.0, "step": 6476 }, { "epoch": 0.823940974430734, "ewc_loss": 0.04623096436262131, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018521002493798733, "grad_norm": 5.494665145874023, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8587076663970947, "num_tokens": 247131743.0, "step": 6477 }, { "epoch": 0.8240681847093245, "ewc_loss": 0.046268023550510406, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018558060401119292, "grad_norm": 5.527850151062012, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8573192358016968, "num_tokens": 247169477.0, "step": 6478 }, { "epoch": 0.824195394987915, "ewc_loss": 0.04624539613723755, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018535435083322227, "grad_norm": 5.464272499084473, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8691229820251465, "num_tokens": 247209159.0, "step": 6479 }, { "epoch": 0.8243226052665056, "ewc_loss": 0.04640033468604088, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001856830349424854, "grad_norm": 5.531887531280518, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8625609874725342, "num_tokens": 247247603.0, "step": 6480 }, { "epoch": 0.824449815545096, "ewc_loss": 0.046248212456703186, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018538253789301962, "grad_norm": 5.456971168518066, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8703441023826599, "num_tokens": 247290184.0, "step": 6481 }, { "epoch": 0.8245770258236865, "ewc_loss": 0.04644186422228813, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018609833205118775, "grad_norm": 5.538140773773193, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8435444831848145, "num_tokens": 247331862.0, "step": 6482 }, { "epoch": 0.824704236102277, "ewc_loss": 0.046383872628211975, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018551840912550688, "grad_norm": 5.51713228225708, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8608786463737488, "num_tokens": 247368271.0, "step": 6483 }, { "epoch": 0.8248314463808676, "ewc_loss": 0.046374354511499405, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018542322504799813, "grad_norm": 5.489675045013428, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8596915006637573, "num_tokens": 247408077.0, "step": 6484 }, { "epoch": 0.8249586566594581, "ewc_loss": 0.046358637511730194, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018526606436353177, "grad_norm": 5.502518653869629, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8682233095169067, "num_tokens": 247447408.0, "step": 6485 }, { "epoch": 0.8250858669380486, "ewc_loss": 0.046258214861154556, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018548253865446895, "grad_norm": 5.46094274520874, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8714683055877686, "num_tokens": 247489863.0, "step": 6486 }, { "epoch": 0.8252130772166391, "ewc_loss": 0.046369053423404694, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001853702124208212, "grad_norm": 5.50400972366333, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8559035658836365, "num_tokens": 247532233.0, "step": 6487 }, { "epoch": 0.8253402874952296, "ewc_loss": 0.04644094407558441, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018608910613693297, "grad_norm": 5.555700778961182, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8363864421844482, "num_tokens": 247570719.0, "step": 6488 }, { "epoch": 0.8254674977738201, "ewc_loss": 0.04637151211500168, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018539480515755713, "grad_norm": 5.529383182525635, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8657423257827759, "num_tokens": 247603302.0, "step": 6489 }, { "epoch": 0.8255947080524106, "ewc_loss": 0.04639856517314911, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001856653398135677, "grad_norm": 5.487487316131592, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.864039957523346, "num_tokens": 247642837.0, "step": 6490 }, { "epoch": 0.8257219183310012, "ewc_loss": 0.04636882245540619, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018536789866629988, "grad_norm": 5.515416145324707, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8591117262840271, "num_tokens": 247685246.0, "step": 6491 }, { "epoch": 0.8258491286095917, "ewc_loss": 0.0462348498404026, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001852488931035623, "grad_norm": 5.520173072814941, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8517193794250488, "num_tokens": 247720871.0, "step": 6492 }, { "epoch": 0.8259763388881821, "ewc_loss": 0.0462893545627594, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018579393508844078, "grad_norm": 5.5909953117370605, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8614622354507446, "num_tokens": 247759796.0, "step": 6493 }, { "epoch": 0.8261035491667726, "ewc_loss": 0.04621131718158722, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018501354497857392, "grad_norm": 5.492953777313232, "learning_rate": 1e-06, "loss": 0.5334, "mean_token_accuracy": 0.839889407157898, "num_tokens": 247806486.0, "step": 6494 }, { "epoch": 0.8262307594453632, "ewc_loss": 0.04624189808964729, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018531936802901328, "grad_norm": 5.555934906005859, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8680378794670105, "num_tokens": 247842031.0, "step": 6495 }, { "epoch": 0.8263579697239537, "ewc_loss": 0.04619944840669632, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001848948741098866, "grad_norm": 5.502352237701416, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8645142316818237, "num_tokens": 247883438.0, "step": 6496 }, { "epoch": 0.8264851800025442, "ewc_loss": 0.04634074494242668, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018508713401388377, "grad_norm": 5.500879764556885, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.86838698387146, "num_tokens": 247921491.0, "step": 6497 }, { "epoch": 0.8266123902811348, "ewc_loss": 0.046179015189409256, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001846905506681651, "grad_norm": 5.515216827392578, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8708850145339966, "num_tokens": 247957454.0, "step": 6498 }, { "epoch": 0.8267396005597252, "ewc_loss": 0.04621739313006401, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018507431377656758, "grad_norm": 5.602606773376465, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8456101417541504, "num_tokens": 247997705.0, "step": 6499 }, { "epoch": 0.8268668108383157, "ewc_loss": 0.0462319552898407, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018521994934417307, "grad_norm": 5.498744964599609, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8557566404342651, "num_tokens": 248034536.0, "step": 6500 }, { "epoch": 0.8269940211169062, "ewc_loss": 0.04615083709359169, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.000184408767381683, "grad_norm": 5.527114391326904, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.861528754234314, "num_tokens": 248080916.0, "step": 6501 }, { "epoch": 0.8271212313954968, "ewc_loss": 0.04623030498623848, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018520343292038888, "grad_norm": 5.522517204284668, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8599737286567688, "num_tokens": 248119719.0, "step": 6502 }, { "epoch": 0.8272484416740873, "ewc_loss": 0.046224068850278854, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018514107796363533, "grad_norm": 5.604984760284424, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8577249050140381, "num_tokens": 248156221.0, "step": 6503 }, { "epoch": 0.8273756519526778, "ewc_loss": 0.04620617628097534, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018496214761398733, "grad_norm": 5.4755377769470215, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8636831045150757, "num_tokens": 248191780.0, "step": 6504 }, { "epoch": 0.8275028622312682, "ewc_loss": 0.04624678194522858, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018536820425651968, "grad_norm": 5.613437175750732, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8554953932762146, "num_tokens": 248228594.0, "step": 6505 }, { "epoch": 0.8276300725098588, "ewc_loss": 0.046254467219114304, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001854450674727559, "grad_norm": 5.538942813873291, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8652784824371338, "num_tokens": 248260372.0, "step": 6506 }, { "epoch": 0.8277572827884493, "ewc_loss": 0.046246401965618134, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001853644207585603, "grad_norm": 5.541402339935303, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.852898359298706, "num_tokens": 248299361.0, "step": 6507 }, { "epoch": 0.8278844930670398, "ewc_loss": 0.04629918932914734, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018589226237963885, "grad_norm": 5.558628082275391, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8525965809822083, "num_tokens": 248337370.0, "step": 6508 }, { "epoch": 0.8280117033456303, "ewc_loss": 0.04621905833482742, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001850909902714193, "grad_norm": 5.489553928375244, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8628385066986084, "num_tokens": 248377541.0, "step": 6509 }, { "epoch": 0.8281389136242209, "ewc_loss": 0.04640553146600723, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018573501438368112, "grad_norm": 5.568014144897461, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8431986570358276, "num_tokens": 248413928.0, "step": 6510 }, { "epoch": 0.8282661239028114, "ewc_loss": 0.04625473916530609, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018544775957707316, "grad_norm": 5.523165702819824, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8607536554336548, "num_tokens": 248453283.0, "step": 6511 }, { "epoch": 0.8283933341814018, "ewc_loss": 0.046324484050273895, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018614521832205355, "grad_norm": 5.526262283325195, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8657318353652954, "num_tokens": 248490286.0, "step": 6512 }, { "epoch": 0.8285205444599923, "ewc_loss": 0.046286486089229584, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001857652678154409, "grad_norm": 5.5175862312316895, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8444870710372925, "num_tokens": 248526349.0, "step": 6513 }, { "epoch": 0.8286477547385829, "ewc_loss": 0.0463114008307457, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001860143820522353, "grad_norm": 5.52492094039917, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8535977602005005, "num_tokens": 248561751.0, "step": 6514 }, { "epoch": 0.8287749650171734, "ewc_loss": 0.0463159941136837, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001860603370005265, "grad_norm": 5.485268592834473, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8744333386421204, "num_tokens": 248601921.0, "step": 6515 }, { "epoch": 0.8289021752957639, "ewc_loss": 0.04657299071550369, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018618891772348434, "grad_norm": 5.498748302459717, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8720312118530273, "num_tokens": 248641218.0, "step": 6516 }, { "epoch": 0.8290293855743545, "ewc_loss": 0.046497710049152374, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018665676179807633, "grad_norm": 5.624380588531494, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8627998232841492, "num_tokens": 248672616.0, "step": 6517 }, { "epoch": 0.8291565958529449, "ewc_loss": 0.04646138846874237, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018629358964972198, "grad_norm": 5.5148491859436035, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8638859391212463, "num_tokens": 248708212.0, "step": 6518 }, { "epoch": 0.8292838061315354, "ewc_loss": 0.046475812792778015, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001864377991296351, "grad_norm": 5.583123683929443, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8641381859779358, "num_tokens": 248746575.0, "step": 6519 }, { "epoch": 0.8294110164101259, "ewc_loss": 0.046444863080978394, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018612829444464296, "grad_norm": 5.486544132232666, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.856427013874054, "num_tokens": 248789423.0, "step": 6520 }, { "epoch": 0.8295382266887165, "ewc_loss": 0.04640460014343262, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001857257157098502, "grad_norm": 5.5425124168396, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8629430532455444, "num_tokens": 248824044.0, "step": 6521 }, { "epoch": 0.829665436967307, "ewc_loss": 0.04643965885043144, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018607627134770155, "grad_norm": 5.5020318031311035, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8580008745193481, "num_tokens": 248860805.0, "step": 6522 }, { "epoch": 0.8297926472458975, "ewc_loss": 0.04639158025383949, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018559549062047154, "grad_norm": 5.518136501312256, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8665469288825989, "num_tokens": 248898958.0, "step": 6523 }, { "epoch": 0.8299198575244879, "ewc_loss": 0.046451784670352936, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001861975179053843, "grad_norm": 5.475485324859619, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8595290184020996, "num_tokens": 248938847.0, "step": 6524 }, { "epoch": 0.8300470678030785, "ewc_loss": 0.04643614962697029, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018604118668008596, "grad_norm": 5.542759418487549, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8573768734931946, "num_tokens": 248977944.0, "step": 6525 }, { "epoch": 0.830174278081669, "ewc_loss": 0.04665633291006088, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018702232046052814, "grad_norm": 5.535565376281738, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8533338308334351, "num_tokens": 249014610.0, "step": 6526 }, { "epoch": 0.8303014883602595, "ewc_loss": 0.04659728705883026, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018643186194822192, "grad_norm": 5.583514213562012, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8306155800819397, "num_tokens": 249053869.0, "step": 6527 }, { "epoch": 0.83042869863885, "ewc_loss": 0.04657614603638649, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001862204517237842, "grad_norm": 5.521003723144531, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8528220653533936, "num_tokens": 249096874.0, "step": 6528 }, { "epoch": 0.8305559089174406, "ewc_loss": 0.04659642279148102, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001864232326624915, "grad_norm": 5.51781702041626, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8584792613983154, "num_tokens": 249133141.0, "step": 6529 }, { "epoch": 0.830683119196031, "ewc_loss": 0.04655500128865242, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001860089978436008, "grad_norm": 5.570068836212158, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8668909668922424, "num_tokens": 249163813.0, "step": 6530 }, { "epoch": 0.8308103294746215, "ewc_loss": 0.046586282551288605, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.000186321820365265, "grad_norm": 5.504691123962402, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8533166646957397, "num_tokens": 249202935.0, "step": 6531 }, { "epoch": 0.830937539753212, "ewc_loss": 0.04660847783088684, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018654376617632806, "grad_norm": 5.534807205200195, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8671004176139832, "num_tokens": 249241061.0, "step": 6532 }, { "epoch": 0.8310647500318026, "ewc_loss": 0.04661767929792404, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018663579248823225, "grad_norm": 5.531524658203125, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8486853837966919, "num_tokens": 249277863.0, "step": 6533 }, { "epoch": 0.8311919603103931, "ewc_loss": 0.04659871757030487, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018644613737706095, "grad_norm": 5.512439250946045, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8602167963981628, "num_tokens": 249318047.0, "step": 6534 }, { "epoch": 0.8313191705889836, "ewc_loss": 0.046597760170698166, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001864365767687559, "grad_norm": 5.5194172859191895, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8553036451339722, "num_tokens": 249362095.0, "step": 6535 }, { "epoch": 0.831446380867574, "ewc_loss": 0.04653219133615494, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018578089657239616, "grad_norm": 5.528990745544434, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8733658194541931, "num_tokens": 249403362.0, "step": 6536 }, { "epoch": 0.8315735911461646, "ewc_loss": 0.04658646881580353, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018632365390658379, "grad_norm": 5.517725944519043, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8673076629638672, "num_tokens": 249438051.0, "step": 6537 }, { "epoch": 0.8317008014247551, "ewc_loss": 0.0463494136929512, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018639452173374593, "grad_norm": 5.529963493347168, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8653579950332642, "num_tokens": 249477185.0, "step": 6538 }, { "epoch": 0.8318280117033456, "ewc_loss": 0.04656318575143814, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001860908669186756, "grad_norm": 5.54776668548584, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8528867363929749, "num_tokens": 249516880.0, "step": 6539 }, { "epoch": 0.8319552219819362, "ewc_loss": 0.0465850830078125, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018630982958711684, "grad_norm": 5.481730937957764, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8700649738311768, "num_tokens": 249555864.0, "step": 6540 }, { "epoch": 0.8320824322605267, "ewc_loss": 0.04633600264787674, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018626039673108608, "grad_norm": 5.542165756225586, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.860589861869812, "num_tokens": 249590566.0, "step": 6541 }, { "epoch": 0.8322096425391171, "ewc_loss": 0.04645953327417374, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018627500685397536, "grad_norm": 5.4687299728393555, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8726582527160645, "num_tokens": 249631523.0, "step": 6542 }, { "epoch": 0.8323368528177076, "ewc_loss": 0.046451106667518616, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018619076581671834, "grad_norm": 5.498141765594482, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8690899610519409, "num_tokens": 249674113.0, "step": 6543 }, { "epoch": 0.8324640630962982, "ewc_loss": 0.046397581696510315, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018565553182270378, "grad_norm": 5.494603633880615, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8715289831161499, "num_tokens": 249712934.0, "step": 6544 }, { "epoch": 0.8325912733748887, "ewc_loss": 0.04651997983455658, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018687950796447694, "grad_norm": 5.587718963623047, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8659567832946777, "num_tokens": 249750295.0, "step": 6545 }, { "epoch": 0.8327184836534792, "ewc_loss": 0.046453364193439484, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001862133212853223, "grad_norm": 5.496379852294922, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8654592633247375, "num_tokens": 249790289.0, "step": 6546 }, { "epoch": 0.8328456939320698, "ewc_loss": 0.0462900809943676, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018580119649413973, "grad_norm": 5.581906318664551, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8739734888076782, "num_tokens": 249822873.0, "step": 6547 }, { "epoch": 0.8329729042106602, "ewc_loss": 0.04635671526193619, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001864675577962771, "grad_norm": 5.516993999481201, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8609825372695923, "num_tokens": 249860564.0, "step": 6548 }, { "epoch": 0.8331001144892507, "ewc_loss": 0.04630354791879654, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001859358453657478, "grad_norm": 5.502238750457764, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8601886034011841, "num_tokens": 249901448.0, "step": 6549 }, { "epoch": 0.8332273247678412, "ewc_loss": 0.04631728678941727, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001860732736531645, "grad_norm": 5.619025707244873, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8504976034164429, "num_tokens": 249933524.0, "step": 6550 }, { "epoch": 0.8333545350464318, "ewc_loss": 0.04645024985074997, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018618219473864883, "grad_norm": 5.520583629608154, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8561060428619385, "num_tokens": 249971856.0, "step": 6551 }, { "epoch": 0.8334817453250223, "ewc_loss": 0.04657121002674103, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018617110617924482, "grad_norm": 5.573981285095215, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8534508347511292, "num_tokens": 250013940.0, "step": 6552 }, { "epoch": 0.8336089556036128, "ewc_loss": 0.04644623398780823, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018614200234878808, "grad_norm": 5.659934997558594, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8428173065185547, "num_tokens": 250044826.0, "step": 6553 }, { "epoch": 0.8337361658822032, "ewc_loss": 0.04630983993411064, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001859987823991105, "grad_norm": 5.455234050750732, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8443697094917297, "num_tokens": 250087409.0, "step": 6554 }, { "epoch": 0.8338633761607938, "ewc_loss": 0.04626176506280899, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018551803077571094, "grad_norm": 5.531739234924316, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8674124479293823, "num_tokens": 250126236.0, "step": 6555 }, { "epoch": 0.8339905864393843, "ewc_loss": 0.046337954699993134, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.000186279954505153, "grad_norm": 5.49249792098999, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8684313297271729, "num_tokens": 250163559.0, "step": 6556 }, { "epoch": 0.8341177967179748, "ewc_loss": 0.046320103108882904, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018610140250530094, "grad_norm": 5.50788688659668, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8615601062774658, "num_tokens": 250203536.0, "step": 6557 }, { "epoch": 0.8342450069965653, "ewc_loss": 0.04659204185009003, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018637938774190843, "grad_norm": 5.55059289932251, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8664398193359375, "num_tokens": 250237455.0, "step": 6558 }, { "epoch": 0.8343722172751559, "ewc_loss": 0.046587008982896805, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018632908177096397, "grad_norm": 5.5261454582214355, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8687698841094971, "num_tokens": 250274559.0, "step": 6559 }, { "epoch": 0.8344994275537464, "ewc_loss": 0.04648841172456741, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018656381871551275, "grad_norm": 5.549102783203125, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8622828722000122, "num_tokens": 250309392.0, "step": 6560 }, { "epoch": 0.8346266378323368, "ewc_loss": 0.046461425721645355, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018629396799951792, "grad_norm": 5.492356777191162, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8653581142425537, "num_tokens": 250352010.0, "step": 6561 }, { "epoch": 0.8347538481109273, "ewc_loss": 0.04647032171487808, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018638288020156324, "grad_norm": 5.687519073486328, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8488174080848694, "num_tokens": 250385745.0, "step": 6562 }, { "epoch": 0.8348810583895179, "ewc_loss": 0.046602487564086914, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018648385594133288, "grad_norm": 5.457513332366943, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.859617292881012, "num_tokens": 250426055.0, "step": 6563 }, { "epoch": 0.8350082686681084, "ewc_loss": 0.0465441457927227, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001859004405559972, "grad_norm": 5.546402931213379, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8580974340438843, "num_tokens": 250468795.0, "step": 6564 }, { "epoch": 0.8351354789466989, "ewc_loss": 0.04635320603847504, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.0001864324731286615, "grad_norm": 5.492194652557373, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8587865829467773, "num_tokens": 250509115.0, "step": 6565 }, { "epoch": 0.8352626892252895, "ewc_loss": 0.04631740599870682, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018607443780638278, "grad_norm": 5.522960186004639, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.86138916015625, "num_tokens": 250547770.0, "step": 6566 }, { "epoch": 0.8353898995038799, "ewc_loss": 0.04634876549243927, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018638806068338454, "grad_norm": 5.529307842254639, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8581708669662476, "num_tokens": 250589737.0, "step": 6567 }, { "epoch": 0.8355171097824704, "ewc_loss": 0.04635997489094734, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018650013953447342, "grad_norm": 5.605870246887207, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8679728507995605, "num_tokens": 250623058.0, "step": 6568 }, { "epoch": 0.8356443200610609, "ewc_loss": 0.046373240649700165, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018663280934561044, "grad_norm": 5.550615310668945, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8605237007141113, "num_tokens": 250660526.0, "step": 6569 }, { "epoch": 0.8357715303396515, "ewc_loss": 0.04635853320360184, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018648570403456688, "grad_norm": 5.516003608703613, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8456842303276062, "num_tokens": 250698218.0, "step": 6570 }, { "epoch": 0.835898740618242, "ewc_loss": 0.046363040804862976, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018653077131602913, "grad_norm": 5.552158832550049, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8531324863433838, "num_tokens": 250743410.0, "step": 6571 }, { "epoch": 0.8360259508968325, "ewc_loss": 0.04661141708493233, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018657316104508936, "grad_norm": 5.49312686920166, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8708851933479309, "num_tokens": 250781919.0, "step": 6572 }, { "epoch": 0.8361531611754229, "ewc_loss": 0.046531908214092255, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018699874635785818, "grad_norm": 5.594995975494385, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8579837083816528, "num_tokens": 250817051.0, "step": 6573 }, { "epoch": 0.8362803714540135, "ewc_loss": 0.04643481224775314, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018724851543083787, "grad_norm": 5.532582759857178, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8579086661338806, "num_tokens": 250857241.0, "step": 6574 }, { "epoch": 0.836407581732604, "ewc_loss": 0.04659421741962433, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001864011719590053, "grad_norm": 5.495761871337891, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8617048859596252, "num_tokens": 250903591.0, "step": 6575 }, { "epoch": 0.8365347920111945, "ewc_loss": 0.046526260673999786, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018694228492677212, "grad_norm": 5.5392045974731445, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8531703948974609, "num_tokens": 250945788.0, "step": 6576 }, { "epoch": 0.836662002289785, "ewc_loss": 0.04654351621866226, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018711482698563486, "grad_norm": 5.538133144378662, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8543496131896973, "num_tokens": 250984884.0, "step": 6577 }, { "epoch": 0.8367892125683756, "ewc_loss": 0.04649496078491211, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018662931688595563, "grad_norm": 5.495588302612305, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.872328519821167, "num_tokens": 251024347.0, "step": 6578 }, { "epoch": 0.836916422846966, "ewc_loss": 0.04665283113718033, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018698730855248868, "grad_norm": 5.542403221130371, "learning_rate": 1e-06, "loss": 0.3921, "mean_token_accuracy": 0.8823468089103699, "num_tokens": 251068527.0, "step": 6579 }, { "epoch": 0.8370436331255565, "ewc_loss": 0.0466204434633255, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001866634120233357, "grad_norm": 5.509722709655762, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8652457594871521, "num_tokens": 251109914.0, "step": 6580 }, { "epoch": 0.837170843404147, "ewc_loss": 0.046631116420030594, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018677015032153577, "grad_norm": 5.613818645477295, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.851475715637207, "num_tokens": 251144760.0, "step": 6581 }, { "epoch": 0.8372980536827376, "ewc_loss": 0.04660041257739067, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018646310491021723, "grad_norm": 5.499329090118408, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8589120507240295, "num_tokens": 251184456.0, "step": 6582 }, { "epoch": 0.8374252639613281, "ewc_loss": 0.046611178666353226, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001865707745309919, "grad_norm": 5.585843086242676, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8517658710479736, "num_tokens": 251229493.0, "step": 6583 }, { "epoch": 0.8375524742399186, "ewc_loss": 0.04667271673679352, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018718616047408432, "grad_norm": 5.601634979248047, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8660224676132202, "num_tokens": 251270886.0, "step": 6584 }, { "epoch": 0.837679684518509, "ewc_loss": 0.04658958688378334, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001863548532128334, "grad_norm": 5.580939292907715, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.869782567024231, "num_tokens": 251304833.0, "step": 6585 }, { "epoch": 0.8378068947970996, "ewc_loss": 0.04634300246834755, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018633042054716498, "grad_norm": 5.598701000213623, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8643819093704224, "num_tokens": 251334196.0, "step": 6586 }, { "epoch": 0.8379341050756901, "ewc_loss": 0.04645527899265289, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001862324570538476, "grad_norm": 5.503482818603516, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8503671884536743, "num_tokens": 251380278.0, "step": 6587 }, { "epoch": 0.8380613153542806, "ewc_loss": 0.046595461666584015, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001864135847426951, "grad_norm": 5.572476387023926, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8551417589187622, "num_tokens": 251418432.0, "step": 6588 }, { "epoch": 0.8381885256328712, "ewc_loss": 0.04657562077045441, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001862151693785563, "grad_norm": 5.526041507720947, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8577769994735718, "num_tokens": 251453747.0, "step": 6589 }, { "epoch": 0.8383157359114617, "ewc_loss": 0.046611908823251724, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018657807959243655, "grad_norm": 5.56941556930542, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8609229326248169, "num_tokens": 251492439.0, "step": 6590 }, { "epoch": 0.8384429461900521, "ewc_loss": 0.04654170945286751, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001858760806499049, "grad_norm": 5.553282260894775, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8621999025344849, "num_tokens": 251527945.0, "step": 6591 }, { "epoch": 0.8385701564686426, "ewc_loss": 0.046449631452560425, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018617601017467678, "grad_norm": 5.577376842498779, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8701053857803345, "num_tokens": 251567161.0, "step": 6592 }, { "epoch": 0.8386973667472332, "ewc_loss": 0.04634057730436325, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018630614795256406, "grad_norm": 5.599902629852295, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.861209511756897, "num_tokens": 251601189.0, "step": 6593 }, { "epoch": 0.8388245770258237, "ewc_loss": 0.04631941393017769, "ewc_loss_diag": 2.765655517578125e-05, "ewc_loss_parallel": 0.00018609453400131315, "grad_norm": 5.587861061096191, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.858093798160553, "num_tokens": 251630846.0, "step": 6594 }, { "epoch": 0.8389517873044142, "ewc_loss": 0.0465620718896389, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018607970559969544, "grad_norm": 5.558635711669922, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.867672324180603, "num_tokens": 251665701.0, "step": 6595 }, { "epoch": 0.8390789975830047, "ewc_loss": 0.046541981399059296, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018587878730613738, "grad_norm": 5.594476699829102, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8526812791824341, "num_tokens": 251699909.0, "step": 6596 }, { "epoch": 0.8392062078615952, "ewc_loss": 0.04657954350113869, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018625441589392722, "grad_norm": 5.580477237701416, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8583760261535645, "num_tokens": 251734052.0, "step": 6597 }, { "epoch": 0.8393334181401857, "ewc_loss": 0.04656316339969635, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018609061953611672, "grad_norm": 5.543448448181152, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.848495364189148, "num_tokens": 251772518.0, "step": 6598 }, { "epoch": 0.8394606284187762, "ewc_loss": 0.04658550024032593, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018631400598678738, "grad_norm": 5.501901626586914, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8576226234436035, "num_tokens": 251816378.0, "step": 6599 }, { "epoch": 0.8395878386973668, "ewc_loss": 0.046584807336330414, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.000186307035619393, "grad_norm": 5.59576416015625, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8596535325050354, "num_tokens": 251851264.0, "step": 6600 }, { "epoch": 0.8397150489759573, "ewc_loss": 0.04663756489753723, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018683461530599743, "grad_norm": 5.562567234039307, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8625891208648682, "num_tokens": 251888906.0, "step": 6601 }, { "epoch": 0.8398422592545478, "ewc_loss": 0.046603210270404816, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001864911027951166, "grad_norm": 5.496527194976807, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8671973943710327, "num_tokens": 251930641.0, "step": 6602 }, { "epoch": 0.8399694695331382, "ewc_loss": 0.04652427136898041, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018692240701057017, "grad_norm": 5.599822044372559, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8426826000213623, "num_tokens": 251968929.0, "step": 6603 }, { "epoch": 0.8400966798117288, "ewc_loss": 0.04653293639421463, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001870090636657551, "grad_norm": 5.563429832458496, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8525012731552124, "num_tokens": 252006751.0, "step": 6604 }, { "epoch": 0.8402238900903193, "ewc_loss": 0.046621765941381454, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018667663971427828, "grad_norm": 5.5452117919921875, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8562138676643372, "num_tokens": 252045861.0, "step": 6605 }, { "epoch": 0.8403511003689098, "ewc_loss": 0.04659648239612579, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018642382929101586, "grad_norm": 5.548561096191406, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.853147029876709, "num_tokens": 252084412.0, "step": 6606 }, { "epoch": 0.8404783106475003, "ewc_loss": 0.046655524522066116, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018701422959566116, "grad_norm": 5.55252742767334, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8653870224952698, "num_tokens": 252120576.0, "step": 6607 }, { "epoch": 0.8406055209260909, "ewc_loss": 0.046612538397312164, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001865843660198152, "grad_norm": 5.516397476196289, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8580087423324585, "num_tokens": 252161636.0, "step": 6608 }, { "epoch": 0.8407327312046813, "ewc_loss": 0.0464942529797554, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.00018662223010323942, "grad_norm": 5.515200614929199, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8617148399353027, "num_tokens": 252199669.0, "step": 6609 }, { "epoch": 0.8408599414832718, "ewc_loss": 0.04667062312364578, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018716523481998593, "grad_norm": 5.553638935089111, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.868523359298706, "num_tokens": 252236832.0, "step": 6610 }, { "epoch": 0.8409871517618623, "ewc_loss": 0.046675194054841995, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.000187210927833803, "grad_norm": 5.5647735595703125, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8556421995162964, "num_tokens": 252282518.0, "step": 6611 }, { "epoch": 0.8411143620404529, "ewc_loss": 0.04647423326969147, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001864219957496971, "grad_norm": 5.544530868530273, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.855891227722168, "num_tokens": 252316479.0, "step": 6612 }, { "epoch": 0.8412415723190434, "ewc_loss": 0.04665497690439224, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001870087580755353, "grad_norm": 5.617093086242676, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8540506362915039, "num_tokens": 252353789.0, "step": 6613 }, { "epoch": 0.8413687825976339, "ewc_loss": 0.04661860689520836, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001866450475063175, "grad_norm": 5.546520709991455, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8628742694854736, "num_tokens": 252391609.0, "step": 6614 }, { "epoch": 0.8414959928762245, "ewc_loss": 0.04660804569721222, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018653941515367478, "grad_norm": 5.505805969238281, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8743515014648438, "num_tokens": 252433514.0, "step": 6615 }, { "epoch": 0.8416232031548149, "ewc_loss": 0.04660622030496597, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001865212107077241, "grad_norm": 5.589475154876709, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8644434213638306, "num_tokens": 252469700.0, "step": 6616 }, { "epoch": 0.8417504134334054, "ewc_loss": 0.04665689915418625, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001870279957074672, "grad_norm": 5.5876264572143555, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.857169508934021, "num_tokens": 252501658.0, "step": 6617 }, { "epoch": 0.8418776237119959, "ewc_loss": 0.046580970287323, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018626867677085102, "grad_norm": 5.605043888092041, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8620022535324097, "num_tokens": 252534546.0, "step": 6618 }, { "epoch": 0.8420048339905865, "ewc_loss": 0.04658021777868271, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018626115343067795, "grad_norm": 5.573192596435547, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8682602643966675, "num_tokens": 252573172.0, "step": 6619 }, { "epoch": 0.842132044269177, "ewc_loss": 0.04657498747110367, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001862088538473472, "grad_norm": 5.54219913482666, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8692588806152344, "num_tokens": 252607258.0, "step": 6620 }, { "epoch": 0.8422592545477675, "ewc_loss": 0.04660693183541298, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001865282974904403, "grad_norm": 5.564813613891602, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8710308074951172, "num_tokens": 252651451.0, "step": 6621 }, { "epoch": 0.8423864648263579, "ewc_loss": 0.04659946262836456, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001864536025095731, "grad_norm": 5.538836479187012, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8722387552261353, "num_tokens": 252686348.0, "step": 6622 }, { "epoch": 0.8425136751049485, "ewc_loss": 0.04659906029701233, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001864496007328853, "grad_norm": 5.622888565063477, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8552542328834534, "num_tokens": 252724316.0, "step": 6623 }, { "epoch": 0.842640885383539, "ewc_loss": 0.04657097905874252, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001861687924247235, "grad_norm": 5.5256476402282715, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8495460152626038, "num_tokens": 252764612.0, "step": 6624 }, { "epoch": 0.8427680956621295, "ewc_loss": 0.04657983034849167, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001862572826212272, "grad_norm": 5.605164527893066, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8397237062454224, "num_tokens": 252809510.0, "step": 6625 }, { "epoch": 0.84289530594072, "ewc_loss": 0.0465717650949955, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001861766359070316, "grad_norm": 5.544570446014404, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8587251305580139, "num_tokens": 252847598.0, "step": 6626 }, { "epoch": 0.8430225162193106, "ewc_loss": 0.04659555107355118, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018641448696143925, "grad_norm": 5.681236743927002, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8465319871902466, "num_tokens": 252884401.0, "step": 6627 }, { "epoch": 0.843149726497901, "ewc_loss": 0.04655049741268158, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018596395966596901, "grad_norm": 5.485414981842041, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.843614399433136, "num_tokens": 252928348.0, "step": 6628 }, { "epoch": 0.8432769367764915, "ewc_loss": 0.04658444970846176, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018630347040016204, "grad_norm": 5.702269554138184, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8590867519378662, "num_tokens": 252965084.0, "step": 6629 }, { "epoch": 0.843404147055082, "ewc_loss": 0.04662215709686279, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018668056873138994, "grad_norm": 5.506610870361328, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8642711639404297, "num_tokens": 253003697.0, "step": 6630 }, { "epoch": 0.8435313573336726, "ewc_loss": 0.04657275602221489, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001861865457613021, "grad_norm": 5.715631008148193, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8452601432800293, "num_tokens": 253037239.0, "step": 6631 }, { "epoch": 0.8436585676122631, "ewc_loss": 0.04662476107478142, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018670658755581826, "grad_norm": 5.492838382720947, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8515123128890991, "num_tokens": 253078163.0, "step": 6632 }, { "epoch": 0.8437857778908536, "ewc_loss": 0.04660549759864807, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018651394930202514, "grad_norm": 5.6101250648498535, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8586670160293579, "num_tokens": 253118907.0, "step": 6633 }, { "epoch": 0.843912988169444, "ewc_loss": 0.04666629806160927, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018712195742409676, "grad_norm": 5.614779949188232, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8783468008041382, "num_tokens": 253153132.0, "step": 6634 }, { "epoch": 0.8440401984480346, "ewc_loss": 0.04654601961374283, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018591918342281133, "grad_norm": 5.573788642883301, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8568034172058105, "num_tokens": 253194446.0, "step": 6635 }, { "epoch": 0.8441674087266251, "ewc_loss": 0.04658222198486328, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018628122052177787, "grad_norm": 5.5663018226623535, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8625829219818115, "num_tokens": 253230798.0, "step": 6636 }, { "epoch": 0.8442946190052156, "ewc_loss": 0.04659218341112137, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018638082838151604, "grad_norm": 5.563478946685791, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8529558181762695, "num_tokens": 253269029.0, "step": 6637 }, { "epoch": 0.8444218292838062, "ewc_loss": 0.04662172123789787, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018667620315682143, "grad_norm": 5.566119194030762, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8564766645431519, "num_tokens": 253310656.0, "step": 6638 }, { "epoch": 0.8445490395623967, "ewc_loss": 0.04657910764217377, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018625006487127393, "grad_norm": 5.554394721984863, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8785238265991211, "num_tokens": 253346882.0, "step": 6639 }, { "epoch": 0.8446762498409871, "ewc_loss": 0.046619951725006104, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018665850802790374, "grad_norm": 5.592446327209473, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8509101271629333, "num_tokens": 253389601.0, "step": 6640 }, { "epoch": 0.8448034601195776, "ewc_loss": 0.046620070934295654, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018665968673303723, "grad_norm": 5.5663628578186035, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8583554625511169, "num_tokens": 253428725.0, "step": 6641 }, { "epoch": 0.8449306703981682, "ewc_loss": 0.046585649251937866, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018631546117831022, "grad_norm": 5.684650421142578, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8741944432258606, "num_tokens": 253461404.0, "step": 6642 }, { "epoch": 0.8450578806767587, "ewc_loss": 0.046634528785943985, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018680427456274629, "grad_norm": 5.575567245483398, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8641505837440491, "num_tokens": 253500324.0, "step": 6643 }, { "epoch": 0.8451850909553492, "ewc_loss": 0.046566739678382874, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018612640269566327, "grad_norm": 5.586211681365967, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8478055000305176, "num_tokens": 253542500.0, "step": 6644 }, { "epoch": 0.8453123012339397, "ewc_loss": 0.046620044857263565, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018665943935047835, "grad_norm": 5.620176792144775, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.84255051612854, "num_tokens": 253581887.0, "step": 6645 }, { "epoch": 0.8454395115125302, "ewc_loss": 0.046567998826503754, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018613896099850535, "grad_norm": 5.528438568115234, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8681972622871399, "num_tokens": 253616942.0, "step": 6646 }, { "epoch": 0.8455667217911207, "ewc_loss": 0.04661343991756439, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018659337365534157, "grad_norm": 5.5980610847473145, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.854610800743103, "num_tokens": 253652610.0, "step": 6647 }, { "epoch": 0.8456939320697112, "ewc_loss": 0.04661155864596367, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018657457258086652, "grad_norm": 5.594125270843506, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8585245609283447, "num_tokens": 253686269.0, "step": 6648 }, { "epoch": 0.8458211423483017, "ewc_loss": 0.04661111161112785, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001865701051428914, "grad_norm": 5.5669732093811035, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8546663522720337, "num_tokens": 253723632.0, "step": 6649 }, { "epoch": 0.8459483526268923, "ewc_loss": 0.04650542140007019, "ewc_loss_diag": 2.777576446533203e-05, "ewc_loss_parallel": 0.0001867339015007019, "grad_norm": 5.587649345397949, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8534347414970398, "num_tokens": 253755990.0, "step": 6650 }, { "epoch": 0.8460755629054828, "ewc_loss": 0.04662259668111801, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018668493430595845, "grad_norm": 5.564443111419678, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8543124198913574, "num_tokens": 253794200.0, "step": 6651 }, { "epoch": 0.8462027731840732, "ewc_loss": 0.04661116003990173, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018657059990800917, "grad_norm": 5.518989562988281, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8713281750679016, "num_tokens": 253840554.0, "step": 6652 }, { "epoch": 0.8463299834626637, "ewc_loss": 0.04659469425678253, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018640593043528497, "grad_norm": 5.611705303192139, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8687890768051147, "num_tokens": 253873642.0, "step": 6653 }, { "epoch": 0.8464571937412543, "ewc_loss": 0.04663609713315964, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018681996152736247, "grad_norm": 5.574854850769043, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8650854825973511, "num_tokens": 253911839.0, "step": 6654 }, { "epoch": 0.8465844040198448, "ewc_loss": 0.046623870730400085, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.00018669771088752896, "grad_norm": 5.607880592346191, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8511433005332947, "num_tokens": 253944649.0, "step": 6655 }, { "epoch": 0.8467116142984353, "ewc_loss": 0.04667901620268822, "ewc_loss_diag": 2.7894973754882812e-05, "ewc_loss_parallel": 0.0001872491411631927, "grad_norm": 26.09177589416504, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8598842620849609, "num_tokens": 253977739.0, "step": 6656 }, { "epoch": 0.8468388245770259, "ewc_loss": 0.06272232532501221, "ewc_loss_diag": 2.8014183044433594e-05, "ewc_loss_parallel": 0.00034768221667036414, "grad_norm": 8.088886260986328, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.845856249332428, "num_tokens": 254021216.0, "step": 6657 }, { "epoch": 0.8469660348556163, "ewc_loss": 0.04799002781510353, "ewc_loss_diag": 2.8252601623535156e-05, "ewc_loss_parallel": 0.0001979178487090394, "grad_norm": 4.768773078918457, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8574883341789246, "num_tokens": 254063209.0, "step": 6658 }, { "epoch": 0.8470932451342068, "ewc_loss": 0.05370578169822693, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0002514132938813418, "grad_norm": 7.258535861968994, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8549983501434326, "num_tokens": 254102325.0, "step": 6659 }, { "epoch": 0.8472204554127973, "ewc_loss": 0.05769665166735649, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00029010127764195204, "grad_norm": 6.915570259094238, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8649043440818787, "num_tokens": 254136991.0, "step": 6660 }, { "epoch": 0.8473476656913879, "ewc_loss": 0.050961967557668686, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00022031302796676755, "grad_norm": 5.730031967163086, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8599339723587036, "num_tokens": 254179520.0, "step": 6661 }, { "epoch": 0.8474748759699784, "ewc_loss": 0.051641400903463364, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00022588667343370616, "grad_norm": 6.326662063598633, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8604277968406677, "num_tokens": 254220661.0, "step": 6662 }, { "epoch": 0.8476020862485689, "ewc_loss": 0.05235777050256729, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00023060894454829395, "grad_norm": 5.992961406707764, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8682493567466736, "num_tokens": 254261853.0, "step": 6663 }, { "epoch": 0.8477292965271594, "ewc_loss": 0.05025818943977356, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00021083383762743324, "grad_norm": 5.878853797912598, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.861897349357605, "num_tokens": 254301577.0, "step": 6664 }, { "epoch": 0.8478565068057499, "ewc_loss": 0.050398074090480804, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00021223269868642092, "grad_norm": 5.8382086753845215, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8743505477905273, "num_tokens": 254341792.0, "step": 6665 }, { "epoch": 0.8479837170843404, "ewc_loss": 0.04961178079247475, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020559046242851764, "grad_norm": 5.837243556976318, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8473856449127197, "num_tokens": 254376755.0, "step": 6666 }, { "epoch": 0.8481109273629309, "ewc_loss": 0.04940688610076904, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020354149455670267, "grad_norm": 5.724388599395752, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.869601309299469, "num_tokens": 254413565.0, "step": 6667 }, { "epoch": 0.8482381376415215, "ewc_loss": 0.049010731279850006, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00019957996846642345, "grad_norm": 5.784884452819824, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8493585586547852, "num_tokens": 254449889.0, "step": 6668 }, { "epoch": 0.848365347920112, "ewc_loss": 0.0488826222717762, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00019829887605737895, "grad_norm": 5.694827556610107, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8734337091445923, "num_tokens": 254488129.0, "step": 6669 }, { "epoch": 0.8484925581987025, "ewc_loss": 0.048476941883563995, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00019424206402618438, "grad_norm": 5.612587928771973, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8713052868843079, "num_tokens": 254522031.0, "step": 6670 }, { "epoch": 0.8486197684772929, "ewc_loss": 0.048334840685129166, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00019282105495221913, "grad_norm": 5.617320537567139, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8713177442550659, "num_tokens": 254559249.0, "step": 6671 }, { "epoch": 0.8487469787558835, "ewc_loss": 0.0480990968644619, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00019168433209415525, "grad_norm": 5.586519241333008, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8537139296531677, "num_tokens": 254603968.0, "step": 6672 }, { "epoch": 0.848874189034474, "ewc_loss": 0.04795780032873154, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00019027134112548083, "grad_norm": 5.52207612991333, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8699531555175781, "num_tokens": 254642178.0, "step": 6673 }, { "epoch": 0.8490013993130645, "ewc_loss": 0.047864608466625214, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00018933943647425622, "grad_norm": 5.545658588409424, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8526185750961304, "num_tokens": 254679129.0, "step": 6674 }, { "epoch": 0.849128609591655, "ewc_loss": 0.04756172001361847, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00018875199020840228, "grad_norm": 5.643837928771973, "learning_rate": 1e-06, "loss": 0.5647, "mean_token_accuracy": 0.8314236402511597, "num_tokens": 254714952.0, "step": 6675 }, { "epoch": 0.8492558198702456, "ewc_loss": 0.04738627374172211, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00018821822595782578, "grad_norm": 5.609368324279785, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8576970100402832, "num_tokens": 254743323.0, "step": 6676 }, { "epoch": 0.849383030148836, "ewc_loss": 0.04740241914987564, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00018715893384069204, "grad_norm": 5.477377891540527, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8664476275444031, "num_tokens": 254784749.0, "step": 6677 }, { "epoch": 0.8495102404274265, "ewc_loss": 0.04725874215364456, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00018694286700338125, "grad_norm": 5.529111862182617, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8661492466926575, "num_tokens": 254820102.0, "step": 6678 }, { "epoch": 0.849637450706017, "ewc_loss": 0.04729115590453148, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00018726702546700835, "grad_norm": 5.525984287261963, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8574358224868774, "num_tokens": 254859988.0, "step": 6679 }, { "epoch": 0.8497646609846076, "ewc_loss": 0.0472443588078022, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00018679906497709453, "grad_norm": 5.49897575378418, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8555128574371338, "num_tokens": 254906792.0, "step": 6680 }, { "epoch": 0.8498918712631981, "ewc_loss": 0.04725276678800583, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001868831313913688, "grad_norm": 5.598963737487793, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.862619161605835, "num_tokens": 254941552.0, "step": 6681 }, { "epoch": 0.8500190815417886, "ewc_loss": 0.04720524698495865, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00018640795315150172, "grad_norm": 5.555971622467041, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8525368571281433, "num_tokens": 254974653.0, "step": 6682 }, { "epoch": 0.850146291820379, "ewc_loss": 0.04714612662792206, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001858167233876884, "grad_norm": 5.487112522125244, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8446202278137207, "num_tokens": 255018418.0, "step": 6683 }, { "epoch": 0.8502735020989696, "ewc_loss": 0.047190986573696136, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00018626531527843326, "grad_norm": 5.858479022979736, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8648731708526611, "num_tokens": 255049626.0, "step": 6684 }, { "epoch": 0.8504007123775601, "ewc_loss": 0.04718717932701111, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00018622724746819586, "grad_norm": 5.455092430114746, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8546569347381592, "num_tokens": 255086323.0, "step": 6685 }, { "epoch": 0.8505279226561506, "ewc_loss": 0.04701615497469902, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001845170190790668, "grad_norm": 5.529932498931885, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8693915605545044, "num_tokens": 255124102.0, "step": 6686 }, { "epoch": 0.8506551329347412, "ewc_loss": 0.04688569903373718, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001856538438005373, "grad_norm": 5.497835159301758, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8635872602462769, "num_tokens": 255162176.0, "step": 6687 }, { "epoch": 0.8507823432133317, "ewc_loss": 0.04685533046722412, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018535018898546696, "grad_norm": 5.563172817230225, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8631255626678467, "num_tokens": 255196256.0, "step": 6688 }, { "epoch": 0.8509095534919221, "ewc_loss": 0.04692213982343674, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001860182819655165, "grad_norm": 5.539290904998779, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8540530204772949, "num_tokens": 255236752.0, "step": 6689 }, { "epoch": 0.8510367637705126, "ewc_loss": 0.04684625193476677, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018525938503444195, "grad_norm": 5.551717281341553, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8604410886764526, "num_tokens": 255268694.0, "step": 6690 }, { "epoch": 0.8511639740491032, "ewc_loss": 0.04687092453241348, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018550612730905414, "grad_norm": 5.461085796356201, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8662495613098145, "num_tokens": 255309445.0, "step": 6691 }, { "epoch": 0.8512911843276937, "ewc_loss": 0.046829741448163986, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018509429355617613, "grad_norm": 5.542044639587402, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8633129596710205, "num_tokens": 255341126.0, "step": 6692 }, { "epoch": 0.8514183946062842, "ewc_loss": 0.04681219160556793, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018491881201043725, "grad_norm": 5.4825358390808105, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8695937991142273, "num_tokens": 255379553.0, "step": 6693 }, { "epoch": 0.8515456048848747, "ewc_loss": 0.04681824520230293, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018497933342587203, "grad_norm": 5.504087448120117, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8599709272384644, "num_tokens": 255416588.0, "step": 6694 }, { "epoch": 0.8516728151634652, "ewc_loss": 0.04685491323471069, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001853459980338812, "grad_norm": 5.534899711608887, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8472913503646851, "num_tokens": 255449574.0, "step": 6695 }, { "epoch": 0.8518000254420557, "ewc_loss": 0.04683809354901314, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018517780699767172, "grad_norm": 5.456015586853027, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8589884042739868, "num_tokens": 255494088.0, "step": 6696 }, { "epoch": 0.8519272357206462, "ewc_loss": 0.04684872925281525, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018528415239416063, "grad_norm": 5.588430881500244, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8684920072555542, "num_tokens": 255529785.0, "step": 6697 }, { "epoch": 0.8520544459992367, "ewc_loss": 0.04682611674070358, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018505803018342704, "grad_norm": 5.503203392028809, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8470574617385864, "num_tokens": 255566675.0, "step": 6698 }, { "epoch": 0.8521816562778273, "ewc_loss": 0.04678540676832199, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001846509549068287, "grad_norm": 5.467663764953613, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8651033639907837, "num_tokens": 255606502.0, "step": 6699 }, { "epoch": 0.8523088665564178, "ewc_loss": 0.04686826467514038, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001854795409599319, "grad_norm": 5.558699607849121, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8515830039978027, "num_tokens": 255649179.0, "step": 6700 }, { "epoch": 0.8524360768350082, "ewc_loss": 0.046866144984960556, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018545832426752895, "grad_norm": 5.5569000244140625, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8602434396743774, "num_tokens": 255688190.0, "step": 6701 }, { "epoch": 0.8525632871135987, "ewc_loss": 0.046812862157821655, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018492552044335753, "grad_norm": 6.623505115509033, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8584843873977661, "num_tokens": 255730401.0, "step": 6702 }, { "epoch": 0.8526904973921893, "ewc_loss": 0.04752284288406372, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001920252834679559, "grad_norm": 5.489923000335693, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8552624583244324, "num_tokens": 255767881.0, "step": 6703 }, { "epoch": 0.8528177076707798, "ewc_loss": 0.04660411551594734, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018283803365193307, "grad_norm": 5.536126136779785, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8516311645507812, "num_tokens": 255808347.0, "step": 6704 }, { "epoch": 0.8529449179493703, "ewc_loss": 0.04696021229028702, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018639898917172104, "grad_norm": 5.57220458984375, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.842065691947937, "num_tokens": 255849538.0, "step": 6705 }, { "epoch": 0.8530721282279609, "ewc_loss": 0.04684997349977493, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018529662338551134, "grad_norm": 5.534513473510742, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8711879849433899, "num_tokens": 255890228.0, "step": 6706 }, { "epoch": 0.8531993385065513, "ewc_loss": 0.046867359429597855, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001854704605648294, "grad_norm": 5.509660720825195, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8585163354873657, "num_tokens": 255930189.0, "step": 6707 }, { "epoch": 0.8533265487851418, "ewc_loss": 0.04684097319841385, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018520660523790866, "grad_norm": 5.500574111938477, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8665599822998047, "num_tokens": 255968310.0, "step": 6708 }, { "epoch": 0.8534537590637323, "ewc_loss": 0.04717108607292175, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00018606631783768535, "grad_norm": 5.519244194030762, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8712043762207031, "num_tokens": 256005872.0, "step": 6709 }, { "epoch": 0.8535809693423229, "ewc_loss": 0.047147512435913086, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00018583059136290103, "grad_norm": 5.491517543792725, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8658573031425476, "num_tokens": 256045374.0, "step": 6710 }, { "epoch": 0.8537081796209134, "ewc_loss": 0.046864524483680725, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.000185442142537795, "grad_norm": 5.518616199493408, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8626315593719482, "num_tokens": 256084874.0, "step": 6711 }, { "epoch": 0.8538353898995039, "ewc_loss": 0.04692420735955238, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001860389456851408, "grad_norm": 5.630843162536621, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.853411853313446, "num_tokens": 256122251.0, "step": 6712 }, { "epoch": 0.8539626001780944, "ewc_loss": 0.046905964612960815, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018585650832392275, "grad_norm": 5.508434295654297, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8581415414810181, "num_tokens": 256165025.0, "step": 6713 }, { "epoch": 0.8540898104566849, "ewc_loss": 0.046855028718709946, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018534716218709946, "grad_norm": 5.466775894165039, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8664451837539673, "num_tokens": 256202220.0, "step": 6714 }, { "epoch": 0.8542170207352754, "ewc_loss": 0.04690703749656677, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018586726218927652, "grad_norm": 5.53769063949585, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8625129461288452, "num_tokens": 256234036.0, "step": 6715 }, { "epoch": 0.8543442310138659, "ewc_loss": 0.046922456473112106, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018602143973112106, "grad_norm": 5.5297040939331055, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8595287799835205, "num_tokens": 256275253.0, "step": 6716 }, { "epoch": 0.8544714412924564, "ewc_loss": 0.04690554738044739, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001858523755799979, "grad_norm": 5.5408034324646, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8578143119812012, "num_tokens": 256311296.0, "step": 6717 }, { "epoch": 0.854598651571047, "ewc_loss": 0.04688125476241112, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018560941680334508, "grad_norm": 5.492110252380371, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8648630380630493, "num_tokens": 256348149.0, "step": 6718 }, { "epoch": 0.8547258618496375, "ewc_loss": 0.04688527435064316, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018564962374512106, "grad_norm": 5.493365287780762, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8742872476577759, "num_tokens": 256387861.0, "step": 6719 }, { "epoch": 0.8548530721282279, "ewc_loss": 0.04685661569237709, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001853630383266136, "grad_norm": 5.543979644775391, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8694759607315063, "num_tokens": 256427131.0, "step": 6720 }, { "epoch": 0.8549802824068184, "ewc_loss": 0.04685303568840027, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018532724061515182, "grad_norm": 5.482394695281982, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8642902374267578, "num_tokens": 256465041.0, "step": 6721 }, { "epoch": 0.855107492685409, "ewc_loss": 0.04682234302163124, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001850203116191551, "grad_norm": 5.515449523925781, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8515124320983887, "num_tokens": 256502576.0, "step": 6722 }, { "epoch": 0.8552347029639995, "ewc_loss": 0.04686751961708069, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018547207582741976, "grad_norm": 5.56347131729126, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8624091148376465, "num_tokens": 256540493.0, "step": 6723 }, { "epoch": 0.85536191324259, "ewc_loss": 0.04690674692392349, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018586433725431561, "grad_norm": 5.549148082733154, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8446190357208252, "num_tokens": 256578115.0, "step": 6724 }, { "epoch": 0.8554891235211806, "ewc_loss": 0.04683496803045273, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018514657858759165, "grad_norm": 5.529475212097168, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8684546947479248, "num_tokens": 256615957.0, "step": 6725 }, { "epoch": 0.855616333799771, "ewc_loss": 0.04687456041574478, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001855424779932946, "grad_norm": 5.585856914520264, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.863047182559967, "num_tokens": 256649972.0, "step": 6726 }, { "epoch": 0.8557435440783615, "ewc_loss": 0.04686106741428375, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018540753808338195, "grad_norm": 5.490335464477539, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.867645800113678, "num_tokens": 256686683.0, "step": 6727 }, { "epoch": 0.855870754356952, "ewc_loss": 0.046831268817186356, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018510955851525068, "grad_norm": 5.615810394287109, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8591783046722412, "num_tokens": 256723587.0, "step": 6728 }, { "epoch": 0.8559979646355426, "ewc_loss": 0.046914003789424896, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018593693675938994, "grad_norm": 5.492026329040527, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.861087441444397, "num_tokens": 256765245.0, "step": 6729 }, { "epoch": 0.8561251749141331, "ewc_loss": 0.04683903604745865, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001851872366387397, "grad_norm": 5.452332496643066, "learning_rate": 1e-06, "loss": 0.3653, "mean_token_accuracy": 0.8871880769729614, "num_tokens": 256806930.0, "step": 6730 }, { "epoch": 0.8562523851927236, "ewc_loss": 0.046923667192459106, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018603353237267584, "grad_norm": 5.56697416305542, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8583406805992126, "num_tokens": 256844568.0, "step": 6731 }, { "epoch": 0.856379595471314, "ewc_loss": 0.04688692465424538, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018566612561699003, "grad_norm": 5.570127487182617, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8579808473587036, "num_tokens": 256877483.0, "step": 6732 }, { "epoch": 0.8565068057499046, "ewc_loss": 0.04690650850534439, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018586197984404862, "grad_norm": 5.5260467529296875, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8649647831916809, "num_tokens": 256921139.0, "step": 6733 }, { "epoch": 0.8566340160284951, "ewc_loss": 0.04692721739411354, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018606905359774828, "grad_norm": 5.644655704498291, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8608095645904541, "num_tokens": 256957861.0, "step": 6734 }, { "epoch": 0.8567612263070856, "ewc_loss": 0.04690633714199066, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018586024816613644, "grad_norm": 5.498132705688477, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8630840182304382, "num_tokens": 256998582.0, "step": 6735 }, { "epoch": 0.8568884365856761, "ewc_loss": 0.04692547023296356, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018605159129947424, "grad_norm": 5.572869777679443, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8586559891700745, "num_tokens": 257037638.0, "step": 6736 }, { "epoch": 0.8570156468642667, "ewc_loss": 0.0468965619802475, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018576248839963228, "grad_norm": 5.516486167907715, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8632369637489319, "num_tokens": 257074427.0, "step": 6737 }, { "epoch": 0.8571428571428571, "ewc_loss": 0.046642035245895386, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.0001856586168287322, "grad_norm": 5.53528356552124, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8638536334037781, "num_tokens": 257113826.0, "step": 6738 }, { "epoch": 0.8572700674214476, "ewc_loss": 0.04688446596264839, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001856415328802541, "grad_norm": 5.573453426361084, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8470966815948486, "num_tokens": 257152387.0, "step": 6739 }, { "epoch": 0.8573972777000382, "ewc_loss": 0.04693920537829399, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018618893227539957, "grad_norm": 5.590010643005371, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8628940582275391, "num_tokens": 257192083.0, "step": 6740 }, { "epoch": 0.8575244879786287, "ewc_loss": 0.04661384969949722, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.0001853767898865044, "grad_norm": 5.465032577514648, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8525552749633789, "num_tokens": 257235425.0, "step": 6741 }, { "epoch": 0.8576516982572192, "ewc_loss": 0.04670698568224907, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018630814156495035, "grad_norm": 5.646506309509277, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8567072153091431, "num_tokens": 257268621.0, "step": 6742 }, { "epoch": 0.8577789085358097, "ewc_loss": 0.046971678733825684, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018651364371180534, "grad_norm": 5.4851789474487305, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8653708696365356, "num_tokens": 257306909.0, "step": 6743 }, { "epoch": 0.8579061188144002, "ewc_loss": 0.046654582023620605, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018578408344183117, "grad_norm": 5.5431318283081055, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8750168085098267, "num_tokens": 257343388.0, "step": 6744 }, { "epoch": 0.8580333290929907, "ewc_loss": 0.04675360769033432, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018677438492886722, "grad_norm": 5.570737838745117, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8661274909973145, "num_tokens": 257377172.0, "step": 6745 }, { "epoch": 0.8581605393715812, "ewc_loss": 0.04698353260755539, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018663221271708608, "grad_norm": 5.555154800415039, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.853712797164917, "num_tokens": 257410312.0, "step": 6746 }, { "epoch": 0.8582877496501717, "ewc_loss": 0.04692814499139786, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001860783522715792, "grad_norm": 5.528903007507324, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8628244996070862, "num_tokens": 257449919.0, "step": 6747 }, { "epoch": 0.8584149599287623, "ewc_loss": 0.04672197997570038, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.0001864580699475482, "grad_norm": 5.502664566040039, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8719482421875, "num_tokens": 257488296.0, "step": 6748 }, { "epoch": 0.8585421702073528, "ewc_loss": 0.04695792496204376, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018637609900906682, "grad_norm": 5.561002731323242, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8693525791168213, "num_tokens": 257524678.0, "step": 6749 }, { "epoch": 0.8586693804859432, "ewc_loss": 0.0467253215610981, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018649149569682777, "grad_norm": 5.530613422393799, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8658339977264404, "num_tokens": 257559060.0, "step": 6750 }, { "epoch": 0.8587965907645337, "ewc_loss": 0.04673738032579422, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018661207286641002, "grad_norm": 5.536892890930176, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8705157041549683, "num_tokens": 257594340.0, "step": 6751 }, { "epoch": 0.8589238010431243, "ewc_loss": 0.04675929620862007, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018683123926166445, "grad_norm": 5.548199653625488, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8731334805488586, "num_tokens": 257627936.0, "step": 6752 }, { "epoch": 0.8590510113217148, "ewc_loss": 0.046716369688510895, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018640195776242763, "grad_norm": 5.572781562805176, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8573461771011353, "num_tokens": 257660311.0, "step": 6753 }, { "epoch": 0.8591782216003053, "ewc_loss": 0.04674091935157776, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018664744857233018, "grad_norm": 5.528539180755615, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.854132890701294, "num_tokens": 257697496.0, "step": 6754 }, { "epoch": 0.8593054318788959, "ewc_loss": 0.04674946516752243, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.0001867329265223816, "grad_norm": 5.624320983886719, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8425922393798828, "num_tokens": 257737242.0, "step": 6755 }, { "epoch": 0.8594326421574863, "ewc_loss": 0.04675934463739395, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018683174857869744, "grad_norm": 5.500052452087402, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8620607256889343, "num_tokens": 257772946.0, "step": 6756 }, { "epoch": 0.8595598524360768, "ewc_loss": 0.04672442376613617, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018648254626896232, "grad_norm": 5.613350868225098, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8564454317092896, "num_tokens": 257807198.0, "step": 6757 }, { "epoch": 0.8596870627146673, "ewc_loss": 0.04675731435418129, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018681143410503864, "grad_norm": 5.477714538574219, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8670400381088257, "num_tokens": 257848618.0, "step": 6758 }, { "epoch": 0.8598142729932579, "ewc_loss": 0.04670232534408569, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.0001862615317804739, "grad_norm": 5.521636486053467, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8732767701148987, "num_tokens": 257887805.0, "step": 6759 }, { "epoch": 0.8599414832718484, "ewc_loss": 0.04673255980014801, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018656387692317367, "grad_norm": 5.481513023376465, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8674359321594238, "num_tokens": 257929276.0, "step": 6760 }, { "epoch": 0.8600686935504389, "ewc_loss": 0.04698972404003143, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018669411656446755, "grad_norm": 5.663484573364258, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8555334806442261, "num_tokens": 257958629.0, "step": 6761 }, { "epoch": 0.8601959038290294, "ewc_loss": 0.04704486206173897, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001872455031843856, "grad_norm": 5.561337947845459, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8576059937477112, "num_tokens": 257990303.0, "step": 6762 }, { "epoch": 0.8603231141076199, "ewc_loss": 0.04700251296162605, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018682201334740967, "grad_norm": 5.5806355476379395, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.8447573184967041, "num_tokens": 258028704.0, "step": 6763 }, { "epoch": 0.8604503243862104, "ewc_loss": 0.04706145077943802, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018741139501798898, "grad_norm": 5.54180383682251, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.869387149810791, "num_tokens": 258062757.0, "step": 6764 }, { "epoch": 0.8605775346648009, "ewc_loss": 0.047020070254802704, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018699755310080945, "grad_norm": 5.559685230255127, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8606604337692261, "num_tokens": 258098574.0, "step": 6765 }, { "epoch": 0.8607047449433914, "ewc_loss": 0.04706800356507301, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001874769077403471, "grad_norm": 5.544061183929443, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8706552982330322, "num_tokens": 258130937.0, "step": 6766 }, { "epoch": 0.860831955221982, "ewc_loss": 0.04700677841901779, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.000186864665010944, "grad_norm": 5.559357166290283, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8633435964584351, "num_tokens": 258165142.0, "step": 6767 }, { "epoch": 0.8609591655005725, "ewc_loss": 0.0470682829618454, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018747971625998616, "grad_norm": 5.532649040222168, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8775267601013184, "num_tokens": 258203986.0, "step": 6768 }, { "epoch": 0.8610863757791629, "ewc_loss": 0.047303907573223114, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001873945293482393, "grad_norm": 5.56806755065918, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8619159460067749, "num_tokens": 258238692.0, "step": 6769 }, { "epoch": 0.8612135860577534, "ewc_loss": 0.04702082276344299, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018700507644098252, "grad_norm": 5.5713934898376465, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8414406776428223, "num_tokens": 258280942.0, "step": 6770 }, { "epoch": 0.861340796336344, "ewc_loss": 0.047055017203092575, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018734704644884914, "grad_norm": 5.588473796844482, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.863750696182251, "num_tokens": 258318835.0, "step": 6771 }, { "epoch": 0.8614680066149345, "ewc_loss": 0.04702075943350792, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018700446526054293, "grad_norm": 5.495050430297852, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8520628809928894, "num_tokens": 258359461.0, "step": 6772 }, { "epoch": 0.861595216893525, "ewc_loss": 0.04700111597776413, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018680805806070566, "grad_norm": 5.641317844390869, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8566880822181702, "num_tokens": 258394582.0, "step": 6773 }, { "epoch": 0.8617224271721156, "ewc_loss": 0.047017887234687805, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018697573977988213, "grad_norm": 5.496464729309082, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8645621538162231, "num_tokens": 258438526.0, "step": 6774 }, { "epoch": 0.861849637450706, "ewc_loss": 0.046988531947135925, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001866821839939803, "grad_norm": 5.5594482421875, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8476312160491943, "num_tokens": 258479332.0, "step": 6775 }, { "epoch": 0.8619768477292965, "ewc_loss": 0.04700707644224167, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018686761904973537, "grad_norm": 5.610377788543701, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8506858348846436, "num_tokens": 258513202.0, "step": 6776 }, { "epoch": 0.862104058007887, "ewc_loss": 0.047008708119392395, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018688397540245205, "grad_norm": 5.54324197769165, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8627060055732727, "num_tokens": 258549145.0, "step": 6777 }, { "epoch": 0.8622312682864776, "ewc_loss": 0.04700690507888794, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018686593102756888, "grad_norm": 5.6596574783325195, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8520785570144653, "num_tokens": 258586421.0, "step": 6778 }, { "epoch": 0.8623584785650681, "ewc_loss": 0.0470132902264595, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018692977027967572, "grad_norm": 5.603527545928955, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8625041842460632, "num_tokens": 258620014.0, "step": 6779 }, { "epoch": 0.8624856888436586, "ewc_loss": 0.04697684943675995, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001865653757704422, "grad_norm": 5.513509273529053, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8674035668373108, "num_tokens": 258656124.0, "step": 6780 }, { "epoch": 0.862612899122249, "ewc_loss": 0.04696470499038696, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.000186443910934031, "grad_norm": 5.501791954040527, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8492618799209595, "num_tokens": 258692537.0, "step": 6781 }, { "epoch": 0.8627401094008396, "ewc_loss": 0.047048017382621765, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018727703718468547, "grad_norm": 5.584981918334961, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8642439842224121, "num_tokens": 258732737.0, "step": 6782 }, { "epoch": 0.8628673196794301, "ewc_loss": 0.047089964151382446, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018769653979688883, "grad_norm": 5.538397312164307, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8490246534347534, "num_tokens": 258770634.0, "step": 6783 }, { "epoch": 0.8629945299580206, "ewc_loss": 0.0469987690448761, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018678455671761185, "grad_norm": 5.564512729644775, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8606230020523071, "num_tokens": 258809643.0, "step": 6784 }, { "epoch": 0.8631217402366111, "ewc_loss": 0.04707106202840805, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018750748131424189, "grad_norm": 5.547219276428223, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8655885457992554, "num_tokens": 258845336.0, "step": 6785 }, { "epoch": 0.8632489505152017, "ewc_loss": 0.0470217764377594, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018701462249737233, "grad_norm": 5.558537483215332, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8637281656265259, "num_tokens": 258882206.0, "step": 6786 }, { "epoch": 0.8633761607937921, "ewc_loss": 0.04705074429512024, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018730433657765388, "grad_norm": 5.52799129486084, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8685057163238525, "num_tokens": 258918661.0, "step": 6787 }, { "epoch": 0.8635033710723826, "ewc_loss": 0.047046005725860596, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018725695554167032, "grad_norm": 5.601532459259033, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8554336428642273, "num_tokens": 258957452.0, "step": 6788 }, { "epoch": 0.8636305813509731, "ewc_loss": 0.04712574928998947, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001880543422885239, "grad_norm": 5.571081161499023, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8519731163978577, "num_tokens": 259003507.0, "step": 6789 }, { "epoch": 0.8637577916295637, "ewc_loss": 0.04698476940393448, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001866445818450302, "grad_norm": 5.498805999755859, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8554893732070923, "num_tokens": 259042284.0, "step": 6790 }, { "epoch": 0.8638850019081542, "ewc_loss": 0.04705694317817688, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018736632773652673, "grad_norm": 5.5990519523620605, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8751386404037476, "num_tokens": 259082929.0, "step": 6791 }, { "epoch": 0.8640122121867447, "ewc_loss": 0.04706019163131714, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018739876395557076, "grad_norm": 5.534545421600342, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8760509490966797, "num_tokens": 259125920.0, "step": 6792 }, { "epoch": 0.8641394224653351, "ewc_loss": 0.04701218008995056, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018691869627218693, "grad_norm": 5.567046642303467, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8626630902290344, "num_tokens": 259163140.0, "step": 6793 }, { "epoch": 0.8642666327439257, "ewc_loss": 0.04707338660955429, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018753076437860727, "grad_norm": 5.6378607749938965, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8600386381149292, "num_tokens": 259199038.0, "step": 6794 }, { "epoch": 0.8643938430225162, "ewc_loss": 0.047014303505420685, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018693988386075944, "grad_norm": 5.5576324462890625, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.859379231929779, "num_tokens": 259235713.0, "step": 6795 }, { "epoch": 0.8645210533011067, "ewc_loss": 0.04702890291810036, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001870859123300761, "grad_norm": 5.522563457489014, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8515920639038086, "num_tokens": 259278314.0, "step": 6796 }, { "epoch": 0.8646482635796973, "ewc_loss": 0.04708024486899376, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001875993184512481, "grad_norm": 5.609842300415039, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8689705729484558, "num_tokens": 259313039.0, "step": 6797 }, { "epoch": 0.8647754738582878, "ewc_loss": 0.04702834039926529, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018708029529079795, "grad_norm": 5.55136775970459, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8677076697349548, "num_tokens": 259348845.0, "step": 6798 }, { "epoch": 0.8649026841368782, "ewc_loss": 0.04701574891805649, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018695434846449643, "grad_norm": 5.545356750488281, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8554109334945679, "num_tokens": 259387345.0, "step": 6799 }, { "epoch": 0.8650298944154687, "ewc_loss": 0.04701823741197586, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001869792613433674, "grad_norm": 5.528774738311768, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8598214983940125, "num_tokens": 259427835.0, "step": 6800 }, { "epoch": 0.8651571046940593, "ewc_loss": 0.04694754630327225, "ewc_loss_diag": 2.8252601623535156e-05, "ewc_loss_parallel": 0.00018749303126242012, "grad_norm": 5.608196258544922, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8649383783340454, "num_tokens": 259460453.0, "step": 6801 }, { "epoch": 0.8652843149726498, "ewc_loss": 0.04692105948925018, "ewc_loss_diag": 2.8252601623535156e-05, "ewc_loss_parallel": 0.00018722818640526384, "grad_norm": 5.513247013092041, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8665450811386108, "num_tokens": 259499296.0, "step": 6802 }, { "epoch": 0.8654115252512403, "ewc_loss": 0.04686863720417023, "ewc_loss_diag": 2.8252601623535156e-05, "ewc_loss_parallel": 0.00018670396821107715, "grad_norm": 5.581669807434082, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.856924295425415, "num_tokens": 259537649.0, "step": 6803 }, { "epoch": 0.8655387355298308, "ewc_loss": 0.046975746750831604, "ewc_loss_diag": 2.8252601623535156e-05, "ewc_loss_parallel": 0.0001877750182757154, "grad_norm": 5.540477752685547, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8533066511154175, "num_tokens": 259583136.0, "step": 6804 }, { "epoch": 0.8656659458084213, "ewc_loss": 0.04690203070640564, "ewc_loss_diag": 2.8252601623535156e-05, "ewc_loss_parallel": 0.00018703787645790726, "grad_norm": 5.558385372161865, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8647143840789795, "num_tokens": 259620422.0, "step": 6805 }, { "epoch": 0.8657931560870118, "ewc_loss": 0.046965502202510834, "ewc_loss_diag": 2.8252601623535156e-05, "ewc_loss_parallel": 0.0001876725727925077, "grad_norm": 5.5746588706970215, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8489246368408203, "num_tokens": 259663948.0, "step": 6806 }, { "epoch": 0.8659203663656023, "ewc_loss": 0.046947330236434937, "ewc_loss_diag": 2.8252601623535156e-05, "ewc_loss_parallel": 0.00018749090668279678, "grad_norm": 5.550572395324707, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.861887514591217, "num_tokens": 259701986.0, "step": 6807 }, { "epoch": 0.8660475766441929, "ewc_loss": 0.046946775168180466, "ewc_loss_diag": 2.8252601623535156e-05, "ewc_loss_parallel": 0.0001874853332992643, "grad_norm": 5.552381992340088, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8759797215461731, "num_tokens": 259740684.0, "step": 6808 }, { "epoch": 0.8661747869227834, "ewc_loss": 0.04708768427371979, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018767373694572598, "grad_norm": 5.5622758865356445, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8437739610671997, "num_tokens": 259781659.0, "step": 6809 }, { "epoch": 0.8663019972013739, "ewc_loss": 0.047052912414073944, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018732600437942892, "grad_norm": 5.552003383636475, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8556807637214661, "num_tokens": 259818554.0, "step": 6810 }, { "epoch": 0.8664292074799644, "ewc_loss": 0.046963535249233246, "ewc_loss_diag": 2.8252601623535156e-05, "ewc_loss_parallel": 0.00018765291315503418, "grad_norm": 5.535362720489502, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8632140159606934, "num_tokens": 259856652.0, "step": 6811 }, { "epoch": 0.8665564177585549, "ewc_loss": 0.04710092023015022, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018780608661472797, "grad_norm": 5.559939384460449, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8792102336883545, "num_tokens": 259898255.0, "step": 6812 }, { "epoch": 0.8666836280371454, "ewc_loss": 0.04710979759693146, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018789483874570578, "grad_norm": 5.5637030601501465, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.86626136302948, "num_tokens": 259935305.0, "step": 6813 }, { "epoch": 0.8668108383157359, "ewc_loss": 0.04707154631614685, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018751234165392816, "grad_norm": 5.5238471031188965, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8650193214416504, "num_tokens": 259975245.0, "step": 6814 }, { "epoch": 0.8669380485943264, "ewc_loss": 0.047085393220186234, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018765080312732607, "grad_norm": 5.570289134979248, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8665177822113037, "num_tokens": 260014150.0, "step": 6815 }, { "epoch": 0.867065258872917, "ewc_loss": 0.04713529720902443, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018814984650816768, "grad_norm": 5.613212585449219, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8522781133651733, "num_tokens": 260054655.0, "step": 6816 }, { "epoch": 0.8671924691515075, "ewc_loss": 0.04683227464556694, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018756103236228228, "grad_norm": 5.592413902282715, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8609651327133179, "num_tokens": 260091528.0, "step": 6817 }, { "epoch": 0.8673196794300979, "ewc_loss": 0.0468316487967968, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018755477503873408, "grad_norm": 5.576451301574707, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8757875561714172, "num_tokens": 260123069.0, "step": 6818 }, { "epoch": 0.8674468897086884, "ewc_loss": 0.046862222254276276, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018786049622576684, "grad_norm": 5.685393810272217, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8456550240516663, "num_tokens": 260154482.0, "step": 6819 }, { "epoch": 0.867574099987279, "ewc_loss": 0.04680871218442917, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018732539319898933, "grad_norm": 5.581585884094238, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8629039525985718, "num_tokens": 260193983.0, "step": 6820 }, { "epoch": 0.8677013102658695, "ewc_loss": 0.046809956431388855, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018733783508650959, "grad_norm": 5.6097731590271, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8635367751121521, "num_tokens": 260228705.0, "step": 6821 }, { "epoch": 0.86782852054446, "ewc_loss": 0.046839311718940735, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018763140542432666, "grad_norm": 5.638327598571777, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8598319888114929, "num_tokens": 260262821.0, "step": 6822 }, { "epoch": 0.8679557308230506, "ewc_loss": 0.04678497463464737, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.0001870880223577842, "grad_norm": 5.576724052429199, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8588376045227051, "num_tokens": 260298949.0, "step": 6823 }, { "epoch": 0.868082941101641, "ewc_loss": 0.04680226743221283, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018726095731835812, "grad_norm": 5.580456256866455, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8685333728790283, "num_tokens": 260340028.0, "step": 6824 }, { "epoch": 0.8682101513802315, "ewc_loss": 0.046812690794467926, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018736516358330846, "grad_norm": 5.568456172943115, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8603716492652893, "num_tokens": 260381328.0, "step": 6825 }, { "epoch": 0.868337361658822, "ewc_loss": 0.046785734593868256, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.0001870956039056182, "grad_norm": 5.541943550109863, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8613237142562866, "num_tokens": 260423421.0, "step": 6826 }, { "epoch": 0.8684645719374126, "ewc_loss": 0.04683094099164009, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018754768825601786, "grad_norm": 5.566220760345459, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8603546619415283, "num_tokens": 260461985.0, "step": 6827 }, { "epoch": 0.8685917822160031, "ewc_loss": 0.046821724623441696, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018745553097687662, "grad_norm": 5.578758716583252, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8465389609336853, "num_tokens": 260503041.0, "step": 6828 }, { "epoch": 0.8687189924945936, "ewc_loss": 0.04687485843896866, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018798686505760998, "grad_norm": 5.585268020629883, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.868781566619873, "num_tokens": 260545553.0, "step": 6829 }, { "epoch": 0.868846202773184, "ewc_loss": 0.04690033942461014, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018824165454134345, "grad_norm": 5.772953510284424, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8685899376869202, "num_tokens": 260577160.0, "step": 6830 }, { "epoch": 0.8689734130517746, "ewc_loss": 0.046859029680490494, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.0001878285693237558, "grad_norm": 5.48986291885376, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8737881779670715, "num_tokens": 260614748.0, "step": 6831 }, { "epoch": 0.8691006233303651, "ewc_loss": 0.04677712917327881, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018700957298278809, "grad_norm": 5.558419704437256, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8623468279838562, "num_tokens": 260651762.0, "step": 6832 }, { "epoch": 0.8692278336089556, "ewc_loss": 0.04708102345466614, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018760710372589529, "grad_norm": 5.5271148681640625, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8495991230010986, "num_tokens": 260694438.0, "step": 6833 }, { "epoch": 0.8693550438875461, "ewc_loss": 0.047126539051532745, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018806227308232337, "grad_norm": 5.649563789367676, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.869271993637085, "num_tokens": 260730256.0, "step": 6834 }, { "epoch": 0.8694822541661367, "ewc_loss": 0.04714008420705795, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.000188197722309269, "grad_norm": 5.5881452560424805, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.841030478477478, "num_tokens": 260766190.0, "step": 6835 }, { "epoch": 0.8696094644447271, "ewc_loss": 0.047070518136024475, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018750208255369216, "grad_norm": 5.533646583557129, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8752641677856445, "num_tokens": 260801733.0, "step": 6836 }, { "epoch": 0.8697366747233176, "ewc_loss": 0.047104738652706146, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018784428539220244, "grad_norm": 5.503055095672607, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8732782006263733, "num_tokens": 260843461.0, "step": 6837 }, { "epoch": 0.8698638850019081, "ewc_loss": 0.04715847969055176, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018838165851775557, "grad_norm": 5.623195648193359, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8591444492340088, "num_tokens": 260875006.0, "step": 6838 }, { "epoch": 0.8699910952804987, "ewc_loss": 0.04713404178619385, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018813731730915606, "grad_norm": 5.55278205871582, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8548674583435059, "num_tokens": 260917553.0, "step": 6839 }, { "epoch": 0.8701183055590892, "ewc_loss": 0.04716120660305023, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018840895791072398, "grad_norm": 5.625015735626221, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8367376923561096, "num_tokens": 260955293.0, "step": 6840 }, { "epoch": 0.8702455158376797, "ewc_loss": 0.046895720064640045, "ewc_loss_diag": 2.8133392333984375e-05, "ewc_loss_parallel": 0.00018819549586623907, "grad_norm": 5.654063701629639, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8620362281799316, "num_tokens": 260983966.0, "step": 6841 }, { "epoch": 0.8703727261162701, "ewc_loss": 0.04712137579917908, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018801064288709313, "grad_norm": 5.535269260406494, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8746293783187866, "num_tokens": 261019434.0, "step": 6842 }, { "epoch": 0.8704999363948607, "ewc_loss": 0.047144584357738495, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018824271683115512, "grad_norm": 5.63254976272583, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8494305610656738, "num_tokens": 261053761.0, "step": 6843 }, { "epoch": 0.8706271466734512, "ewc_loss": 0.04717511683702469, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018854805966839194, "grad_norm": 5.537774085998535, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8517195582389832, "num_tokens": 261093490.0, "step": 6844 }, { "epoch": 0.8707543569520417, "ewc_loss": 0.04717220738530159, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018851894128601998, "grad_norm": 5.601525783538818, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8615509867668152, "num_tokens": 261129518.0, "step": 6845 }, { "epoch": 0.8708815672306323, "ewc_loss": 0.047191351652145386, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018871037173084915, "grad_norm": 5.620117664337158, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8446149230003357, "num_tokens": 261164282.0, "step": 6846 }, { "epoch": 0.8710087775092228, "ewc_loss": 0.04718003422021866, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018859721603803337, "grad_norm": 5.5417680740356445, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8408787250518799, "num_tokens": 261205055.0, "step": 6847 }, { "epoch": 0.8711359877878132, "ewc_loss": 0.047157034277915955, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018836722301784903, "grad_norm": 5.562968730926514, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8546239137649536, "num_tokens": 261242329.0, "step": 6848 }, { "epoch": 0.8712631980664037, "ewc_loss": 0.047197774052619934, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001887746184365824, "grad_norm": 5.57982063293457, "learning_rate": 1e-06, "loss": 0.5488, "mean_token_accuracy": 0.834000825881958, "num_tokens": 261280330.0, "step": 6849 }, { "epoch": 0.8713904083449943, "ewc_loss": 0.04720298945903778, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018882675794884562, "grad_norm": 5.508054733276367, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8714420199394226, "num_tokens": 261318232.0, "step": 6850 }, { "epoch": 0.8715176186235848, "ewc_loss": 0.047256771475076675, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018936459673568606, "grad_norm": 5.660048961639404, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.877112865447998, "num_tokens": 261351483.0, "step": 6851 }, { "epoch": 0.8716448289021753, "ewc_loss": 0.047258175909519196, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018937863933388144, "grad_norm": 5.555885314941406, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8633015751838684, "num_tokens": 261387058.0, "step": 6852 }, { "epoch": 0.8717720391807658, "ewc_loss": 0.04724964499473572, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018929333600681275, "grad_norm": 5.538310527801514, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8770637512207031, "num_tokens": 261430110.0, "step": 6853 }, { "epoch": 0.8718992494593563, "ewc_loss": 0.047194138169288635, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018873828230425715, "grad_norm": 5.53623104095459, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8684929609298706, "num_tokens": 261470707.0, "step": 6854 }, { "epoch": 0.8720264597379468, "ewc_loss": 0.04717797785997391, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018857663962990046, "grad_norm": 5.5550737380981445, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8689287900924683, "num_tokens": 261507146.0, "step": 6855 }, { "epoch": 0.8721536700165373, "ewc_loss": 0.04720207303762436, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001888176193460822, "grad_norm": 5.6176838874816895, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8458198308944702, "num_tokens": 261549723.0, "step": 6856 }, { "epoch": 0.8722808802951278, "ewc_loss": 0.047169506549835205, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018849193293135613, "grad_norm": 5.57806396484375, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8662432432174683, "num_tokens": 261591030.0, "step": 6857 }, { "epoch": 0.8724080905737184, "ewc_loss": 0.04719763621687889, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001887732360046357, "grad_norm": 5.63516092300415, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8697226047515869, "num_tokens": 261623919.0, "step": 6858 }, { "epoch": 0.8725353008523089, "ewc_loss": 0.047143854200839996, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018823544087354094, "grad_norm": 5.56430196762085, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8779210448265076, "num_tokens": 261655277.0, "step": 6859 }, { "epoch": 0.8726625111308994, "ewc_loss": 0.04715705290436745, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.000188367412192747, "grad_norm": 5.6025261878967285, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8488588333129883, "num_tokens": 261697462.0, "step": 6860 }, { "epoch": 0.8727897214094898, "ewc_loss": 0.047139715403318405, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.000188194026122801, "grad_norm": 5.539359092712402, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8583873510360718, "num_tokens": 261739134.0, "step": 6861 }, { "epoch": 0.8729169316880804, "ewc_loss": 0.04714134335517883, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018821030971594155, "grad_norm": 5.6563334465026855, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8646454811096191, "num_tokens": 261776295.0, "step": 6862 }, { "epoch": 0.8730441419666709, "ewc_loss": 0.04717206209897995, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001885175151983276, "grad_norm": 5.614134788513184, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8641974925994873, "num_tokens": 261812388.0, "step": 6863 }, { "epoch": 0.8731713522452614, "ewc_loss": 0.04710864648222923, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018788334273267537, "grad_norm": 5.545942783355713, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8626466989517212, "num_tokens": 261855537.0, "step": 6864 }, { "epoch": 0.873298562523852, "ewc_loss": 0.0471644401550293, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018844129226636142, "grad_norm": 5.562358856201172, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8703265190124512, "num_tokens": 261899922.0, "step": 6865 }, { "epoch": 0.8734257728024425, "ewc_loss": 0.04713018238544464, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018809869652613997, "grad_norm": 5.601896286010742, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8732849955558777, "num_tokens": 261934475.0, "step": 6866 }, { "epoch": 0.8735529830810329, "ewc_loss": 0.047149814665317535, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018829500186257064, "grad_norm": 5.575679779052734, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8620573282241821, "num_tokens": 261973395.0, "step": 6867 }, { "epoch": 0.8736801933596234, "ewc_loss": 0.047156333923339844, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001883602380985394, "grad_norm": 5.58927059173584, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8456680774688721, "num_tokens": 262011644.0, "step": 6868 }, { "epoch": 0.873807403638214, "ewc_loss": 0.047163985669612885, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018843670841306448, "grad_norm": 5.655860900878906, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8568384647369385, "num_tokens": 262044038.0, "step": 6869 }, { "epoch": 0.8739346139168045, "ewc_loss": 0.04715350270271301, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018833190551958978, "grad_norm": 5.5671305656433105, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8603790402412415, "num_tokens": 262081023.0, "step": 6870 }, { "epoch": 0.874061824195395, "ewc_loss": 0.04714695364236832, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001882664073491469, "grad_norm": 5.586078643798828, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8541342616081238, "num_tokens": 262123981.0, "step": 6871 }, { "epoch": 0.8741890344739855, "ewc_loss": 0.04715204983949661, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001883173972601071, "grad_norm": 5.621092796325684, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8512325286865234, "num_tokens": 262161542.0, "step": 6872 }, { "epoch": 0.874316244752576, "ewc_loss": 0.0471494123339653, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018829101463779807, "grad_norm": 5.511969089508057, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8509007096290588, "num_tokens": 262202482.0, "step": 6873 }, { "epoch": 0.8744434550311665, "ewc_loss": 0.04720419645309448, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018883883603848517, "grad_norm": 5.6768598556518555, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8635395765304565, "num_tokens": 262239499.0, "step": 6874 }, { "epoch": 0.874570665309757, "ewc_loss": 0.04716571420431137, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018845402519218624, "grad_norm": 5.563246726989746, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8632190823554993, "num_tokens": 262281324.0, "step": 6875 }, { "epoch": 0.8746978755883476, "ewc_loss": 0.047178443521261215, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018858131079468876, "grad_norm": 5.648179054260254, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8526383638381958, "num_tokens": 262318772.0, "step": 6876 }, { "epoch": 0.8748250858669381, "ewc_loss": 0.04717789962887764, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018857586837839335, "grad_norm": 5.574828624725342, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8741776943206787, "num_tokens": 262353264.0, "step": 6877 }, { "epoch": 0.8749522961455286, "ewc_loss": 0.04718305915594101, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018862748402170837, "grad_norm": 5.580226421356201, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8590806722640991, "num_tokens": 262393360.0, "step": 6878 }, { "epoch": 0.875079506424119, "ewc_loss": 0.04716097563505173, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001884066587081179, "grad_norm": 5.57606840133667, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8706780672073364, "num_tokens": 262432010.0, "step": 6879 }, { "epoch": 0.8752067167027096, "ewc_loss": 0.04719163104891777, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018871319480240345, "grad_norm": 5.586408615112305, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.861971378326416, "num_tokens": 262468828.0, "step": 6880 }, { "epoch": 0.8753339269813001, "ewc_loss": 0.04719766601920128, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001887735415948555, "grad_norm": 5.590115070343018, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8683067560195923, "num_tokens": 262515141.0, "step": 6881 }, { "epoch": 0.8754611372598906, "ewc_loss": 0.04714478552341461, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018824473954737186, "grad_norm": 5.569228172302246, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8589284420013428, "num_tokens": 262553137.0, "step": 6882 }, { "epoch": 0.8755883475384811, "ewc_loss": 0.04712093248963356, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018800620455294847, "grad_norm": 5.629706382751465, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8837340474128723, "num_tokens": 262593847.0, "step": 6883 }, { "epoch": 0.8757155578170717, "ewc_loss": 0.04712013155221939, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018799818644765764, "grad_norm": 5.563845634460449, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.85032057762146, "num_tokens": 262635048.0, "step": 6884 }, { "epoch": 0.8758427680956621, "ewc_loss": 0.0470459870994091, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018725675181485713, "grad_norm": 5.5883564949035645, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8531823754310608, "num_tokens": 262671200.0, "step": 6885 }, { "epoch": 0.8759699783742526, "ewc_loss": 0.0471479706466198, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018827657913789153, "grad_norm": 5.5959391593933105, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8636424541473389, "num_tokens": 262708304.0, "step": 6886 }, { "epoch": 0.8760971886528431, "ewc_loss": 0.047121960669755936, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001880164782050997, "grad_norm": 5.562171459197998, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.859130859375, "num_tokens": 262750680.0, "step": 6887 }, { "epoch": 0.8762243989314337, "ewc_loss": 0.047142598778009415, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018822286801878363, "grad_norm": 5.619492530822754, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.856598973274231, "num_tokens": 262784845.0, "step": 6888 }, { "epoch": 0.8763516092100242, "ewc_loss": 0.04714720696210861, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018826895393431187, "grad_norm": 5.532443523406982, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8618829250335693, "num_tokens": 262822646.0, "step": 6889 }, { "epoch": 0.8764788194886147, "ewc_loss": 0.047179605811834335, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018859293777495623, "grad_norm": 5.5913872718811035, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8570874929428101, "num_tokens": 262863663.0, "step": 6890 }, { "epoch": 0.8766060297672051, "ewc_loss": 0.04722989350557327, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018909579375758767, "grad_norm": 5.586053848266602, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8475515246391296, "num_tokens": 262903726.0, "step": 6891 }, { "epoch": 0.8767332400457957, "ewc_loss": 0.047212470322847366, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018892157822847366, "grad_norm": 5.602907657623291, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8514354228973389, "num_tokens": 262942748.0, "step": 6892 }, { "epoch": 0.8768604503243862, "ewc_loss": 0.047217123210430145, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018896811525337398, "grad_norm": 5.602928638458252, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8416365385055542, "num_tokens": 262977639.0, "step": 6893 }, { "epoch": 0.8769876606029767, "ewc_loss": 0.04719243198633194, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018872119835577905, "grad_norm": 5.566751003265381, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.864665150642395, "num_tokens": 263015970.0, "step": 6894 }, { "epoch": 0.8771148708815673, "ewc_loss": 0.04722197353839874, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018901658768299967, "grad_norm": 5.588098049163818, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8708381652832031, "num_tokens": 263058618.0, "step": 6895 }, { "epoch": 0.8772420811601578, "ewc_loss": 0.04722573608160019, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018905423348769546, "grad_norm": 5.619500637054443, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8446763753890991, "num_tokens": 263098628.0, "step": 6896 }, { "epoch": 0.8773692914387482, "ewc_loss": 0.047208696603775024, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018888383056037128, "grad_norm": 5.549927234649658, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8652929067611694, "num_tokens": 263136793.0, "step": 6897 }, { "epoch": 0.8774965017173387, "ewc_loss": 0.04725886136293411, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.000189385493285954, "grad_norm": 5.628873348236084, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8637944459915161, "num_tokens": 263170996.0, "step": 6898 }, { "epoch": 0.8776237119959293, "ewc_loss": 0.0472215861082077, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001890127605292946, "grad_norm": 5.621584415435791, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.853024423122406, "num_tokens": 263199961.0, "step": 6899 }, { "epoch": 0.8777509222745198, "ewc_loss": 0.04728763550519943, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018967325740959495, "grad_norm": 5.5928425788879395, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8606829047203064, "num_tokens": 263241541.0, "step": 6900 }, { "epoch": 0.8778781325531103, "ewc_loss": 0.04723434895277023, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001891403808258474, "grad_norm": 5.567740440368652, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8738787770271301, "num_tokens": 263278778.0, "step": 6901 }, { "epoch": 0.8780053428317008, "ewc_loss": 0.04730302840471268, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001898271730169654, "grad_norm": 5.6762824058532715, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.864750862121582, "num_tokens": 263310155.0, "step": 6902 }, { "epoch": 0.8781325531102913, "ewc_loss": 0.04727078229188919, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018950468802358955, "grad_norm": 5.546509742736816, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.857283353805542, "num_tokens": 263355793.0, "step": 6903 }, { "epoch": 0.8782597633888818, "ewc_loss": 0.047258585691452026, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001893827284220606, "grad_norm": 5.593744277954102, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8713751435279846, "num_tokens": 263394446.0, "step": 6904 }, { "epoch": 0.8783869736674723, "ewc_loss": 0.047256313264369965, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018936002743430436, "grad_norm": 5.580819129943848, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8639293909072876, "num_tokens": 263438479.0, "step": 6905 }, { "epoch": 0.8785141839460628, "ewc_loss": 0.04725620895624161, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018935895059257746, "grad_norm": 5.613569736480713, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8474677205085754, "num_tokens": 263478514.0, "step": 6906 }, { "epoch": 0.8786413942246534, "ewc_loss": 0.047212112694978714, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018891799845732749, "grad_norm": 5.626121520996094, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8430320024490356, "num_tokens": 263516734.0, "step": 6907 }, { "epoch": 0.8787686045032439, "ewc_loss": 0.04721078276634216, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018890471255872399, "grad_norm": 5.543867588043213, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8508366942405701, "num_tokens": 263560105.0, "step": 6908 }, { "epoch": 0.8788958147818343, "ewc_loss": 0.04724935442209244, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018929039651993662, "grad_norm": 5.658286094665527, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8635722994804382, "num_tokens": 263597033.0, "step": 6909 }, { "epoch": 0.8790230250604248, "ewc_loss": 0.04723350703716278, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018913194071501493, "grad_norm": 5.577982425689697, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8683444857597351, "num_tokens": 263631106.0, "step": 6910 }, { "epoch": 0.8791502353390154, "ewc_loss": 0.04718979448080063, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018869481573347002, "grad_norm": 5.5806989669799805, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.854785680770874, "num_tokens": 263668227.0, "step": 6911 }, { "epoch": 0.8792774456176059, "ewc_loss": 0.04725931957364082, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001893900625873357, "grad_norm": 5.63733434677124, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8635609149932861, "num_tokens": 263704335.0, "step": 6912 }, { "epoch": 0.8794046558961964, "ewc_loss": 0.04724489152431488, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018924578034784645, "grad_norm": 5.551960468292236, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.867082953453064, "num_tokens": 263742532.0, "step": 6913 }, { "epoch": 0.879531866174787, "ewc_loss": 0.04726756364107132, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018947251373901963, "grad_norm": 5.587265491485596, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8624731302261353, "num_tokens": 263782321.0, "step": 6914 }, { "epoch": 0.8796590764533775, "ewc_loss": 0.047280825674533844, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001896051107905805, "grad_norm": 5.577527046203613, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8692181706428528, "num_tokens": 263818037.0, "step": 6915 }, { "epoch": 0.8797862867319679, "ewc_loss": 0.04729107394814491, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018970761448144913, "grad_norm": 5.631947994232178, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8470708131790161, "num_tokens": 263854901.0, "step": 6916 }, { "epoch": 0.8799134970105584, "ewc_loss": 0.04728931933641434, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018969009397551417, "grad_norm": 5.568290710449219, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8657199144363403, "num_tokens": 263896241.0, "step": 6917 }, { "epoch": 0.880040707289149, "ewc_loss": 0.047291092574596405, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018970777455251664, "grad_norm": 5.567020893096924, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8624271154403687, "num_tokens": 263934673.0, "step": 6918 }, { "epoch": 0.8801679175677395, "ewc_loss": 0.04739130288362503, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001907098776428029, "grad_norm": 5.560428142547607, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8529383540153503, "num_tokens": 263973456.0, "step": 6919 }, { "epoch": 0.88029512784633, "ewc_loss": 0.04732035845518112, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019000044267158955, "grad_norm": 5.619846820831299, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.8443009257316589, "num_tokens": 264006050.0, "step": 6920 }, { "epoch": 0.8804223381249205, "ewc_loss": 0.04733341932296753, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001901310752145946, "grad_norm": 5.598643779754639, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8717651963233948, "num_tokens": 264042736.0, "step": 6921 }, { "epoch": 0.880549548403511, "ewc_loss": 0.04735489934682846, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001903458614833653, "grad_norm": 5.567596912384033, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8527829051017761, "num_tokens": 264084433.0, "step": 6922 }, { "epoch": 0.8806767586821015, "ewc_loss": 0.04736422747373581, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019043915381189436, "grad_norm": 5.646764755249023, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8692858219146729, "num_tokens": 264118331.0, "step": 6923 }, { "epoch": 0.880803968960692, "ewc_loss": 0.047328218817710876, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001900790521176532, "grad_norm": 5.596077919006348, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8634045720100403, "num_tokens": 264152015.0, "step": 6924 }, { "epoch": 0.8809311792392825, "ewc_loss": 0.04730619490146637, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018985880888067186, "grad_norm": 5.56549596786499, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8677579760551453, "num_tokens": 264187085.0, "step": 6925 }, { "epoch": 0.8810583895178731, "ewc_loss": 0.04731801152229309, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018997699953615665, "grad_norm": 5.587080955505371, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8581119179725647, "num_tokens": 264225558.0, "step": 6926 }, { "epoch": 0.8811855997964636, "ewc_loss": 0.04735283553600311, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019032521231565624, "grad_norm": 5.619113445281982, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8519609570503235, "num_tokens": 264264220.0, "step": 6927 }, { "epoch": 0.881312810075054, "ewc_loss": 0.04730330780148506, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.000189829952432774, "grad_norm": 5.616931915283203, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8649144768714905, "num_tokens": 264302097.0, "step": 6928 }, { "epoch": 0.8814400203536445, "ewc_loss": 0.04735231399536133, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019032001728191972, "grad_norm": 5.582110404968262, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8636525273323059, "num_tokens": 264341966.0, "step": 6929 }, { "epoch": 0.8815672306322351, "ewc_loss": 0.047332897782325745, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001901258365251124, "grad_norm": 5.574095726013184, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.837643027305603, "num_tokens": 264384298.0, "step": 6930 }, { "epoch": 0.8816944409108256, "ewc_loss": 0.04733695834875107, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019016643636859953, "grad_norm": 5.598412990570068, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8620256185531616, "num_tokens": 264422513.0, "step": 6931 }, { "epoch": 0.8818216511894161, "ewc_loss": 0.04733666032552719, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019016348232980818, "grad_norm": 5.565760612487793, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8737826347351074, "num_tokens": 264461819.0, "step": 6932 }, { "epoch": 0.8819488614680067, "ewc_loss": 0.047335200011730194, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019014890131074935, "grad_norm": 5.586063385009766, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8510552048683167, "num_tokens": 264502618.0, "step": 6933 }, { "epoch": 0.8820760717465971, "ewc_loss": 0.04734625667333603, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001902594231069088, "grad_norm": 5.629091262817383, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8549657464027405, "num_tokens": 264534382.0, "step": 6934 }, { "epoch": 0.8822032820251876, "ewc_loss": 0.04731486737728119, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018994553829543293, "grad_norm": 5.56352424621582, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.850829005241394, "num_tokens": 264575297.0, "step": 6935 }, { "epoch": 0.8823304923037781, "ewc_loss": 0.04732279106974602, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001900247880257666, "grad_norm": 5.606588363647461, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8677648901939392, "num_tokens": 264611822.0, "step": 6936 }, { "epoch": 0.8824577025823687, "ewc_loss": 0.04742119461297989, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00018978810112457722, "grad_norm": 5.548922061920166, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8567935228347778, "num_tokens": 264657483.0, "step": 6937 }, { "epoch": 0.8825849128609592, "ewc_loss": 0.04749350994825363, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019051125855185091, "grad_norm": 5.645294666290283, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8670961260795593, "num_tokens": 264690633.0, "step": 6938 }, { "epoch": 0.8827121231395497, "ewc_loss": 0.0474550798535347, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019012695702258497, "grad_norm": 5.519533634185791, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8760442733764648, "num_tokens": 264733268.0, "step": 6939 }, { "epoch": 0.8828393334181401, "ewc_loss": 0.04734544828534126, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019025136134587228, "grad_norm": 5.615991592407227, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8542289733886719, "num_tokens": 264774769.0, "step": 6940 }, { "epoch": 0.8829665436967307, "ewc_loss": 0.04733659327030182, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001901628274936229, "grad_norm": 5.637638092041016, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8563165068626404, "num_tokens": 264806658.0, "step": 6941 }, { "epoch": 0.8830937539753212, "ewc_loss": 0.0473611056804657, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019040792540181428, "grad_norm": 5.622082233428955, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8488466143608093, "num_tokens": 264841054.0, "step": 6942 }, { "epoch": 0.8832209642539117, "ewc_loss": 0.047340475022792816, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019020163745153695, "grad_norm": 5.6148576736450195, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.855840802192688, "num_tokens": 264880241.0, "step": 6943 }, { "epoch": 0.8833481745325023, "ewc_loss": 0.04730674624443054, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001898643240565434, "grad_norm": 5.580672740936279, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8629500269889832, "num_tokens": 264922449.0, "step": 6944 }, { "epoch": 0.8834753848110928, "ewc_loss": 0.047322504222393036, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019002192129846662, "grad_norm": 5.600935935974121, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8594487905502319, "num_tokens": 264961605.0, "step": 6945 }, { "epoch": 0.8836025950896832, "ewc_loss": 0.04732195660471916, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019001643522642553, "grad_norm": 5.625718116760254, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8690888285636902, "num_tokens": 264992924.0, "step": 6946 }, { "epoch": 0.8837298053682737, "ewc_loss": 0.047333166003227234, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001901285577332601, "grad_norm": 5.625975131988525, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8610707521438599, "num_tokens": 265031849.0, "step": 6947 }, { "epoch": 0.8838570156468643, "ewc_loss": 0.047349218279123306, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019028906535822898, "grad_norm": 5.609614849090576, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8724067807197571, "num_tokens": 265069714.0, "step": 6948 }, { "epoch": 0.8839842259254548, "ewc_loss": 0.04735887795686722, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019038564641959965, "grad_norm": 5.634927749633789, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8568886518478394, "num_tokens": 265103325.0, "step": 6949 }, { "epoch": 0.8841114362040453, "ewc_loss": 0.047317177057266235, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018996863218490034, "grad_norm": 5.614374160766602, "learning_rate": 1e-06, "loss": 0.5185, "mean_token_accuracy": 0.8397340774536133, "num_tokens": 265139133.0, "step": 6950 }, { "epoch": 0.8842386464826358, "ewc_loss": 0.047313764691352844, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018993450794368982, "grad_norm": 5.611119747161865, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8367745876312256, "num_tokens": 265183536.0, "step": 6951 }, { "epoch": 0.8843658567612263, "ewc_loss": 0.04734165966510773, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019021348271053284, "grad_norm": 5.604375839233398, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8523377180099487, "num_tokens": 265226288.0, "step": 6952 }, { "epoch": 0.8844930670398168, "ewc_loss": 0.04727453738451004, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018954223196487874, "grad_norm": 5.6234259605407715, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8605982065200806, "num_tokens": 265265846.0, "step": 6953 }, { "epoch": 0.8846202773184073, "ewc_loss": 0.04730834811925888, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001898803748190403, "grad_norm": 5.624931335449219, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8751100301742554, "num_tokens": 265300612.0, "step": 6954 }, { "epoch": 0.8847474875969978, "ewc_loss": 0.047256529331207275, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001893621520139277, "grad_norm": 5.57689905166626, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8698686957359314, "num_tokens": 265341440.0, "step": 6955 }, { "epoch": 0.8848746978755884, "ewc_loss": 0.047269608825445175, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001894929591799155, "grad_norm": 5.602891445159912, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8514688611030579, "num_tokens": 265384854.0, "step": 6956 }, { "epoch": 0.8850019081541789, "ewc_loss": 0.04723179340362549, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001891147840069607, "grad_norm": 5.646337509155273, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8540323972702026, "num_tokens": 265420611.0, "step": 6957 }, { "epoch": 0.8851291184327693, "ewc_loss": 0.04728389158844948, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001896357862278819, "grad_norm": 5.65916109085083, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8589901328086853, "num_tokens": 265457720.0, "step": 6958 }, { "epoch": 0.8852563287113598, "ewc_loss": 0.047265924513339996, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001894561282824725, "grad_norm": 5.613868713378906, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8691291213035583, "num_tokens": 265493434.0, "step": 6959 }, { "epoch": 0.8853835389899504, "ewc_loss": 0.04736289754509926, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00018920515140052885, "grad_norm": 5.6317644119262695, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8634015917778015, "num_tokens": 265529015.0, "step": 6960 }, { "epoch": 0.8855107492685409, "ewc_loss": 0.047246746718883514, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018926434859167784, "grad_norm": 5.620326519012451, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8458458781242371, "num_tokens": 265568698.0, "step": 6961 }, { "epoch": 0.8856379595471314, "ewc_loss": 0.04727725684642792, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018956942949444056, "grad_norm": 5.669657230377197, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8617737293243408, "num_tokens": 265605631.0, "step": 6962 }, { "epoch": 0.885765169825722, "ewc_loss": 0.047236863523721695, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018916551198344678, "grad_norm": 5.57785177230835, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8614784479141235, "num_tokens": 265647071.0, "step": 6963 }, { "epoch": 0.8858923801043125, "ewc_loss": 0.04721550643444061, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018895193352364004, "grad_norm": 5.632237434387207, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8600031733512878, "num_tokens": 265688680.0, "step": 6964 }, { "epoch": 0.8860195903829029, "ewc_loss": 0.04725608229637146, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018935769912786782, "grad_norm": 5.622002601623535, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8666175603866577, "num_tokens": 265727707.0, "step": 6965 }, { "epoch": 0.8861468006614934, "ewc_loss": 0.04718237370252609, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018862060096580535, "grad_norm": 5.576277256011963, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8587311506271362, "num_tokens": 265765860.0, "step": 6966 }, { "epoch": 0.886274010940084, "ewc_loss": 0.04722340404987335, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018903093587141484, "grad_norm": 5.639540195465088, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8646355867385864, "num_tokens": 265802988.0, "step": 6967 }, { "epoch": 0.8864012212186745, "ewc_loss": 0.047234006226062775, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001891369465738535, "grad_norm": 5.648168563842773, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8551695942878723, "num_tokens": 265840614.0, "step": 6968 }, { "epoch": 0.886528431497265, "ewc_loss": 0.047209233045578, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018888918566517532, "grad_norm": 5.578300952911377, "learning_rate": 1e-06, "loss": 0.5376, "mean_token_accuracy": 0.8346477150917053, "num_tokens": 265883032.0, "step": 6969 }, { "epoch": 0.8866556417758555, "ewc_loss": 0.047273412346839905, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001895309833344072, "grad_norm": 5.647884845733643, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8724825382232666, "num_tokens": 265918309.0, "step": 6970 }, { "epoch": 0.886782852054446, "ewc_loss": 0.04725632816553116, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001893601584015414, "grad_norm": 5.596620082855225, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8629971146583557, "num_tokens": 265956406.0, "step": 6971 }, { "epoch": 0.8869100623330365, "ewc_loss": 0.047256480902433395, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001893616863526404, "grad_norm": 5.702645301818848, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8691730499267578, "num_tokens": 265988413.0, "step": 6972 }, { "epoch": 0.887037272611627, "ewc_loss": 0.047234732657670975, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018914419342763722, "grad_norm": 5.616809844970703, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8603506684303284, "num_tokens": 266022036.0, "step": 6973 }, { "epoch": 0.8871644828902175, "ewc_loss": 0.04723920673131943, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018918892601504922, "grad_norm": 5.731086730957031, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8585765957832336, "num_tokens": 266053735.0, "step": 6974 }, { "epoch": 0.8872916931688081, "ewc_loss": 0.047270767390728, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018950452795252204, "grad_norm": 5.559296131134033, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8747131824493408, "num_tokens": 266094198.0, "step": 6975 }, { "epoch": 0.8874189034473986, "ewc_loss": 0.047252945601940155, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018932635430246592, "grad_norm": 5.674167633056641, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8690865635871887, "num_tokens": 266132213.0, "step": 6976 }, { "epoch": 0.887546113725989, "ewc_loss": 0.0472814217209816, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001896111061796546, "grad_norm": 5.692071437835693, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8610349893569946, "num_tokens": 266168001.0, "step": 6977 }, { "epoch": 0.8876733240045795, "ewc_loss": 0.04722416400909424, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001890385028673336, "grad_norm": 5.61976432800293, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8321999311447144, "num_tokens": 266208771.0, "step": 6978 }, { "epoch": 0.8878005342831701, "ewc_loss": 0.04723621532320976, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018915903638117015, "grad_norm": 5.6676740646362305, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8668487071990967, "num_tokens": 266248090.0, "step": 6979 }, { "epoch": 0.8879277445617606, "ewc_loss": 0.04724032059311867, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018920008733402938, "grad_norm": 5.7062249183654785, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8507136702537537, "num_tokens": 266283748.0, "step": 6980 }, { "epoch": 0.8880549548403511, "ewc_loss": 0.047256723046302795, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018936408741865307, "grad_norm": 5.647672653198242, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8674971461296082, "num_tokens": 266316613.0, "step": 6981 }, { "epoch": 0.8881821651189417, "ewc_loss": 0.04718894511461258, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018868633196689188, "grad_norm": 5.6120147705078125, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8634873628616333, "num_tokens": 266357515.0, "step": 6982 }, { "epoch": 0.8883093753975321, "ewc_loss": 0.047226183116436005, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001890587154775858, "grad_norm": 5.645036697387695, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8726837038993835, "num_tokens": 266393593.0, "step": 6983 }, { "epoch": 0.8884365856761226, "ewc_loss": 0.04723183065652847, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018911516235675663, "grad_norm": 5.567022800445557, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8722065091133118, "num_tokens": 266436104.0, "step": 6984 }, { "epoch": 0.8885637959547131, "ewc_loss": 0.04722106456756592, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018900753639172763, "grad_norm": 5.657858848571777, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8739597201347351, "num_tokens": 266472809.0, "step": 6985 }, { "epoch": 0.8886910062333037, "ewc_loss": 0.04729347303509712, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001897316105896607, "grad_norm": 5.598662853240967, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8705079555511475, "num_tokens": 266512726.0, "step": 6986 }, { "epoch": 0.8888182165118942, "ewc_loss": 0.0472680889070034, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001882570650195703, "grad_norm": 5.642144203186035, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8507868647575378, "num_tokens": 266546098.0, "step": 6987 }, { "epoch": 0.8889454267904847, "ewc_loss": 0.047249436378479004, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018929122597910464, "grad_norm": 5.614857196807861, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8527969121932983, "num_tokens": 266588478.0, "step": 6988 }, { "epoch": 0.8890726370690751, "ewc_loss": 0.0472375825047493, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001891727006295696, "grad_norm": 5.614819526672363, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8498606085777283, "num_tokens": 266628975.0, "step": 6989 }, { "epoch": 0.8891998473476657, "ewc_loss": 0.047243572771549225, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018923259631264955, "grad_norm": 5.573739528656006, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8698604106903076, "num_tokens": 266667533.0, "step": 6990 }, { "epoch": 0.8893270576262562, "ewc_loss": 0.04723843187093735, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018918119894806296, "grad_norm": 5.621319770812988, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8424783945083618, "num_tokens": 266706990.0, "step": 6991 }, { "epoch": 0.8894542679048467, "ewc_loss": 0.047266408801078796, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018946098862215877, "grad_norm": 5.620249271392822, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8479983806610107, "num_tokens": 266747314.0, "step": 6992 }, { "epoch": 0.8895814781834372, "ewc_loss": 0.0472906269133091, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018970314704347402, "grad_norm": 5.5647687911987305, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8514858484268188, "num_tokens": 266791031.0, "step": 6993 }, { "epoch": 0.8897086884620278, "ewc_loss": 0.04730362445116043, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018983309564646333, "grad_norm": 5.657485008239746, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8612969517707825, "num_tokens": 266823236.0, "step": 6994 }, { "epoch": 0.8898358987406182, "ewc_loss": 0.04728004336357117, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00018959728186018765, "grad_norm": 5.564467430114746, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8595550060272217, "num_tokens": 266867993.0, "step": 6995 }, { "epoch": 0.8899631090192087, "ewc_loss": 0.04736834019422531, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019048027752432972, "grad_norm": 5.622654438018799, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8429617285728455, "num_tokens": 266908573.0, "step": 6996 }, { "epoch": 0.8900903192977992, "ewc_loss": 0.047298163175582886, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001897784968605265, "grad_norm": 5.569268226623535, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8593780994415283, "num_tokens": 266951167.0, "step": 6997 }, { "epoch": 0.8902175295763898, "ewc_loss": 0.047350335866212845, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019030022667720914, "grad_norm": 5.717131614685059, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.859367847442627, "num_tokens": 266988976.0, "step": 6998 }, { "epoch": 0.8903447398549803, "ewc_loss": 0.04733853042125702, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019018215243704617, "grad_norm": 5.603402614593506, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.859800398349762, "num_tokens": 267022450.0, "step": 6999 }, { "epoch": 0.8904719501335708, "ewc_loss": 0.04731341451406479, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001899310009321198, "grad_norm": 5.6461262702941895, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8541421890258789, "num_tokens": 267059933.0, "step": 7000 }, { "epoch": 0.8905991604121613, "ewc_loss": 0.04732344672083855, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001900313509395346, "grad_norm": 5.621439456939697, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8519295454025269, "num_tokens": 267092518.0, "step": 7001 }, { "epoch": 0.8907263706907518, "ewc_loss": 0.04732988029718399, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019009567040484399, "grad_norm": 5.639274597167969, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8553005456924438, "num_tokens": 267130008.0, "step": 7002 }, { "epoch": 0.8908535809693423, "ewc_loss": 0.047340307384729385, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019019994942937046, "grad_norm": 5.5775065422058105, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8635781407356262, "num_tokens": 267171039.0, "step": 7003 }, { "epoch": 0.8909807912479328, "ewc_loss": 0.047351688146591187, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001903137454064563, "grad_norm": 5.598994255065918, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8639967441558838, "num_tokens": 267211197.0, "step": 7004 }, { "epoch": 0.8911080015265234, "ewc_loss": 0.04740886762738228, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019088554836343974, "grad_norm": 5.602158546447754, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.866098165512085, "num_tokens": 267247471.0, "step": 7005 }, { "epoch": 0.8912352118051139, "ewc_loss": 0.04745778813958168, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019137475464958698, "grad_norm": 5.627692699432373, "learning_rate": 1e-06, "loss": 0.552, "mean_token_accuracy": 0.8328225612640381, "num_tokens": 267287276.0, "step": 7006 }, { "epoch": 0.8913624220837043, "ewc_loss": 0.047476962208747864, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019156650523655117, "grad_norm": 5.630350589752197, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8571289777755737, "num_tokens": 267323827.0, "step": 7007 }, { "epoch": 0.8914896323622948, "ewc_loss": 0.047482870519161224, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019162558601237833, "grad_norm": 5.625856876373291, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8689711093902588, "num_tokens": 267364053.0, "step": 7008 }, { "epoch": 0.8916168426408854, "ewc_loss": 0.047485679388046265, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019165364210493863, "grad_norm": 5.640830993652344, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8427646160125732, "num_tokens": 267404393.0, "step": 7009 }, { "epoch": 0.8917440529194759, "ewc_loss": 0.04748547077178955, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019165159028489143, "grad_norm": 5.6379594802856445, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8609715700149536, "num_tokens": 267444589.0, "step": 7010 }, { "epoch": 0.8918712631980664, "ewc_loss": 0.047467153519392014, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019146841077599674, "grad_norm": 5.5940680503845215, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8613526821136475, "num_tokens": 267484460.0, "step": 7011 }, { "epoch": 0.891998473476657, "ewc_loss": 0.0474507175385952, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019130404689349234, "grad_norm": 5.668710231781006, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8530819416046143, "num_tokens": 267517390.0, "step": 7012 }, { "epoch": 0.8921256837552475, "ewc_loss": 0.047454990446567535, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019134680042043328, "grad_norm": 5.579112529754639, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8607261776924133, "num_tokens": 267555813.0, "step": 7013 }, { "epoch": 0.8922528940338379, "ewc_loss": 0.04746776074171066, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001914744934765622, "grad_norm": 5.592096328735352, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8581587076187134, "num_tokens": 267595818.0, "step": 7014 }, { "epoch": 0.8923801043124284, "ewc_loss": 0.04745080694556236, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001913049491122365, "grad_norm": 5.6832661628723145, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8349142074584961, "num_tokens": 267633106.0, "step": 7015 }, { "epoch": 0.892507314591019, "ewc_loss": 0.04746651649475098, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019146203703712672, "grad_norm": 5.608243942260742, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8624681234359741, "num_tokens": 267670166.0, "step": 7016 }, { "epoch": 0.8926345248696095, "ewc_loss": 0.04749037325382233, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001917006156872958, "grad_norm": 5.646117210388184, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8565244078636169, "num_tokens": 267707284.0, "step": 7017 }, { "epoch": 0.8927617351482, "ewc_loss": 0.04747094586491585, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019150633306708187, "grad_norm": 5.6075944900512695, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8601794242858887, "num_tokens": 267742349.0, "step": 7018 }, { "epoch": 0.8928889454267905, "ewc_loss": 0.047625161707401276, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019182779942639172, "grad_norm": 5.646673202514648, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8650025725364685, "num_tokens": 267776003.0, "step": 7019 }, { "epoch": 0.893016155705381, "ewc_loss": 0.04747338593006134, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019153070752508938, "grad_norm": 5.581108570098877, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.856044352054596, "num_tokens": 267815887.0, "step": 7020 }, { "epoch": 0.8931433659839715, "ewc_loss": 0.047497376799583435, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019177065405528992, "grad_norm": 5.619385719299316, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8702013492584229, "num_tokens": 267855874.0, "step": 7021 }, { "epoch": 0.893270576262562, "ewc_loss": 0.0475098192691803, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019189505837857723, "grad_norm": 5.57239294052124, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.869469165802002, "num_tokens": 267897901.0, "step": 7022 }, { "epoch": 0.8933977865411525, "ewc_loss": 0.04748201370239258, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019161698583047837, "grad_norm": 5.6120285987854, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8708727955818176, "num_tokens": 267938967.0, "step": 7023 }, { "epoch": 0.8935249968197431, "ewc_loss": 0.047517985105514526, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001919767091749236, "grad_norm": 5.665510177612305, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8565340042114258, "num_tokens": 267975196.0, "step": 7024 }, { "epoch": 0.8936522070983336, "ewc_loss": 0.047511566430330276, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001919125352287665, "grad_norm": 5.594295978546143, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8746371865272522, "num_tokens": 268014407.0, "step": 7025 }, { "epoch": 0.893779417376924, "ewc_loss": 0.04746398329734802, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001914367312565446, "grad_norm": 5.673252105712891, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.875852644443512, "num_tokens": 268043892.0, "step": 7026 }, { "epoch": 0.8939066276555145, "ewc_loss": 0.0476473793387413, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019204999262001365, "grad_norm": 5.6306257247924805, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8595553636550903, "num_tokens": 268079651.0, "step": 7027 }, { "epoch": 0.8940338379341051, "ewc_loss": 0.04741498827934265, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001909467828227207, "grad_norm": 5.63184118270874, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8630950450897217, "num_tokens": 268114021.0, "step": 7028 }, { "epoch": 0.8941610482126956, "ewc_loss": 0.047461580485105515, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001914126769406721, "grad_norm": 5.618342399597168, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8587343096733093, "num_tokens": 268155315.0, "step": 7029 }, { "epoch": 0.8942882584912861, "ewc_loss": 0.04758160561323166, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019139223149977624, "grad_norm": 5.620869159698486, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.857693076133728, "num_tokens": 268199312.0, "step": 7030 }, { "epoch": 0.8944154687698767, "ewc_loss": 0.04745111241936684, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019130799046251923, "grad_norm": 5.617326736450195, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8664019107818604, "num_tokens": 268236955.0, "step": 7031 }, { "epoch": 0.8945426790484671, "ewc_loss": 0.04746091365814209, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019140602671541274, "grad_norm": 5.643091678619385, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8576792478561401, "num_tokens": 268272183.0, "step": 7032 }, { "epoch": 0.8946698893270576, "ewc_loss": 0.04742192104458809, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019101609359495342, "grad_norm": 5.66138219833374, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8506906032562256, "num_tokens": 268305407.0, "step": 7033 }, { "epoch": 0.8947970996056481, "ewc_loss": 0.04739910736680031, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019078794866800308, "grad_norm": 5.679852485656738, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8713189959526062, "num_tokens": 268340443.0, "step": 7034 }, { "epoch": 0.8949243098842387, "ewc_loss": 0.04742242023348808, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019102108490187675, "grad_norm": 5.604292869567871, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8786866664886475, "num_tokens": 268375597.0, "step": 7035 }, { "epoch": 0.8950515201628292, "ewc_loss": 0.04740167036652565, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019081357459072024, "grad_norm": 5.618357181549072, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8547071218490601, "num_tokens": 268412872.0, "step": 7036 }, { "epoch": 0.8951787304414197, "ewc_loss": 0.04745938628911972, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019139073265250772, "grad_norm": 5.642776966094971, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8604888916015625, "num_tokens": 268452021.0, "step": 7037 }, { "epoch": 0.8953059407200101, "ewc_loss": 0.04743940755724907, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019119094940833747, "grad_norm": 5.592915058135986, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8689396381378174, "num_tokens": 268486354.0, "step": 7038 }, { "epoch": 0.8954331509986007, "ewc_loss": 0.04743606969714165, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001911575673148036, "grad_norm": 5.6550445556640625, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8549602031707764, "num_tokens": 268519918.0, "step": 7039 }, { "epoch": 0.8955603612771912, "ewc_loss": 0.04743177443742752, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019111461006104946, "grad_norm": 5.623194217681885, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8523327112197876, "num_tokens": 268554259.0, "step": 7040 }, { "epoch": 0.8956875715557817, "ewc_loss": 0.04747675731778145, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019156443886458874, "grad_norm": 5.617584228515625, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8540993332862854, "num_tokens": 268595785.0, "step": 7041 }, { "epoch": 0.8958147818343722, "ewc_loss": 0.0474700927734375, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019149782019667327, "grad_norm": 5.605449676513672, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.844497561454773, "num_tokens": 268640805.0, "step": 7042 }, { "epoch": 0.8959419921129628, "ewc_loss": 0.04746536538004875, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019145052647218108, "grad_norm": 5.581688404083252, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8655823469161987, "num_tokens": 268682446.0, "step": 7043 }, { "epoch": 0.8960692023915532, "ewc_loss": 0.04749760776758194, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019177293870598078, "grad_norm": 5.62457799911499, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8694583177566528, "num_tokens": 268720262.0, "step": 7044 }, { "epoch": 0.8961964126701437, "ewc_loss": 0.04747972637414932, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019159415387548506, "grad_norm": 5.63587760925293, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8481428623199463, "num_tokens": 268760519.0, "step": 7045 }, { "epoch": 0.8963236229487342, "ewc_loss": 0.04749462381005287, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019174312183167785, "grad_norm": 5.636154651641846, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8526893258094788, "num_tokens": 268796621.0, "step": 7046 }, { "epoch": 0.8964508332273248, "ewc_loss": 0.04749227315187454, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019171962048858404, "grad_norm": 5.675908088684082, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8685298562049866, "num_tokens": 268830527.0, "step": 7047 }, { "epoch": 0.8965780435059153, "ewc_loss": 0.0474715456366539, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001915123575599864, "grad_norm": 5.653326034545898, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8654197454452515, "num_tokens": 268862112.0, "step": 7048 }, { "epoch": 0.8967052537845058, "ewc_loss": 0.047475799918174744, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019155486370436847, "grad_norm": 5.660301208496094, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8592408299446106, "num_tokens": 268901089.0, "step": 7049 }, { "epoch": 0.8968324640630962, "ewc_loss": 0.047407396137714386, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001908708509290591, "grad_norm": 5.582430362701416, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8618001937866211, "num_tokens": 268942766.0, "step": 7050 }, { "epoch": 0.8969596743416868, "ewc_loss": 0.047446541488170624, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019126231200061738, "grad_norm": 5.675321578979492, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8664517998695374, "num_tokens": 268978642.0, "step": 7051 }, { "epoch": 0.8970868846202773, "ewc_loss": 0.04747854918241501, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019158236682415009, "grad_norm": 5.627047538757324, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8667032718658447, "num_tokens": 269016048.0, "step": 7052 }, { "epoch": 0.8972140948988678, "ewc_loss": 0.04744318500161171, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019122872618027031, "grad_norm": 5.688265800476074, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8608312606811523, "num_tokens": 269054381.0, "step": 7053 }, { "epoch": 0.8973413051774584, "ewc_loss": 0.047444168478250504, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001912385632749647, "grad_norm": 5.653260231018066, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8649693727493286, "num_tokens": 269090672.0, "step": 7054 }, { "epoch": 0.8974685154560489, "ewc_loss": 0.04741407930850983, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001909376442199573, "grad_norm": 5.604823112487793, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8639295101165771, "num_tokens": 269130231.0, "step": 7055 }, { "epoch": 0.8975957257346393, "ewc_loss": 0.047457944601774216, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001913763117045164, "grad_norm": 5.662363529205322, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8647074699401855, "num_tokens": 269166282.0, "step": 7056 }, { "epoch": 0.8977229360132298, "ewc_loss": 0.047454893589019775, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019134582544211298, "grad_norm": 5.635319232940674, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8642309308052063, "num_tokens": 269205326.0, "step": 7057 }, { "epoch": 0.8978501462918204, "ewc_loss": 0.04756665602326393, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019124273967463523, "grad_norm": 5.623753547668457, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8628317713737488, "num_tokens": 269243903.0, "step": 7058 }, { "epoch": 0.8979773565704109, "ewc_loss": 0.047461312264204025, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019140999938827008, "grad_norm": 5.65427303314209, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8544750809669495, "num_tokens": 269288330.0, "step": 7059 }, { "epoch": 0.8981045668490014, "ewc_loss": 0.04745342209935188, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019133109890390188, "grad_norm": 5.640920162200928, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8507083654403687, "num_tokens": 269328731.0, "step": 7060 }, { "epoch": 0.898231777127592, "ewc_loss": 0.047487013041973114, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019166702986694872, "grad_norm": 5.657944202423096, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8618131875991821, "num_tokens": 269369576.0, "step": 7061 }, { "epoch": 0.8983589874061825, "ewc_loss": 0.04748677834868431, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001916646579047665, "grad_norm": 5.662973880767822, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8591113090515137, "num_tokens": 269407734.0, "step": 7062 }, { "epoch": 0.8984861976847729, "ewc_loss": 0.04744084179401398, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001912052684929222, "grad_norm": 5.683474063873291, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8409918546676636, "num_tokens": 269444042.0, "step": 7063 }, { "epoch": 0.8986134079633634, "ewc_loss": 0.04748356342315674, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019163251272402704, "grad_norm": 5.6788225173950195, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8837968111038208, "num_tokens": 269480979.0, "step": 7064 }, { "epoch": 0.898740618241954, "ewc_loss": 0.047434888780117035, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001911457657115534, "grad_norm": 5.651609420776367, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8602386713027954, "num_tokens": 269515594.0, "step": 7065 }, { "epoch": 0.8988678285205445, "ewc_loss": 0.04741857945919037, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001909826823975891, "grad_norm": 5.640333652496338, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8627380728721619, "num_tokens": 269556974.0, "step": 7066 }, { "epoch": 0.898995038799135, "ewc_loss": 0.047405414283275604, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001908510021166876, "grad_norm": 5.724687576293945, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8555692434310913, "num_tokens": 269591088.0, "step": 7067 }, { "epoch": 0.8991222490777255, "ewc_loss": 0.04739899933338165, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001907868863781914, "grad_norm": 5.636292457580566, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.869418740272522, "num_tokens": 269627921.0, "step": 7068 }, { "epoch": 0.899249459356316, "ewc_loss": 0.04752887040376663, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019086488464381546, "grad_norm": 5.6856255531311035, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8445266485214233, "num_tokens": 269666586.0, "step": 7069 }, { "epoch": 0.8993766696349065, "ewc_loss": 0.0474332794547081, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001911296567413956, "grad_norm": 5.69439697265625, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8640390634536743, "num_tokens": 269701205.0, "step": 7070 }, { "epoch": 0.899503879913497, "ewc_loss": 0.04743562638759613, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001911531580844894, "grad_norm": 5.678663730621338, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8742175698280334, "num_tokens": 269739982.0, "step": 7071 }, { "epoch": 0.8996310901920875, "ewc_loss": 0.04741284251213074, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001909252896439284, "grad_norm": 5.695245742797852, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8625494837760925, "num_tokens": 269773048.0, "step": 7072 }, { "epoch": 0.8997583004706781, "ewc_loss": 0.047433286905288696, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019112974405288696, "grad_norm": 5.688645839691162, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8602207899093628, "num_tokens": 269811343.0, "step": 7073 }, { "epoch": 0.8998855107492686, "ewc_loss": 0.047419965267181396, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019099650671705604, "grad_norm": 5.625619888305664, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8653842210769653, "num_tokens": 269851601.0, "step": 7074 }, { "epoch": 0.900012721027859, "ewc_loss": 0.04739145189523697, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019071137649007142, "grad_norm": 5.6235198974609375, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8695681095123291, "num_tokens": 269891396.0, "step": 7075 }, { "epoch": 0.9001399313064495, "ewc_loss": 0.047455862164497375, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019135551701765507, "grad_norm": 5.6624836921691895, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8603214621543884, "num_tokens": 269930292.0, "step": 7076 }, { "epoch": 0.9002671415850401, "ewc_loss": 0.04746771603822708, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001914740278152749, "grad_norm": 5.694592475891113, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8333770036697388, "num_tokens": 269968464.0, "step": 7077 }, { "epoch": 0.9003943518636306, "ewc_loss": 0.047457076609134674, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019136765331495553, "grad_norm": 5.646563529968262, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8475635051727295, "num_tokens": 270011569.0, "step": 7078 }, { "epoch": 0.9005215621422211, "ewc_loss": 0.0474688857793808, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001914857275551185, "grad_norm": 5.642249584197998, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8573652505874634, "num_tokens": 270050608.0, "step": 7079 }, { "epoch": 0.9006487724208116, "ewc_loss": 0.047481656074523926, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019161343516316265, "grad_norm": 5.621613502502441, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8471744060516357, "num_tokens": 270096100.0, "step": 7080 }, { "epoch": 0.9007759826994021, "ewc_loss": 0.0475241094827652, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019203798728995025, "grad_norm": 5.631536960601807, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8721780776977539, "num_tokens": 270140915.0, "step": 7081 }, { "epoch": 0.9009031929779926, "ewc_loss": 0.047440607100725174, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019120295473840088, "grad_norm": 5.6201863288879395, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8610100746154785, "num_tokens": 270184233.0, "step": 7082 }, { "epoch": 0.9010304032565831, "ewc_loss": 0.047494787722826004, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019174475164618343, "grad_norm": 5.658615589141846, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8589285612106323, "num_tokens": 270223889.0, "step": 7083 }, { "epoch": 0.9011576135351737, "ewc_loss": 0.04750249534845352, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001918218331411481, "grad_norm": 5.656729698181152, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8666902184486389, "num_tokens": 270266357.0, "step": 7084 }, { "epoch": 0.9012848238137642, "ewc_loss": 0.04762598127126694, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019183600670658052, "grad_norm": 5.699596881866455, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8687182068824768, "num_tokens": 270298252.0, "step": 7085 }, { "epoch": 0.9014120340923547, "ewc_loss": 0.04738380014896393, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001906348770717159, "grad_norm": 5.617704391479492, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8541625738143921, "num_tokens": 270343187.0, "step": 7086 }, { "epoch": 0.9015392443709451, "ewc_loss": 0.047477975487709045, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019157660426571965, "grad_norm": 5.672008037567139, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8495877981185913, "num_tokens": 270385094.0, "step": 7087 }, { "epoch": 0.9016664546495357, "ewc_loss": 0.04748791083693504, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019167597929481417, "grad_norm": 5.653137683868408, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8484402298927307, "num_tokens": 270421772.0, "step": 7088 }, { "epoch": 0.9017936649281262, "ewc_loss": 0.047457076609134674, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019136766786687076, "grad_norm": 5.612260818481445, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8422056436538696, "num_tokens": 270465033.0, "step": 7089 }, { "epoch": 0.9019208752067167, "ewc_loss": 0.047660648822784424, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019218266243115067, "grad_norm": 5.718268871307373, "learning_rate": 1e-06, "loss": 0.3839, "mean_token_accuracy": 0.8833746910095215, "num_tokens": 270501370.0, "step": 7090 }, { "epoch": 0.9020480854853072, "ewc_loss": 0.047538451850414276, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019218138186261058, "grad_norm": 5.630883693695068, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8653584718704224, "num_tokens": 270543571.0, "step": 7091 }, { "epoch": 0.9021752957638978, "ewc_loss": 0.04752100259065628, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.000192006875295192, "grad_norm": 5.659631252288818, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.870521068572998, "num_tokens": 270580635.0, "step": 7092 }, { "epoch": 0.9023025060424882, "ewc_loss": 0.04755315184593201, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019232839986216277, "grad_norm": 5.6988139152526855, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8739801645278931, "num_tokens": 270616534.0, "step": 7093 }, { "epoch": 0.9024297163210787, "ewc_loss": 0.04754924029111862, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019228925521019846, "grad_norm": 5.647607326507568, "learning_rate": 1e-06, "loss": 0.5253, "mean_token_accuracy": 0.8423218727111816, "num_tokens": 270653998.0, "step": 7094 }, { "epoch": 0.9025569265996692, "ewc_loss": 0.04754824936389923, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019227935990784317, "grad_norm": 5.66497802734375, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8607565760612488, "num_tokens": 270688023.0, "step": 7095 }, { "epoch": 0.9026841368782598, "ewc_loss": 0.047591134905815125, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019270824850536883, "grad_norm": 5.719128131866455, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8442111611366272, "num_tokens": 270721298.0, "step": 7096 }, { "epoch": 0.9028113471568503, "ewc_loss": 0.04759250208735466, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019272189820185304, "grad_norm": 5.61890983581543, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8663602471351624, "num_tokens": 270762182.0, "step": 7097 }, { "epoch": 0.9029385574354408, "ewc_loss": 0.047581788152456284, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019261475245002657, "grad_norm": 5.674732208251953, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8759500980377197, "num_tokens": 270799577.0, "step": 7098 }, { "epoch": 0.9030657677140312, "ewc_loss": 0.04759599640965462, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019275683735031635, "grad_norm": 5.610896587371826, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8622418642044067, "num_tokens": 270840218.0, "step": 7099 }, { "epoch": 0.9031929779926218, "ewc_loss": 0.04762222617864609, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019301913562230766, "grad_norm": 5.685360908508301, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8812663555145264, "num_tokens": 270872982.0, "step": 7100 }, { "epoch": 0.9033201882712123, "ewc_loss": 0.047638121992349625, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019317810074426234, "grad_norm": 5.671611785888672, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8663265109062195, "num_tokens": 270908381.0, "step": 7101 }, { "epoch": 0.9034473985498028, "ewc_loss": 0.047655221074819565, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019334908574819565, "grad_norm": 5.6485090255737305, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8592649698257446, "num_tokens": 270949217.0, "step": 7102 }, { "epoch": 0.9035746088283934, "ewc_loss": 0.04761255532503128, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001929224090417847, "grad_norm": 6.583992004394531, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.844122588634491, "num_tokens": 270989667.0, "step": 7103 }, { "epoch": 0.9037018191069839, "ewc_loss": 0.04799409955739975, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019673786300700158, "grad_norm": 5.615842819213867, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8510979413986206, "num_tokens": 271030395.0, "step": 7104 }, { "epoch": 0.9038290293855743, "ewc_loss": 0.047390688210725784, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019070375128649175, "grad_norm": 5.766315937042236, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8436123132705688, "num_tokens": 271068239.0, "step": 7105 }, { "epoch": 0.9039562396641648, "ewc_loss": 0.04760461300611496, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019284301379229873, "grad_norm": 5.651870250701904, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8545351624488831, "num_tokens": 271110294.0, "step": 7106 }, { "epoch": 0.9040834499427554, "ewc_loss": 0.047532081604003906, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019211771723348647, "grad_norm": 5.695348739624023, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8783904910087585, "num_tokens": 271143288.0, "step": 7107 }, { "epoch": 0.9042106602213459, "ewc_loss": 0.047565482556819916, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001924516836879775, "grad_norm": 5.6760711669921875, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8681005835533142, "num_tokens": 271181183.0, "step": 7108 }, { "epoch": 0.9043378704999364, "ewc_loss": 0.04758395999670029, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019263649301137775, "grad_norm": 5.667892932891846, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8593400716781616, "num_tokens": 271216201.0, "step": 7109 }, { "epoch": 0.9044650807785269, "ewc_loss": 0.04757913574576378, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019258822430856526, "grad_norm": 5.673151016235352, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8671729564666748, "num_tokens": 271250158.0, "step": 7110 }, { "epoch": 0.9045922910571175, "ewc_loss": 0.047535981982946396, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019215668726246804, "grad_norm": 5.612749099731445, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8733644485473633, "num_tokens": 271289994.0, "step": 7111 }, { "epoch": 0.9047195013357079, "ewc_loss": 0.0476156547665596, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019295340462122113, "grad_norm": 5.686749458312988, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8661349415779114, "num_tokens": 271325782.0, "step": 7112 }, { "epoch": 0.9048467116142984, "ewc_loss": 0.047595396637916565, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019275082740932703, "grad_norm": 5.760207176208496, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8474551439285278, "num_tokens": 271361604.0, "step": 7113 }, { "epoch": 0.9049739218928889, "ewc_loss": 0.04758869856595993, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019268387404736131, "grad_norm": 5.680366039276123, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8702596426010132, "num_tokens": 271398520.0, "step": 7114 }, { "epoch": 0.9051011321714795, "ewc_loss": 0.04758474975824356, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001926443655975163, "grad_norm": 5.661589622497559, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8551583290100098, "num_tokens": 271445138.0, "step": 7115 }, { "epoch": 0.90522834245007, "ewc_loss": 0.04776247963309288, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019320097635500133, "grad_norm": 6.639081954956055, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.865891695022583, "num_tokens": 271479147.0, "step": 7116 }, { "epoch": 0.9053555527286605, "ewc_loss": 0.04801804572343826, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019697735842783004, "grad_norm": 5.645402431488037, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8629223108291626, "num_tokens": 271512477.0, "step": 7117 }, { "epoch": 0.905482763007251, "ewc_loss": 0.04739955812692642, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019079244520980865, "grad_norm": 5.75758171081543, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8473759889602661, "num_tokens": 271552355.0, "step": 7118 }, { "epoch": 0.9056099732858415, "ewc_loss": 0.04760294407606125, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019282633729744703, "grad_norm": 5.666835784912109, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8577284812927246, "num_tokens": 271595825.0, "step": 7119 }, { "epoch": 0.905737183564432, "ewc_loss": 0.047560371458530426, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001924005919136107, "grad_norm": 5.756448268890381, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8492000102996826, "num_tokens": 271627384.0, "step": 7120 }, { "epoch": 0.9058643938430225, "ewc_loss": 0.04756786674261093, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019247551972512156, "grad_norm": 5.658149719238281, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8709909915924072, "num_tokens": 271666711.0, "step": 7121 }, { "epoch": 0.9059916041216131, "ewc_loss": 0.04765971750020981, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019217336375731975, "grad_norm": 5.712690830230713, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8670645356178284, "num_tokens": 271703145.0, "step": 7122 }, { "epoch": 0.9061188144002036, "ewc_loss": 0.047571659088134766, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001925134565681219, "grad_norm": 5.666982650756836, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8499373197555542, "num_tokens": 271743131.0, "step": 7123 }, { "epoch": 0.906246024678794, "ewc_loss": 0.0476265475153923, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019306232570670545, "grad_norm": 5.755231857299805, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8649073243141174, "num_tokens": 271778245.0, "step": 7124 }, { "epoch": 0.9063732349573845, "ewc_loss": 0.04757595807313919, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019255647202953696, "grad_norm": 5.650370121002197, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.85335773229599, "num_tokens": 271817279.0, "step": 7125 }, { "epoch": 0.9065004452359751, "ewc_loss": 0.0476006343960762, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019280319975223392, "grad_norm": 5.668256759643555, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8689416646957397, "num_tokens": 271857136.0, "step": 7126 }, { "epoch": 0.9066276555145656, "ewc_loss": 0.0476001501083374, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019279838306829333, "grad_norm": 5.810146808624268, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8703667521476746, "num_tokens": 271891651.0, "step": 7127 }, { "epoch": 0.9067548657931561, "ewc_loss": 0.04752340540289879, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001920309296110645, "grad_norm": 5.626592636108398, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8659161329269409, "num_tokens": 271925805.0, "step": 7128 }, { "epoch": 0.9068820760717466, "ewc_loss": 0.047574177384376526, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019253867503721267, "grad_norm": 5.700860023498535, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8651697039604187, "num_tokens": 271964954.0, "step": 7129 }, { "epoch": 0.9070092863503371, "ewc_loss": 0.04761742055416107, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001929710851982236, "grad_norm": 5.737709045410156, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8606529831886292, "num_tokens": 272000331.0, "step": 7130 }, { "epoch": 0.9071364966289276, "ewc_loss": 0.047582414001226425, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019262100977357477, "grad_norm": 5.65628719329834, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8685741424560547, "num_tokens": 272037853.0, "step": 7131 }, { "epoch": 0.9072637069075181, "ewc_loss": 0.047565385699272156, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019245075236540288, "grad_norm": 5.644143581390381, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.871893048286438, "num_tokens": 272077663.0, "step": 7132 }, { "epoch": 0.9073909171861086, "ewc_loss": 0.04762401431798935, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019303699082229286, "grad_norm": 5.6951494216918945, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8572220802307129, "num_tokens": 272116942.0, "step": 7133 }, { "epoch": 0.9075181274646992, "ewc_loss": 0.04761764407157898, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019297331164125353, "grad_norm": 5.673608779907227, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8547346591949463, "num_tokens": 272151319.0, "step": 7134 }, { "epoch": 0.9076453377432897, "ewc_loss": 0.047640129923820496, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019319816783536226, "grad_norm": 5.748292922973633, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8480203747749329, "num_tokens": 272190817.0, "step": 7135 }, { "epoch": 0.9077725480218801, "ewc_loss": 0.04763718321919441, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019316871475894004, "grad_norm": 5.665021896362305, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8556220531463623, "num_tokens": 272230461.0, "step": 7136 }, { "epoch": 0.9078997583004706, "ewc_loss": 0.047624729573726654, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001930441940203309, "grad_norm": 5.739058494567871, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8559876680374146, "num_tokens": 272268510.0, "step": 7137 }, { "epoch": 0.9080269685790612, "ewc_loss": 0.047639183700084686, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001931886945385486, "grad_norm": 5.639764308929443, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8612615466117859, "num_tokens": 272312430.0, "step": 7138 }, { "epoch": 0.9081541788576517, "ewc_loss": 0.04758910834789276, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019268796313554049, "grad_norm": 5.716775417327881, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8600260019302368, "num_tokens": 272348437.0, "step": 7139 }, { "epoch": 0.9082813891362422, "ewc_loss": 0.0476815365254879, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019361224258318543, "grad_norm": 5.757989883422852, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8535263538360596, "num_tokens": 272378902.0, "step": 7140 }, { "epoch": 0.9084085994148328, "ewc_loss": 0.04770023003220558, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019379917648620903, "grad_norm": 5.691729545593262, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8743910789489746, "num_tokens": 272421257.0, "step": 7141 }, { "epoch": 0.9085358096934232, "ewc_loss": 0.04762085899710655, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.000193005456821993, "grad_norm": 5.665402412414551, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8603100180625916, "num_tokens": 272459800.0, "step": 7142 }, { "epoch": 0.9086630199720137, "ewc_loss": 0.04770352691411972, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019383216567803174, "grad_norm": 5.656487941741943, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.8801860809326172, "num_tokens": 272501245.0, "step": 7143 }, { "epoch": 0.9087902302506042, "ewc_loss": 0.047697361558675766, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019377049466129392, "grad_norm": 5.73328971862793, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8622236847877502, "num_tokens": 272535873.0, "step": 7144 }, { "epoch": 0.9089174405291948, "ewc_loss": 0.047703102231025696, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019382790196686983, "grad_norm": 5.704706192016602, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8590215444564819, "num_tokens": 272575213.0, "step": 7145 }, { "epoch": 0.9090446508077853, "ewc_loss": 0.04767246171832085, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019352149683982134, "grad_norm": 5.712473392486572, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8639419674873352, "num_tokens": 272611960.0, "step": 7146 }, { "epoch": 0.9091718610863758, "ewc_loss": 0.04766199737787247, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019341686856932938, "grad_norm": 5.678599834442139, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8614683151245117, "num_tokens": 272650895.0, "step": 7147 }, { "epoch": 0.9092990713649662, "ewc_loss": 0.047662414610385895, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019342100131325424, "grad_norm": 5.7303667068481445, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8612058758735657, "num_tokens": 272690617.0, "step": 7148 }, { "epoch": 0.9094262816435568, "ewc_loss": 0.047659579664468765, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001933926687343046, "grad_norm": 5.677793025970459, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8532192707061768, "num_tokens": 272732526.0, "step": 7149 }, { "epoch": 0.9095534919221473, "ewc_loss": 0.04762950539588928, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019309193885419518, "grad_norm": 5.696332931518555, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8591925501823425, "num_tokens": 272769882.0, "step": 7150 }, { "epoch": 0.9096807022007378, "ewc_loss": 0.047634921967983246, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001931461156345904, "grad_norm": 5.734428405761719, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8642138838768005, "num_tokens": 272809614.0, "step": 7151 }, { "epoch": 0.9098079124793284, "ewc_loss": 0.04762705788016319, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019306746253278106, "grad_norm": 5.710865020751953, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8541102409362793, "num_tokens": 272844819.0, "step": 7152 }, { "epoch": 0.9099351227579189, "ewc_loss": 0.04758540540933609, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019265094306319952, "grad_norm": 5.688604354858398, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8666682243347168, "num_tokens": 272882798.0, "step": 7153 }, { "epoch": 0.9100623330365093, "ewc_loss": 0.04762149602174759, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019301183056086302, "grad_norm": 5.721742153167725, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.862831711769104, "num_tokens": 272917472.0, "step": 7154 }, { "epoch": 0.9101895433150998, "ewc_loss": 0.0475802943110466, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001925997930811718, "grad_norm": 5.6635966300964355, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.850414514541626, "num_tokens": 272959947.0, "step": 7155 }, { "epoch": 0.9103167535936904, "ewc_loss": 0.047644853591918945, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.000193245432456024, "grad_norm": 5.7807698249816895, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8446564674377441, "num_tokens": 272998194.0, "step": 7156 }, { "epoch": 0.9104439638722809, "ewc_loss": 0.047664105892181396, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019343792519066483, "grad_norm": 5.761377334594727, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8790802359580994, "num_tokens": 273034688.0, "step": 7157 }, { "epoch": 0.9105711741508714, "ewc_loss": 0.04760134965181351, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001928103738464415, "grad_norm": 5.652781009674072, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8713624477386475, "num_tokens": 273071575.0, "step": 7158 }, { "epoch": 0.9106983844294619, "ewc_loss": 0.047613441944122314, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019293127115815878, "grad_norm": 5.892240047454834, "learning_rate": 1e-06, "loss": 0.54, "mean_token_accuracy": 0.8364766836166382, "num_tokens": 273106106.0, "step": 7159 }, { "epoch": 0.9108255947080524, "ewc_loss": 0.04760098457336426, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019280670676380396, "grad_norm": 5.691837310791016, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8561880588531494, "num_tokens": 273142959.0, "step": 7160 }, { "epoch": 0.9109528049866429, "ewc_loss": 0.047547899186611176, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019227586744818836, "grad_norm": 5.8080220222473145, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8460609912872314, "num_tokens": 273182781.0, "step": 7161 }, { "epoch": 0.9110800152652334, "ewc_loss": 0.04754826799035072, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019227956363465637, "grad_norm": 5.615095615386963, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8681856989860535, "num_tokens": 273225738.0, "step": 7162 }, { "epoch": 0.9112072255438239, "ewc_loss": 0.04761224985122681, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001929193822434172, "grad_norm": 5.869537830352783, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8687669038772583, "num_tokens": 273269166.0, "step": 7163 }, { "epoch": 0.9113344358224145, "ewc_loss": 0.0475766658782959, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019256354426033795, "grad_norm": 5.661247253417969, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8560028076171875, "num_tokens": 273309093.0, "step": 7164 }, { "epoch": 0.911461646101005, "ewc_loss": 0.04762991890311241, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019309607159812003, "grad_norm": 5.953186511993408, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8663841485977173, "num_tokens": 273346857.0, "step": 7165 }, { "epoch": 0.9115888563795955, "ewc_loss": 0.0476100891828537, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019289777264930308, "grad_norm": 5.660534381866455, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8569203019142151, "num_tokens": 273385720.0, "step": 7166 }, { "epoch": 0.9117160666581859, "ewc_loss": 0.04759691655635834, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019276601960882545, "grad_norm": 6.068097114562988, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8618115782737732, "num_tokens": 273431079.0, "step": 7167 }, { "epoch": 0.9118432769367765, "ewc_loss": 0.047660067677497864, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001933975436259061, "grad_norm": 5.625095367431641, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8619713187217712, "num_tokens": 273477316.0, "step": 7168 }, { "epoch": 0.911970487215367, "ewc_loss": 0.047544099390506744, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001922378723975271, "grad_norm": 5.87899923324585, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8646031618118286, "num_tokens": 273512831.0, "step": 7169 }, { "epoch": 0.9120976974939575, "ewc_loss": 0.0476512610912323, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001933095045387745, "grad_norm": 5.767984390258789, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8549739122390747, "num_tokens": 273551927.0, "step": 7170 }, { "epoch": 0.912224907772548, "ewc_loss": 0.04746554046869278, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019145230180583894, "grad_norm": 5.730923175811768, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8654454946517944, "num_tokens": 273583053.0, "step": 7171 }, { "epoch": 0.9123521180511386, "ewc_loss": 0.04760223627090454, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001928192505147308, "grad_norm": 5.83012056350708, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8513949513435364, "num_tokens": 273617953.0, "step": 7172 }, { "epoch": 0.912479328329729, "ewc_loss": 0.047535814344882965, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019215501379221678, "grad_norm": 5.744566917419434, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8462001085281372, "num_tokens": 273649959.0, "step": 7173 }, { "epoch": 0.9126065386083195, "ewc_loss": 0.04756305366754532, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019242741109337658, "grad_norm": 5.728017330169678, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8508492708206177, "num_tokens": 273682666.0, "step": 7174 }, { "epoch": 0.9127337488869101, "ewc_loss": 0.047618843615055084, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019298528786748648, "grad_norm": 5.748122692108154, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8798460364341736, "num_tokens": 273718641.0, "step": 7175 }, { "epoch": 0.9128609591655006, "ewc_loss": 0.04761470854282379, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019294396042823792, "grad_norm": 5.717715740203857, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8711869120597839, "num_tokens": 273760502.0, "step": 7176 }, { "epoch": 0.9129881694440911, "ewc_loss": 0.04764937609434128, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.00019329061615280807, "grad_norm": 5.683615684509277, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8563714027404785, "num_tokens": 273795068.0, "step": 7177 }, { "epoch": 0.9131153797226816, "ewc_loss": 0.04758721590042114, "ewc_loss_diag": 2.8371810913085938e-05, "ewc_loss_parallel": 0.0001926690456457436, "grad_norm": 5.854124069213867, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8429480791091919, "num_tokens": 273825099.0, "step": 7178 }, { "epoch": 0.9132425900012721, "ewc_loss": 0.04780140146613121, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019359018187969923, "grad_norm": 5.681265830993652, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8667370080947876, "num_tokens": 273862566.0, "step": 7179 }, { "epoch": 0.9133698002798626, "ewc_loss": 0.047737110406160355, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019294727826490998, "grad_norm": 5.750927925109863, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.861281156539917, "num_tokens": 273896107.0, "step": 7180 }, { "epoch": 0.9134970105584531, "ewc_loss": 0.047784797847270966, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019342417363077402, "grad_norm": 5.679140090942383, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8653364777565002, "num_tokens": 273934066.0, "step": 7181 }, { "epoch": 0.9136242208370436, "ewc_loss": 0.0477629154920578, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001932053273776546, "grad_norm": 5.646899223327637, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.860027015209198, "num_tokens": 273971843.0, "step": 7182 }, { "epoch": 0.9137514311156342, "ewc_loss": 0.04780516400933266, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001936278131324798, "grad_norm": 5.7643818855285645, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8538451194763184, "num_tokens": 274001646.0, "step": 7183 }, { "epoch": 0.9138786413942247, "ewc_loss": 0.047859638929367065, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019417253497522324, "grad_norm": 5.678923606872559, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8465168476104736, "num_tokens": 274042656.0, "step": 7184 }, { "epoch": 0.9140058516728151, "ewc_loss": 0.047870539128780365, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019428158702794462, "grad_norm": 5.6514105796813965, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8474034070968628, "num_tokens": 274084609.0, "step": 7185 }, { "epoch": 0.9141330619514056, "ewc_loss": 0.04788018390536308, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019437800801824778, "grad_norm": 5.662754058837891, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8640556335449219, "num_tokens": 274127503.0, "step": 7186 }, { "epoch": 0.9142602722299962, "ewc_loss": 0.047942034900188446, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019499653717502952, "grad_norm": 5.6880598068237305, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8652764558792114, "num_tokens": 274164126.0, "step": 7187 }, { "epoch": 0.9143874825085867, "ewc_loss": 0.04796215519309044, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001951977174030617, "grad_norm": 5.796895503997803, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8616836071014404, "num_tokens": 274198104.0, "step": 7188 }, { "epoch": 0.9145146927871772, "ewc_loss": 0.04791189730167389, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019469516701065004, "grad_norm": 5.700531482696533, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8502050638198853, "num_tokens": 274233487.0, "step": 7189 }, { "epoch": 0.9146419030657678, "ewc_loss": 0.047918058931827545, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019475673616398126, "grad_norm": 5.687723159790039, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.863667368888855, "num_tokens": 274267317.0, "step": 7190 }, { "epoch": 0.9147691133443582, "ewc_loss": 0.04794497415423393, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001950259174918756, "grad_norm": 5.664052963256836, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8532180786132812, "num_tokens": 274304169.0, "step": 7191 }, { "epoch": 0.9148963236229487, "ewc_loss": 0.047931820154190063, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019489436817821115, "grad_norm": 5.7084879875183105, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8611871004104614, "num_tokens": 274343265.0, "step": 7192 }, { "epoch": 0.9150235339015392, "ewc_loss": 0.047892339527606964, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019449956016615033, "grad_norm": 5.702795028686523, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8439964056015015, "num_tokens": 274380756.0, "step": 7193 }, { "epoch": 0.9151507441801298, "ewc_loss": 0.04790809750556946, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019465717195998877, "grad_norm": 5.706381320953369, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8707582354545593, "num_tokens": 274422035.0, "step": 7194 }, { "epoch": 0.9152779544587203, "ewc_loss": 0.04793897271156311, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001949659053934738, "grad_norm": 5.691088676452637, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.851658046245575, "num_tokens": 274460875.0, "step": 7195 }, { "epoch": 0.9154051647373108, "ewc_loss": 0.04788092523813248, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019438541494309902, "grad_norm": 5.7241010665893555, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8638667464256287, "num_tokens": 274495807.0, "step": 7196 }, { "epoch": 0.9155323750159012, "ewc_loss": 0.04788507521152496, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001944269024534151, "grad_norm": 5.666323184967041, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8698953986167908, "num_tokens": 274533098.0, "step": 7197 }, { "epoch": 0.9156595852944918, "ewc_loss": 0.047886431217193604, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001944404939422384, "grad_norm": 5.787777900695801, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.855859100818634, "num_tokens": 274565282.0, "step": 7198 }, { "epoch": 0.9157867955730823, "ewc_loss": 0.047938309609889984, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019495926972012967, "grad_norm": 5.719405651092529, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8645410537719727, "num_tokens": 274605401.0, "step": 7199 }, { "epoch": 0.9159140058516728, "ewc_loss": 0.04786273092031479, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001942035014508292, "grad_norm": 5.8511199951171875, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8552165031433105, "num_tokens": 274645018.0, "step": 7200 }, { "epoch": 0.9160412161302633, "ewc_loss": 0.04788593202829361, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019443548808339983, "grad_norm": 5.651837348937988, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8690817356109619, "num_tokens": 274681541.0, "step": 7201 }, { "epoch": 0.9161684264088539, "ewc_loss": 0.047876130789518356, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019433748093433678, "grad_norm": 5.755614280700684, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8643554449081421, "num_tokens": 274716883.0, "step": 7202 }, { "epoch": 0.9162956366874443, "ewc_loss": 0.04781082272529602, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001936844055308029, "grad_norm": 5.770584583282471, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.855217456817627, "num_tokens": 274753038.0, "step": 7203 }, { "epoch": 0.9164228469660348, "ewc_loss": 0.04810405150055885, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001941752852872014, "grad_norm": 5.648015975952148, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8669055700302124, "num_tokens": 274790760.0, "step": 7204 }, { "epoch": 0.9165500572446253, "ewc_loss": 0.047860901802778244, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001941851805895567, "grad_norm": 5.650949478149414, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.853773832321167, "num_tokens": 274830872.0, "step": 7205 }, { "epoch": 0.9166772675232159, "ewc_loss": 0.04786381870508194, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001942143717315048, "grad_norm": 5.6938300132751465, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8488734364509583, "num_tokens": 274869832.0, "step": 7206 }, { "epoch": 0.9168044778018064, "ewc_loss": 0.04792030155658722, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001947791752172634, "grad_norm": 5.763760089874268, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8530616760253906, "num_tokens": 274903853.0, "step": 7207 }, { "epoch": 0.9169316880803969, "ewc_loss": 0.04791977256536484, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019477390742395073, "grad_norm": 5.631088733673096, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8592111468315125, "num_tokens": 274942911.0, "step": 7208 }, { "epoch": 0.9170588983589874, "ewc_loss": 0.047891031950712204, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019448649254627526, "grad_norm": 5.65125846862793, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8680355548858643, "num_tokens": 274986427.0, "step": 7209 }, { "epoch": 0.9171861086375779, "ewc_loss": 0.04797159135341644, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019529208657331765, "grad_norm": 5.730243682861328, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8635313510894775, "num_tokens": 275021557.0, "step": 7210 }, { "epoch": 0.9173133189161684, "ewc_loss": 0.04788863658905029, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001944625109899789, "grad_norm": 5.658019542694092, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8492246866226196, "num_tokens": 275061308.0, "step": 7211 }, { "epoch": 0.9174405291947589, "ewc_loss": 0.04791979864239693, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019477415480650961, "grad_norm": 5.690755367279053, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8545205593109131, "num_tokens": 275101497.0, "step": 7212 }, { "epoch": 0.9175677394733495, "ewc_loss": 0.0478421226143837, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019399740267544985, "grad_norm": 5.694987773895264, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8491572141647339, "num_tokens": 275138317.0, "step": 7213 }, { "epoch": 0.91769494975194, "ewc_loss": 0.047938261181116104, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019495878950692713, "grad_norm": 5.653679847717285, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8602952361106873, "num_tokens": 275180119.0, "step": 7214 }, { "epoch": 0.9178221600305305, "ewc_loss": 0.04787059128284454, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019428206724114716, "grad_norm": 5.71251916885376, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8594731092453003, "num_tokens": 275212083.0, "step": 7215 }, { "epoch": 0.9179493703091209, "ewc_loss": 0.047864969819784164, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001942258677445352, "grad_norm": 5.686741828918457, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8493168354034424, "num_tokens": 275247872.0, "step": 7216 }, { "epoch": 0.9180765805877115, "ewc_loss": 0.047882333397865295, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001943995157489553, "grad_norm": 5.674434661865234, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8652117252349854, "num_tokens": 275287022.0, "step": 7217 }, { "epoch": 0.918203790866302, "ewc_loss": 0.047877237200737, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019434855494182557, "grad_norm": 5.693253040313721, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8623762130737305, "num_tokens": 275322259.0, "step": 7218 }, { "epoch": 0.9183310011448925, "ewc_loss": 0.04787915199995041, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019436769071035087, "grad_norm": 5.707769393920898, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8740711808204651, "num_tokens": 275357410.0, "step": 7219 }, { "epoch": 0.918458211423483, "ewc_loss": 0.047852009534835815, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019409626838751137, "grad_norm": 5.695691108703613, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8718949556350708, "num_tokens": 275388720.0, "step": 7220 }, { "epoch": 0.9185854217020736, "ewc_loss": 0.04788271337747574, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019440332835074514, "grad_norm": 5.678491592407227, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8660759925842285, "num_tokens": 275427699.0, "step": 7221 }, { "epoch": 0.918712631980664, "ewc_loss": 0.04781899228692055, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019376608543097973, "grad_norm": 5.583486557006836, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8640516996383667, "num_tokens": 275472220.0, "step": 7222 }, { "epoch": 0.9188398422592545, "ewc_loss": 0.04790860414505005, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019466219237074256, "grad_norm": 5.718561172485352, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8495521545410156, "num_tokens": 275506693.0, "step": 7223 }, { "epoch": 0.918967052537845, "ewc_loss": 0.04788646474480629, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019444081408437341, "grad_norm": 5.630806922912598, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8641715049743652, "num_tokens": 275547295.0, "step": 7224 }, { "epoch": 0.9190942628164356, "ewc_loss": 0.047921422868967056, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001947904092958197, "grad_norm": 5.659956932067871, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8568860292434692, "num_tokens": 275591552.0, "step": 7225 }, { "epoch": 0.9192214730950261, "ewc_loss": 0.04791703075170517, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019474646251183003, "grad_norm": 5.734163284301758, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8354751467704773, "num_tokens": 275632197.0, "step": 7226 }, { "epoch": 0.9193486833736166, "ewc_loss": 0.047892920672893524, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019450538093224168, "grad_norm": 5.72068452835083, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8531672358512878, "num_tokens": 275670680.0, "step": 7227 }, { "epoch": 0.919475893652207, "ewc_loss": 0.047855194658041, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019412812252994627, "grad_norm": 5.679997444152832, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8588829040527344, "num_tokens": 275709315.0, "step": 7228 }, { "epoch": 0.9196031039307976, "ewc_loss": 0.04781073331832886, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019368348876014352, "grad_norm": 5.752740383148193, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8577873706817627, "num_tokens": 275740405.0, "step": 7229 }, { "epoch": 0.9197303142093881, "ewc_loss": 0.047834523022174835, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019392138347029686, "grad_norm": 5.6951398849487305, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8782919645309448, "num_tokens": 275780915.0, "step": 7230 }, { "epoch": 0.9198575244879786, "ewc_loss": 0.047814056277275085, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019371673988644034, "grad_norm": 5.7078537940979, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8548781871795654, "num_tokens": 275817832.0, "step": 7231 }, { "epoch": 0.9199847347665692, "ewc_loss": 0.047779131680727005, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001933674793690443, "grad_norm": 5.652066230773926, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8635114431381226, "num_tokens": 275857621.0, "step": 7232 }, { "epoch": 0.9201119450451597, "ewc_loss": 0.047824371606111526, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.000193819883861579, "grad_norm": 5.738307952880859, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8569532036781311, "num_tokens": 275898917.0, "step": 7233 }, { "epoch": 0.9202391553237501, "ewc_loss": 0.047787413001060486, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019345029431860894, "grad_norm": 5.7410807609558105, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8649201393127441, "num_tokens": 275930151.0, "step": 7234 }, { "epoch": 0.9203663656023406, "ewc_loss": 0.04778207093477249, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019339688878972083, "grad_norm": 5.678967475891113, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.865769624710083, "num_tokens": 275966901.0, "step": 7235 }, { "epoch": 0.9204935758809312, "ewc_loss": 0.047776736319065094, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019334355602040887, "grad_norm": 5.685443878173828, "learning_rate": 1e-06, "loss": 0.3829, "mean_token_accuracy": 0.8823668956756592, "num_tokens": 276002845.0, "step": 7236 }, { "epoch": 0.9206207861595217, "ewc_loss": 0.04782938212156296, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019386997155379504, "grad_norm": 5.727042198181152, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8587502241134644, "num_tokens": 276038476.0, "step": 7237 }, { "epoch": 0.9207479964381122, "ewc_loss": 0.04778164252638817, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019339259597472847, "grad_norm": 5.662197113037109, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8553590774536133, "num_tokens": 276077867.0, "step": 7238 }, { "epoch": 0.9208752067167028, "ewc_loss": 0.04781334847211838, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019370968220755458, "grad_norm": 5.655206680297852, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8503340482711792, "num_tokens": 276123289.0, "step": 7239 }, { "epoch": 0.9210024169952932, "ewc_loss": 0.047825977206230164, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019383596372790635, "grad_norm": 5.7549872398376465, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8477611541748047, "num_tokens": 276157357.0, "step": 7240 }, { "epoch": 0.9211296272738837, "ewc_loss": 0.04786195605993271, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001941957452800125, "grad_norm": 5.648921489715576, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8811065554618835, "num_tokens": 276195548.0, "step": 7241 }, { "epoch": 0.9212568375524742, "ewc_loss": 0.04791395738720894, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019471574341878295, "grad_norm": 5.740355968475342, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8627033233642578, "num_tokens": 276233533.0, "step": 7242 }, { "epoch": 0.9213840478310648, "ewc_loss": 0.047816887497901917, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019374502880964428, "grad_norm": 5.693490505218506, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8626807332038879, "num_tokens": 276274310.0, "step": 7243 }, { "epoch": 0.9215112581096553, "ewc_loss": 0.047860756516456604, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019418373994994909, "grad_norm": 5.736538887023926, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8583847880363464, "num_tokens": 276310272.0, "step": 7244 }, { "epoch": 0.9216384683882458, "ewc_loss": 0.04783362150192261, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019391239038668573, "grad_norm": 5.679738998413086, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8549675345420837, "num_tokens": 276350334.0, "step": 7245 }, { "epoch": 0.9217656786668362, "ewc_loss": 0.04786718636751175, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019424805941525847, "grad_norm": 5.701903343200684, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8551410436630249, "num_tokens": 276393913.0, "step": 7246 }, { "epoch": 0.9218928889454268, "ewc_loss": 0.04782826453447342, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019385881023481488, "grad_norm": 5.674294471740723, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8566113710403442, "num_tokens": 276430368.0, "step": 7247 }, { "epoch": 0.9220200992240173, "ewc_loss": 0.04788053035736084, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001943815004779026, "grad_norm": 5.704833030700684, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8549751043319702, "num_tokens": 276470930.0, "step": 7248 }, { "epoch": 0.9221473095026078, "ewc_loss": 0.04781343787908554, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019371052621863782, "grad_norm": 5.715825080871582, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.865088701248169, "num_tokens": 276509359.0, "step": 7249 }, { "epoch": 0.9222745197811983, "ewc_loss": 0.04789428785443306, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019451904518064111, "grad_norm": 5.716935157775879, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8518202900886536, "num_tokens": 276546980.0, "step": 7250 }, { "epoch": 0.9224017300597889, "ewc_loss": 0.047840118408203125, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019397735013626516, "grad_norm": 5.681877613067627, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8478063344955444, "num_tokens": 276586212.0, "step": 7251 }, { "epoch": 0.9225289403383793, "ewc_loss": 0.047881852835416794, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019439469906501472, "grad_norm": 5.683608055114746, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8611869812011719, "num_tokens": 276623946.0, "step": 7252 }, { "epoch": 0.9226561506169698, "ewc_loss": 0.04792128503322601, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019478901231195778, "grad_norm": 5.716894149780273, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8555318117141724, "num_tokens": 276667567.0, "step": 7253 }, { "epoch": 0.9227833608955603, "ewc_loss": 0.04788857698440552, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019446191436145455, "grad_norm": 5.6811747550964355, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8537676334381104, "num_tokens": 276708479.0, "step": 7254 }, { "epoch": 0.9229105711741509, "ewc_loss": 0.04785941168665886, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019417027942836285, "grad_norm": 5.734443664550781, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8545048832893372, "num_tokens": 276744627.0, "step": 7255 }, { "epoch": 0.9230377814527414, "ewc_loss": 0.04794018343091011, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019497799803502858, "grad_norm": 5.687532424926758, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8609635829925537, "num_tokens": 276782830.0, "step": 7256 }, { "epoch": 0.9231649917313319, "ewc_loss": 0.047887444496154785, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001944506075233221, "grad_norm": 5.689505100250244, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.858186662197113, "num_tokens": 276825745.0, "step": 7257 }, { "epoch": 0.9232922020099223, "ewc_loss": 0.04793865606188774, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019496273307595402, "grad_norm": 5.715651035308838, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8652487993240356, "num_tokens": 276866743.0, "step": 7258 }, { "epoch": 0.9234194122885129, "ewc_loss": 0.04793289303779602, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019490509293973446, "grad_norm": 5.690555095672607, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.85502028465271, "num_tokens": 276903884.0, "step": 7259 }, { "epoch": 0.9235466225671034, "ewc_loss": 0.047926537692546844, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001948415592778474, "grad_norm": 5.678703308105469, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.865684986114502, "num_tokens": 276939142.0, "step": 7260 }, { "epoch": 0.9236738328456939, "ewc_loss": 0.04795482009649277, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001951243611983955, "grad_norm": 5.709364891052246, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8614441156387329, "num_tokens": 276979041.0, "step": 7261 }, { "epoch": 0.9238010431242845, "ewc_loss": 0.048010025173425674, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019567643175832927, "grad_norm": 5.690933704376221, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8734616041183472, "num_tokens": 277015590.0, "step": 7262 }, { "epoch": 0.923928253402875, "ewc_loss": 0.04799012094736099, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019547736155800521, "grad_norm": 5.764815330505371, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.852649986743927, "num_tokens": 277055745.0, "step": 7263 }, { "epoch": 0.9240554636814655, "ewc_loss": 0.0479709655046463, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.000195285800145939, "grad_norm": 5.700476169586182, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8553586006164551, "num_tokens": 277098856.0, "step": 7264 }, { "epoch": 0.9241826739600559, "ewc_loss": 0.04802749305963516, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001958511129487306, "grad_norm": 5.7250895500183105, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.871391773223877, "num_tokens": 277139165.0, "step": 7265 }, { "epoch": 0.9243098842386465, "ewc_loss": 0.048005081713199615, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019562701345421374, "grad_norm": 5.736440658569336, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8640460968017578, "num_tokens": 277171879.0, "step": 7266 }, { "epoch": 0.924437094517237, "ewc_loss": 0.04800056666135788, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001955818443093449, "grad_norm": 5.699103832244873, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8603779077529907, "num_tokens": 277211719.0, "step": 7267 }, { "epoch": 0.9245643047958275, "ewc_loss": 0.04801976680755615, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019577382772695273, "grad_norm": 5.722153663635254, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8506327271461487, "num_tokens": 277252675.0, "step": 7268 }, { "epoch": 0.924691515074418, "ewc_loss": 0.04798988997936249, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019547506235539913, "grad_norm": 5.777433395385742, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8481267690658569, "num_tokens": 277287529.0, "step": 7269 }, { "epoch": 0.9248187253530086, "ewc_loss": 0.04800577461719513, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019563389651011676, "grad_norm": 5.728625774383545, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8595709204673767, "num_tokens": 277328392.0, "step": 7270 }, { "epoch": 0.924945935631599, "ewc_loss": 0.047968048602342606, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001952566672116518, "grad_norm": 5.679798603057861, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8697466254234314, "num_tokens": 277368412.0, "step": 7271 }, { "epoch": 0.9250731459101895, "ewc_loss": 0.04797164350748062, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001952926249941811, "grad_norm": 5.716917514801025, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8653364777565002, "num_tokens": 277404373.0, "step": 7272 }, { "epoch": 0.92520035618878, "ewc_loss": 0.047973282635211945, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019530901045072824, "grad_norm": 5.691109657287598, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8568942546844482, "num_tokens": 277444988.0, "step": 7273 }, { "epoch": 0.9253275664673706, "ewc_loss": 0.0479658842086792, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019523499940987676, "grad_norm": 5.732907772064209, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8690972328186035, "num_tokens": 277487501.0, "step": 7274 }, { "epoch": 0.9254547767459611, "ewc_loss": 0.0479976050555706, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019555221660993993, "grad_norm": 5.684201240539551, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.857451319694519, "num_tokens": 277529056.0, "step": 7275 }, { "epoch": 0.9255819870245516, "ewc_loss": 0.047989796847105026, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019547413103282452, "grad_norm": 5.750824451446533, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8716489672660828, "num_tokens": 277563022.0, "step": 7276 }, { "epoch": 0.925709197303142, "ewc_loss": 0.048027582466602325, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019585197151172906, "grad_norm": 5.706291198730469, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8668038845062256, "num_tokens": 277603879.0, "step": 7277 }, { "epoch": 0.9258364075817326, "ewc_loss": 0.04802772402763367, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001958534267032519, "grad_norm": 5.696805477142334, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8563697934150696, "num_tokens": 277643711.0, "step": 7278 }, { "epoch": 0.9259636178603231, "ewc_loss": 0.048002809286117554, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019560428336262703, "grad_norm": 5.761846542358398, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.860327959060669, "num_tokens": 277680667.0, "step": 7279 }, { "epoch": 0.9260908281389136, "ewc_loss": 0.048019781708717346, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019577397324610502, "grad_norm": 5.730005741119385, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8580138683319092, "num_tokens": 277715548.0, "step": 7280 }, { "epoch": 0.9262180384175042, "ewc_loss": 0.048005834221839905, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019563449313864112, "grad_norm": 5.7648210525512695, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.860502302646637, "num_tokens": 277745287.0, "step": 7281 }, { "epoch": 0.9263452486960947, "ewc_loss": 0.04802674055099487, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001958435750566423, "grad_norm": 5.73800802230835, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8548476696014404, "num_tokens": 277781484.0, "step": 7282 }, { "epoch": 0.9264724589746851, "ewc_loss": 0.04801855981349945, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019576179329305887, "grad_norm": 5.728141784667969, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8628919720649719, "num_tokens": 277818429.0, "step": 7283 }, { "epoch": 0.9265996692532756, "ewc_loss": 0.047986894845962524, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019544511451385915, "grad_norm": 5.728726387023926, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8679771423339844, "num_tokens": 277853077.0, "step": 7284 }, { "epoch": 0.9267268795318662, "ewc_loss": 0.04799327999353409, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019550899742171168, "grad_norm": 5.609343528747559, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8784070611000061, "num_tokens": 277893997.0, "step": 7285 }, { "epoch": 0.9268540898104567, "ewc_loss": 0.0480647012591362, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019622317631728947, "grad_norm": 5.786587238311768, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8491482138633728, "num_tokens": 277934680.0, "step": 7286 }, { "epoch": 0.9269813000890472, "ewc_loss": 0.0480855293571949, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019643145787995309, "grad_norm": 5.650084018707275, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8525017499923706, "num_tokens": 277980370.0, "step": 7287 }, { "epoch": 0.9271085103676378, "ewc_loss": 0.048030443489551544, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001958806096808985, "grad_norm": 5.785470008850098, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.8718001842498779, "num_tokens": 278017257.0, "step": 7288 }, { "epoch": 0.9272357206462282, "ewc_loss": 0.048059649765491486, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019617265206761658, "grad_norm": 5.755281925201416, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8602731227874756, "num_tokens": 278054835.0, "step": 7289 }, { "epoch": 0.9273629309248187, "ewc_loss": 0.04804038256406784, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019597997015807778, "grad_norm": 5.710571765899658, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8589744567871094, "num_tokens": 278089510.0, "step": 7290 }, { "epoch": 0.9274901412034092, "ewc_loss": 0.04804056137800217, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001959817745955661, "grad_norm": 5.705031871795654, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8617650270462036, "num_tokens": 278126179.0, "step": 7291 }, { "epoch": 0.9276173514819998, "ewc_loss": 0.04807186871767044, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001962948590517044, "grad_norm": 5.78543758392334, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8380353450775146, "num_tokens": 278170837.0, "step": 7292 }, { "epoch": 0.9277445617605903, "ewc_loss": 0.048034727573394775, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001959234505193308, "grad_norm": 5.6790947914123535, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8645331263542175, "num_tokens": 278207410.0, "step": 7293 }, { "epoch": 0.9278717720391808, "ewc_loss": 0.048050474375486374, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019608091679401696, "grad_norm": 6.023624897003174, "learning_rate": 1e-06, "loss": 0.3956, "mean_token_accuracy": 0.8807336091995239, "num_tokens": 278240495.0, "step": 7294 }, { "epoch": 0.9279989823177712, "ewc_loss": 0.048076242208480835, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019633860210888088, "grad_norm": 5.65441370010376, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.851929247379303, "num_tokens": 278278449.0, "step": 7295 }, { "epoch": 0.9281261925963618, "ewc_loss": 0.04793085902929306, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019488476391416043, "grad_norm": 5.709706783294678, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8698738217353821, "num_tokens": 278321434.0, "step": 7296 }, { "epoch": 0.9282534028749523, "ewc_loss": 0.048008255660533905, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019565873662941158, "grad_norm": 5.7018866539001465, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8548480272293091, "num_tokens": 278363730.0, "step": 7297 }, { "epoch": 0.9283806131535428, "ewc_loss": 0.04800570756196976, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001956332562258467, "grad_norm": 5.757713317871094, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8615835905075073, "num_tokens": 278395493.0, "step": 7298 }, { "epoch": 0.9285078234321333, "ewc_loss": 0.04802640527486801, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019584021356422454, "grad_norm": 5.674627304077148, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.863303542137146, "num_tokens": 278433024.0, "step": 7299 }, { "epoch": 0.9286350337107239, "ewc_loss": 0.04797434061765671, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001953196042450145, "grad_norm": 5.717007160186768, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8613597750663757, "num_tokens": 278471311.0, "step": 7300 }, { "epoch": 0.9287622439893143, "ewc_loss": 0.04803679883480072, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019594414334278554, "grad_norm": 5.685247421264648, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8795570135116577, "num_tokens": 278509151.0, "step": 7301 }, { "epoch": 0.9288894542679048, "ewc_loss": 0.04803746938705444, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019595086632762104, "grad_norm": 5.7134552001953125, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8677308559417725, "num_tokens": 278548743.0, "step": 7302 }, { "epoch": 0.9290166645464953, "ewc_loss": 0.048061564564704895, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019619181693997234, "grad_norm": 5.7093024253845215, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8722086548805237, "num_tokens": 278590722.0, "step": 7303 }, { "epoch": 0.9291438748250859, "ewc_loss": 0.048066411167383194, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001962402748176828, "grad_norm": 5.762901306152344, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8479155898094177, "num_tokens": 278628150.0, "step": 7304 }, { "epoch": 0.9292710851036764, "ewc_loss": 0.048014312982559204, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019571930170059204, "grad_norm": 5.728682041168213, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8454679250717163, "num_tokens": 278672543.0, "step": 7305 }, { "epoch": 0.9293982953822669, "ewc_loss": 0.04800141602754593, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019559034262783825, "grad_norm": 5.735795974731445, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8693751096725464, "num_tokens": 278707234.0, "step": 7306 }, { "epoch": 0.9295255056608573, "ewc_loss": 0.04801347106695175, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001957108615897596, "grad_norm": 5.856241226196289, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8623067140579224, "num_tokens": 278734678.0, "step": 7307 }, { "epoch": 0.9296527159394479, "ewc_loss": 0.04798521846532822, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019542837981134653, "grad_norm": 5.672183990478516, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8706945180892944, "num_tokens": 278778284.0, "step": 7308 }, { "epoch": 0.9297799262180384, "ewc_loss": 0.04805529862642288, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019612918549682945, "grad_norm": 5.781497955322266, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8597954511642456, "num_tokens": 278815363.0, "step": 7309 }, { "epoch": 0.9299071364966289, "ewc_loss": 0.04798935726284981, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019546973635442555, "grad_norm": 5.7284955978393555, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8486875295639038, "num_tokens": 278860777.0, "step": 7310 }, { "epoch": 0.9300343467752195, "ewc_loss": 0.04807104915380478, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019628666632343084, "grad_norm": 5.8200225830078125, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8750659227371216, "num_tokens": 278890552.0, "step": 7311 }, { "epoch": 0.93016155705381, "ewc_loss": 0.04796352982521057, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019521148351486772, "grad_norm": 5.776395797729492, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8492048382759094, "num_tokens": 278924741.0, "step": 7312 }, { "epoch": 0.9302887673324005, "ewc_loss": 0.04803190380334854, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.000195895234355703, "grad_norm": 5.728805065155029, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8698904514312744, "num_tokens": 278965185.0, "step": 7313 }, { "epoch": 0.9304159776109909, "ewc_loss": 0.048000454902648926, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019558069470804185, "grad_norm": 5.78971004486084, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8502873182296753, "num_tokens": 278998961.0, "step": 7314 }, { "epoch": 0.9305431878895815, "ewc_loss": 0.04799602925777435, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001955364568857476, "grad_norm": 5.766719341278076, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8487247228622437, "num_tokens": 279032689.0, "step": 7315 }, { "epoch": 0.930670398168172, "ewc_loss": 0.047970738261938095, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019528355915099382, "grad_norm": 5.669748783111572, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8474786281585693, "num_tokens": 279080305.0, "step": 7316 }, { "epoch": 0.9307976084467625, "ewc_loss": 0.04803554713726044, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019593164324760437, "grad_norm": 5.743054389953613, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8610812425613403, "num_tokens": 279120581.0, "step": 7317 }, { "epoch": 0.930924818725353, "ewc_loss": 0.04799044877290726, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019548067939467728, "grad_norm": 5.699143886566162, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8726028203964233, "num_tokens": 279165050.0, "step": 7318 }, { "epoch": 0.9310520290039436, "ewc_loss": 0.047988615930080414, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001954623294295743, "grad_norm": 5.675283432006836, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8647787570953369, "num_tokens": 279209393.0, "step": 7319 }, { "epoch": 0.931179239282534, "ewc_loss": 0.04809608310461044, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019653700292110443, "grad_norm": 5.778414726257324, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8482990860939026, "num_tokens": 279250569.0, "step": 7320 }, { "epoch": 0.9313064495611245, "ewc_loss": 0.048029422760009766, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019587037968449295, "grad_norm": 5.687375068664551, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8662233948707581, "num_tokens": 279291154.0, "step": 7321 }, { "epoch": 0.931433659839715, "ewc_loss": 0.048039961606264114, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.000195975779206492, "grad_norm": 5.760693550109863, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8602381944656372, "num_tokens": 279326967.0, "step": 7322 }, { "epoch": 0.9315608701183056, "ewc_loss": 0.048045795410871506, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019603413238655776, "grad_norm": 5.71729850769043, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8736693859100342, "num_tokens": 279367129.0, "step": 7323 }, { "epoch": 0.9316880803968961, "ewc_loss": 0.0480453297495842, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019602946122176945, "grad_norm": 5.752216815948486, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8667114973068237, "num_tokens": 279403651.0, "step": 7324 }, { "epoch": 0.9318152906754866, "ewc_loss": 0.04803744703531265, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019595064804889262, "grad_norm": 5.887980937957764, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8540218472480774, "num_tokens": 279439166.0, "step": 7325 }, { "epoch": 0.931942500954077, "ewc_loss": 0.04799880087375641, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019556416373234242, "grad_norm": 5.7120280265808105, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8589417338371277, "num_tokens": 279473489.0, "step": 7326 }, { "epoch": 0.9320697112326676, "ewc_loss": 0.047973766922950745, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019531382713466883, "grad_norm": 5.725202560424805, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.864288330078125, "num_tokens": 279511709.0, "step": 7327 }, { "epoch": 0.9321969215112581, "ewc_loss": 0.047983311116695404, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019540925859473646, "grad_norm": 5.714011192321777, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.848771870136261, "num_tokens": 279551638.0, "step": 7328 }, { "epoch": 0.9323241317898486, "ewc_loss": 0.0480692982673645, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001962691603694111, "grad_norm": 5.752102375030518, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8722938895225525, "num_tokens": 279587304.0, "step": 7329 }, { "epoch": 0.9324513420684392, "ewc_loss": 0.047996461391448975, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001955408079084009, "grad_norm": 5.756866455078125, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8519591689109802, "num_tokens": 279628138.0, "step": 7330 }, { "epoch": 0.9325785523470297, "ewc_loss": 0.048050206154584885, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019607823924161494, "grad_norm": 5.737237930297852, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8649823665618896, "num_tokens": 279663064.0, "step": 7331 }, { "epoch": 0.9327057626256201, "ewc_loss": 0.048014454543590546, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019572072778828442, "grad_norm": 5.715261936187744, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8519986867904663, "num_tokens": 279704937.0, "step": 7332 }, { "epoch": 0.9328329729042106, "ewc_loss": 0.04805509001016617, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019612704636529088, "grad_norm": 5.781088829040527, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8631965517997742, "num_tokens": 279738215.0, "step": 7333 }, { "epoch": 0.9329601831828012, "ewc_loss": 0.04806876182556152, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019626377616077662, "grad_norm": 5.750045299530029, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8442690372467041, "num_tokens": 279771996.0, "step": 7334 }, { "epoch": 0.9330873934613917, "ewc_loss": 0.04804808273911476, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019605700799729675, "grad_norm": 5.704326152801514, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8749753832817078, "num_tokens": 279810037.0, "step": 7335 }, { "epoch": 0.9332146037399822, "ewc_loss": 0.04806336760520935, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019620986131485552, "grad_norm": 5.75604772567749, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8547412157058716, "num_tokens": 279847394.0, "step": 7336 }, { "epoch": 0.9333418140185727, "ewc_loss": 0.04806242138147354, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001962003589142114, "grad_norm": 5.773416519165039, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8459179401397705, "num_tokens": 279879057.0, "step": 7337 }, { "epoch": 0.9334690242971632, "ewc_loss": 0.048120543360710144, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019678162061609328, "grad_norm": 5.818941116333008, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8544197082519531, "num_tokens": 279911775.0, "step": 7338 }, { "epoch": 0.9335962345757537, "ewc_loss": 0.04827665537595749, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019590131705626845, "grad_norm": 5.725086688995361, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8629339933395386, "num_tokens": 279947661.0, "step": 7339 }, { "epoch": 0.9337234448543442, "ewc_loss": 0.04812202975153923, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019679647812154144, "grad_norm": 5.852772235870361, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8618822693824768, "num_tokens": 279977507.0, "step": 7340 }, { "epoch": 0.9338506551329347, "ewc_loss": 0.04804317280650139, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.000196007895283401, "grad_norm": 5.630335330963135, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8567752838134766, "num_tokens": 280016749.0, "step": 7341 }, { "epoch": 0.9339778654115253, "ewc_loss": 0.048170097172260284, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019727715698536485, "grad_norm": 5.790716648101807, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8624752759933472, "num_tokens": 280056092.0, "step": 7342 }, { "epoch": 0.9341050756901158, "ewc_loss": 0.04815031960606575, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001970793673535809, "grad_norm": 5.701255798339844, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8723632097244263, "num_tokens": 280089803.0, "step": 7343 }, { "epoch": 0.9342322859687062, "ewc_loss": 0.048153407871723175, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001971102465176955, "grad_norm": 5.761649131774902, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8491259217262268, "num_tokens": 280126190.0, "step": 7344 }, { "epoch": 0.9343594962472968, "ewc_loss": 0.04814692959189415, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001970454613910988, "grad_norm": 5.727319717407227, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8716475963592529, "num_tokens": 280162743.0, "step": 7345 }, { "epoch": 0.9344867065258873, "ewc_loss": 0.04809316247701645, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019650778267532587, "grad_norm": 5.714609146118164, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8605388402938843, "num_tokens": 280203676.0, "step": 7346 }, { "epoch": 0.9346139168044778, "ewc_loss": 0.04816185683012009, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019719473493751138, "grad_norm": 5.732977867126465, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8771696090698242, "num_tokens": 280241765.0, "step": 7347 }, { "epoch": 0.9347411270830683, "ewc_loss": 0.04807140305638313, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019629020243883133, "grad_norm": 5.7972869873046875, "learning_rate": 1e-06, "loss": 0.5565, "mean_token_accuracy": 0.83974289894104, "num_tokens": 280280103.0, "step": 7348 }, { "epoch": 0.9348683373616589, "ewc_loss": 0.04813252389431, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001969014119822532, "grad_norm": 5.811554908752441, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8436218500137329, "num_tokens": 280314674.0, "step": 7349 }, { "epoch": 0.9349955476402493, "ewc_loss": 0.048057787120342255, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019615406927186996, "grad_norm": 5.669647216796875, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8623676300048828, "num_tokens": 280353890.0, "step": 7350 }, { "epoch": 0.9351227579188398, "ewc_loss": 0.048072442412376404, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019630062161013484, "grad_norm": 5.76154088973999, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8532025814056396, "num_tokens": 280396403.0, "step": 7351 }, { "epoch": 0.9352499681974303, "ewc_loss": 0.04806990921497345, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019627524306997657, "grad_norm": 5.7203755378723145, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8608043193817139, "num_tokens": 280436154.0, "step": 7352 }, { "epoch": 0.9353771784760209, "ewc_loss": 0.048028551042079926, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019586166308727115, "grad_norm": 5.732702255249023, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8674864768981934, "num_tokens": 280476480.0, "step": 7353 }, { "epoch": 0.9355043887546114, "ewc_loss": 0.04802509397268295, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019582713139243424, "grad_norm": 5.794819355010986, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8648192286491394, "num_tokens": 280505976.0, "step": 7354 }, { "epoch": 0.9356315990332019, "ewc_loss": 0.048033714294433594, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019591333693824708, "grad_norm": 5.778202056884766, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8572922348976135, "num_tokens": 280548365.0, "step": 7355 }, { "epoch": 0.9357588093117923, "ewc_loss": 0.04821265861392021, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019526135292835534, "grad_norm": 5.706528663635254, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8791080117225647, "num_tokens": 280588074.0, "step": 7356 }, { "epoch": 0.9358860195903829, "ewc_loss": 0.04809737205505371, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019532919395714998, "grad_norm": 5.70960807800293, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.869929313659668, "num_tokens": 280626915.0, "step": 7357 }, { "epoch": 0.9360132298689734, "ewc_loss": 0.048110246658325195, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019545793475117534, "grad_norm": 5.781871318817139, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8568326234817505, "num_tokens": 280660088.0, "step": 7358 }, { "epoch": 0.9361404401475639, "ewc_loss": 0.048115700483322144, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001955124898813665, "grad_norm": 5.7191596031188965, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8586097955703735, "num_tokens": 280701326.0, "step": 7359 }, { "epoch": 0.9362676504261545, "ewc_loss": 0.0481061190366745, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019541665096767247, "grad_norm": 5.702971935272217, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8706496357917786, "num_tokens": 280740256.0, "step": 7360 }, { "epoch": 0.936394860704745, "ewc_loss": 0.048127200454473495, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019562747911550105, "grad_norm": 5.755608081817627, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8750476241111755, "num_tokens": 280773515.0, "step": 7361 }, { "epoch": 0.9365220709833355, "ewc_loss": 0.048169396817684174, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019604945555329323, "grad_norm": 5.738495349884033, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8361436128616333, "num_tokens": 280817207.0, "step": 7362 }, { "epoch": 0.9366492812619259, "ewc_loss": 0.048135362565517426, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019570910080801696, "grad_norm": 5.71816349029541, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8479634523391724, "num_tokens": 280860253.0, "step": 7363 }, { "epoch": 0.9367764915405165, "ewc_loss": 0.048283651471138, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019597129721660167, "grad_norm": 5.786423206329346, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8578174114227295, "num_tokens": 280892634.0, "step": 7364 }, { "epoch": 0.936903701819107, "ewc_loss": 0.04816422238945961, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019599769439082593, "grad_norm": 5.702349662780762, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8698868751525879, "num_tokens": 280930912.0, "step": 7365 }, { "epoch": 0.9370309120976975, "ewc_loss": 0.04818373918533325, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019619283557403833, "grad_norm": 5.726922988891602, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8639324307441711, "num_tokens": 280971588.0, "step": 7366 }, { "epoch": 0.937158122376288, "ewc_loss": 0.04821094125509262, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019646486907731742, "grad_norm": 5.7808380126953125, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.868804931640625, "num_tokens": 281009898.0, "step": 7367 }, { "epoch": 0.9372853326548786, "ewc_loss": 0.04816742241382599, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019602970860432833, "grad_norm": 5.711486339569092, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8531331419944763, "num_tokens": 281046726.0, "step": 7368 }, { "epoch": 0.937412542933469, "ewc_loss": 0.04831888526678085, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019632359908428043, "grad_norm": 5.708188056945801, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8744960427284241, "num_tokens": 281085163.0, "step": 7369 }, { "epoch": 0.9375397532120595, "ewc_loss": 0.04812411963939667, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019681734556797892, "grad_norm": 5.729034900665283, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8665564656257629, "num_tokens": 281128660.0, "step": 7370 }, { "epoch": 0.93766696349065, "ewc_loss": 0.048140957951545715, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019698574033100158, "grad_norm": 5.800009727478027, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8432459831237793, "num_tokens": 281163780.0, "step": 7371 }, { "epoch": 0.9377941737692406, "ewc_loss": 0.04810629040002823, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019663908460643142, "grad_norm": 5.745045185089111, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8692574501037598, "num_tokens": 281204247.0, "step": 7372 }, { "epoch": 0.9379213840478311, "ewc_loss": 0.04809766262769699, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001965527917491272, "grad_norm": 5.739752769470215, "learning_rate": 1e-06, "loss": 0.3841, "mean_token_accuracy": 0.8853007555007935, "num_tokens": 281239518.0, "step": 7373 }, { "epoch": 0.9380485943264216, "ewc_loss": 0.048096805810928345, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001965442206710577, "grad_norm": 5.748567581176758, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.869492769241333, "num_tokens": 281275916.0, "step": 7374 }, { "epoch": 0.938175804605012, "ewc_loss": 0.04807719960808754, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019634816271718591, "grad_norm": 5.752359867095947, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8712799549102783, "num_tokens": 281307615.0, "step": 7375 }, { "epoch": 0.9383030148836026, "ewc_loss": 0.04832674190402031, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001964021794265136, "grad_norm": 5.774260520935059, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8660674095153809, "num_tokens": 281342677.0, "step": 7376 }, { "epoch": 0.9384302251621931, "ewc_loss": 0.04811602830886841, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001967364369193092, "grad_norm": 5.788740158081055, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8562287092208862, "num_tokens": 281379545.0, "step": 7377 }, { "epoch": 0.9385574354407836, "ewc_loss": 0.048089876770973206, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.00019647492445074022, "grad_norm": 5.708291053771973, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8426530361175537, "num_tokens": 281424169.0, "step": 7378 }, { "epoch": 0.9386846457193742, "ewc_loss": 0.04806821793317795, "ewc_loss_diag": 2.849102020263672e-05, "ewc_loss_parallel": 0.0001962583774002269, "grad_norm": 5.739121913909912, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8813174962997437, "num_tokens": 281461306.0, "step": 7379 }, { "epoch": 0.9388118559979647, "ewc_loss": 0.048195019364356995, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019630567112471908, "grad_norm": 5.675701141357422, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8757511377334595, "num_tokens": 281498753.0, "step": 7380 }, { "epoch": 0.9389390662765551, "ewc_loss": 0.04825346916913986, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019689014879986644, "grad_norm": 5.733945369720459, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8707976341247559, "num_tokens": 281537163.0, "step": 7381 }, { "epoch": 0.9390662765551456, "ewc_loss": 0.04819389060139656, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019629437883850187, "grad_norm": 5.7407073974609375, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8522495031356812, "num_tokens": 281577926.0, "step": 7382 }, { "epoch": 0.9391934868337362, "ewc_loss": 0.048228755593299866, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019664301362354308, "grad_norm": 5.7034525871276855, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8800901174545288, "num_tokens": 281621283.0, "step": 7383 }, { "epoch": 0.9393206971123267, "ewc_loss": 0.048214852809906006, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019650401372928172, "grad_norm": 5.716090202331543, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8646637797355652, "num_tokens": 281658336.0, "step": 7384 }, { "epoch": 0.9394479073909172, "ewc_loss": 0.04823184013366699, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019667389278765768, "grad_norm": 5.709983825683594, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8651418685913086, "num_tokens": 281695137.0, "step": 7385 }, { "epoch": 0.9395751176695077, "ewc_loss": 0.048240453004837036, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019675999647006392, "grad_norm": 5.676862716674805, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8757107257843018, "num_tokens": 281733570.0, "step": 7386 }, { "epoch": 0.9397023279480982, "ewc_loss": 0.04832380637526512, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019759353017434478, "grad_norm": 5.816812515258789, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8637338280677795, "num_tokens": 281771163.0, "step": 7387 }, { "epoch": 0.9398295382266887, "ewc_loss": 0.04827231168746948, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019707858155015856, "grad_norm": 5.690797328948975, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8639371395111084, "num_tokens": 281813715.0, "step": 7388 }, { "epoch": 0.9399567485052792, "ewc_loss": 0.048314303159713745, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019749847706407309, "grad_norm": 5.78001594543457, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.859564483165741, "num_tokens": 281856921.0, "step": 7389 }, { "epoch": 0.9400839587838697, "ewc_loss": 0.04819731414318085, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019632863404694945, "grad_norm": 5.748668670654297, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8527930974960327, "num_tokens": 281893273.0, "step": 7390 }, { "epoch": 0.9402111690624603, "ewc_loss": 0.04825102537870407, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019686574523802847, "grad_norm": 5.76967716217041, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8658984303474426, "num_tokens": 281927452.0, "step": 7391 }, { "epoch": 0.9403383793410508, "ewc_loss": 0.0482402965426445, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019675841031130403, "grad_norm": 5.676708698272705, "learning_rate": 1e-06, "loss": 0.3698, "mean_token_accuracy": 0.886782705783844, "num_tokens": 281965359.0, "step": 7392 }, { "epoch": 0.9404655896196412, "ewc_loss": 0.0482892245054245, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001972477330127731, "grad_norm": 5.815288066864014, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8590757846832275, "num_tokens": 281999209.0, "step": 7393 }, { "epoch": 0.9405927998982317, "ewc_loss": 0.04825780540704727, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019693354261107743, "grad_norm": 5.711076259613037, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8522718548774719, "num_tokens": 282042129.0, "step": 7394 }, { "epoch": 0.9407200101768223, "ewc_loss": 0.04828470200300217, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019720246200449765, "grad_norm": 5.764042854309082, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8525245189666748, "num_tokens": 282087941.0, "step": 7395 }, { "epoch": 0.9408472204554128, "ewc_loss": 0.04828634113073349, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019721887656487525, "grad_norm": 5.783482551574707, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8663680553436279, "num_tokens": 282123130.0, "step": 7396 }, { "epoch": 0.9409744307340033, "ewc_loss": 0.04824677109718323, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019682319543790072, "grad_norm": 5.758983612060547, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8572700023651123, "num_tokens": 282161142.0, "step": 7397 }, { "epoch": 0.9411016410125939, "ewc_loss": 0.048394858837127686, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019708335457835346, "grad_norm": 5.75379753112793, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8843832612037659, "num_tokens": 282196363.0, "step": 7398 }, { "epoch": 0.9412288512911843, "ewc_loss": 0.048388730734586716, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001970220764633268, "grad_norm": 5.8087897300720215, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8426881432533264, "num_tokens": 282233202.0, "step": 7399 }, { "epoch": 0.9413560615697748, "ewc_loss": 0.04920013248920441, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00019659119425341487, "grad_norm": 35.71591567993164, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8624906539916992, "num_tokens": 282270976.0, "step": 7400 }, { "epoch": 0.9414832718483653, "ewc_loss": 0.06794657558202744, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.0003913798136636615, "grad_norm": 11.073311805725098, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8712038397789001, "num_tokens": 282312223.0, "step": 7401 }, { "epoch": 0.9416104821269559, "ewc_loss": 0.05485297366976738, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00026166450697928667, "grad_norm": 5.9755401611328125, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.8448968529701233, "num_tokens": 282350418.0, "step": 7402 }, { "epoch": 0.9417376924055464, "ewc_loss": 0.05368310958147049, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002499658730812371, "grad_norm": 7.302571773529053, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8621124029159546, "num_tokens": 282380751.0, "step": 7403 }, { "epoch": 0.9418649026841369, "ewc_loss": 0.058509744703769684, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002982321893796325, "grad_norm": 7.539618015289307, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8729256391525269, "num_tokens": 282420248.0, "step": 7404 }, { "epoch": 0.9419921129627273, "ewc_loss": 0.05193152278661728, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00023367068206425756, "grad_norm": 6.063706874847412, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8662174940109253, "num_tokens": 282457951.0, "step": 7405 }, { "epoch": 0.9421193232413179, "ewc_loss": 0.05204112082719803, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00023354600125458091, "grad_norm": 6.778571605682373, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8615555763244629, "num_tokens": 282507379.0, "step": 7406 }, { "epoch": 0.9422465335199084, "ewc_loss": 0.05295030027627945, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00024263774685095996, "grad_norm": 6.4869561195373535, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8501695990562439, "num_tokens": 282546905.0, "step": 7407 }, { "epoch": 0.9423737437984989, "ewc_loss": 0.050410568714141846, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00021846118033863604, "grad_norm": 6.183478832244873, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8329117298126221, "num_tokens": 282591368.0, "step": 7408 }, { "epoch": 0.9425009540770894, "ewc_loss": 0.050640106201171875, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00022075652668718249, "grad_norm": 6.357253551483154, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8608881235122681, "num_tokens": 282617331.0, "step": 7409 }, { "epoch": 0.94262816435568, "ewc_loss": 0.050243474543094635, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00021679022756870836, "grad_norm": 6.108443737030029, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8752007484436035, "num_tokens": 282652346.0, "step": 7410 }, { "epoch": 0.9427553746342705, "ewc_loss": 0.04964315891265869, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00021078706777188927, "grad_norm": 6.068761825561523, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.866182267665863, "num_tokens": 282691564.0, "step": 7411 }, { "epoch": 0.9428825849128609, "ewc_loss": 0.04953896999359131, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0002097451506415382, "grad_norm": 6.040354251861572, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8719708323478699, "num_tokens": 282728186.0, "step": 7412 }, { "epoch": 0.9430097951914514, "ewc_loss": 0.0492648109793663, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020700359891634434, "grad_norm": 6.026030540466309, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8550416231155396, "num_tokens": 282770491.0, "step": 7413 }, { "epoch": 0.943137005470042, "ewc_loss": 0.04902860149741173, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020464148838073015, "grad_norm": 5.891770362854004, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8552436828613281, "num_tokens": 282809824.0, "step": 7414 }, { "epoch": 0.9432642157486325, "ewc_loss": 0.04889407753944397, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.000203296251129359, "grad_norm": 5.974503517150879, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8672510385513306, "num_tokens": 282846735.0, "step": 7415 }, { "epoch": 0.943391426027223, "ewc_loss": 0.04877031221985817, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020205858163535595, "grad_norm": 5.87239408493042, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8804399371147156, "num_tokens": 282882242.0, "step": 7416 }, { "epoch": 0.9435186363058136, "ewc_loss": 0.04860062524676323, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020036171190440655, "grad_norm": 5.878366470336914, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8503763675689697, "num_tokens": 282918774.0, "step": 7417 }, { "epoch": 0.943645846584404, "ewc_loss": 0.048585131764411926, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020020680676680058, "grad_norm": 5.8651018142700195, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.853726863861084, "num_tokens": 282957766.0, "step": 7418 }, { "epoch": 0.9437730568629945, "ewc_loss": 0.04852570593357086, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019961253565270454, "grad_norm": 5.909402847290039, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8502957820892334, "num_tokens": 282995980.0, "step": 7419 }, { "epoch": 0.943900267141585, "ewc_loss": 0.04842450097203255, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019860047905240208, "grad_norm": 5.80783224105835, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8623729348182678, "num_tokens": 283033148.0, "step": 7420 }, { "epoch": 0.9440274774201756, "ewc_loss": 0.048475466668605804, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001991101453313604, "grad_norm": 5.903679370880127, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8657064437866211, "num_tokens": 283069781.0, "step": 7421 }, { "epoch": 0.9441546876987661, "ewc_loss": 0.04839252680540085, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019828071526717395, "grad_norm": 5.83012580871582, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8541049957275391, "num_tokens": 283107270.0, "step": 7422 }, { "epoch": 0.9442818979773566, "ewc_loss": 0.04833751171827316, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001977305655600503, "grad_norm": 5.881882667541504, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8733073472976685, "num_tokens": 283143571.0, "step": 7423 }, { "epoch": 0.944409108255947, "ewc_loss": 0.04830306023359299, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001973860926227644, "grad_norm": 5.820398330688477, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8660776615142822, "num_tokens": 283179995.0, "step": 7424 }, { "epoch": 0.9445363185345376, "ewc_loss": 0.048299942165613174, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001973548933165148, "grad_norm": 5.7714762687683105, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8616955876350403, "num_tokens": 283218370.0, "step": 7425 }, { "epoch": 0.9446635288131281, "ewc_loss": 0.048289306461811066, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019724853336811066, "grad_norm": 5.944645404815674, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8556843400001526, "num_tokens": 283247879.0, "step": 7426 }, { "epoch": 0.9447907390917186, "ewc_loss": 0.048320040106773376, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019755585526581854, "grad_norm": 5.774012088775635, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8767926692962646, "num_tokens": 283285027.0, "step": 7427 }, { "epoch": 0.9449179493703092, "ewc_loss": 0.04825863987207413, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019694185175467283, "grad_norm": 5.810328483581543, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8652496337890625, "num_tokens": 283324934.0, "step": 7428 }, { "epoch": 0.9450451596488997, "ewc_loss": 0.048281192779541016, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019716737733688205, "grad_norm": 5.797195911407471, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8693646192550659, "num_tokens": 283357540.0, "step": 7429 }, { "epoch": 0.9451723699274901, "ewc_loss": 0.048289284110069275, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019724831508938223, "grad_norm": 5.85940408706665, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8599098920822144, "num_tokens": 283393470.0, "step": 7430 }, { "epoch": 0.9452995802060806, "ewc_loss": 0.048348333686590195, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001978388027055189, "grad_norm": 5.776500701904297, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8623389005661011, "num_tokens": 283429916.0, "step": 7431 }, { "epoch": 0.9454267904846712, "ewc_loss": 0.04831317067146301, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019748717022594064, "grad_norm": 5.792454242706299, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8495649099349976, "num_tokens": 283467997.0, "step": 7432 }, { "epoch": 0.9455540007632617, "ewc_loss": 0.04836048185825348, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019796031119767576, "grad_norm": 5.907543659210205, "learning_rate": 1e-06, "loss": 0.5583, "mean_token_accuracy": 0.826729953289032, "num_tokens": 283504741.0, "step": 7433 }, { "epoch": 0.9456812110418522, "ewc_loss": 0.04833872616291046, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019774273096118122, "grad_norm": 5.841701984405518, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8712416887283325, "num_tokens": 283534535.0, "step": 7434 }, { "epoch": 0.9458084213204427, "ewc_loss": 0.04833013564348221, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001976568455575034, "grad_norm": 5.814877986907959, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.872806191444397, "num_tokens": 283571600.0, "step": 7435 }, { "epoch": 0.9459356315990332, "ewc_loss": 0.04830916225910187, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019744710880331695, "grad_norm": 5.788179874420166, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8480926156044006, "num_tokens": 283605662.0, "step": 7436 }, { "epoch": 0.9460628418776237, "ewc_loss": 0.048346273601055145, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019781819719355553, "grad_norm": 5.783119201660156, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8628184795379639, "num_tokens": 283642868.0, "step": 7437 }, { "epoch": 0.9461900521562142, "ewc_loss": 0.04833553358912468, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001977108040591702, "grad_norm": 5.822118282318115, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8515984416007996, "num_tokens": 283675604.0, "step": 7438 }, { "epoch": 0.9463172624348047, "ewc_loss": 0.04847963899374008, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019793116371147335, "grad_norm": 5.727276802062988, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8822631239891052, "num_tokens": 283709459.0, "step": 7439 }, { "epoch": 0.9464444727133953, "ewc_loss": 0.04847956448793411, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019793040701188147, "grad_norm": 5.7810378074646, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8614745140075684, "num_tokens": 283748020.0, "step": 7440 }, { "epoch": 0.9465716829919858, "ewc_loss": 0.0485033243894577, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019816801068373024, "grad_norm": 5.806965351104736, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8557795882225037, "num_tokens": 283778514.0, "step": 7441 }, { "epoch": 0.9466988932705762, "ewc_loss": 0.048386119306087494, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019821668684016913, "grad_norm": 5.7669291496276855, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8652463555335999, "num_tokens": 283819764.0, "step": 7442 }, { "epoch": 0.9468261035491667, "ewc_loss": 0.04835920035839081, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019794749096035957, "grad_norm": 5.8345046043396, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8471169471740723, "num_tokens": 283853897.0, "step": 7443 }, { "epoch": 0.9469533138277573, "ewc_loss": 0.04838431626558304, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001981986133614555, "grad_norm": 5.701241970062256, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8755091428756714, "num_tokens": 283891826.0, "step": 7444 }, { "epoch": 0.9470805241063478, "ewc_loss": 0.04838218539953232, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019817733846139163, "grad_norm": 5.814033031463623, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8609967231750488, "num_tokens": 283931690.0, "step": 7445 }, { "epoch": 0.9472077343849383, "ewc_loss": 0.04850100725889206, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019814482948277146, "grad_norm": 5.733470439910889, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8624471426010132, "num_tokens": 283971107.0, "step": 7446 }, { "epoch": 0.9473349446635289, "ewc_loss": 0.048445895314216614, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019881443586200476, "grad_norm": 5.848483562469482, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8679440021514893, "num_tokens": 284005286.0, "step": 7447 }, { "epoch": 0.9474621549421193, "ewc_loss": 0.048532597720623016, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001984607515623793, "grad_norm": 5.758361339569092, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8758232593536377, "num_tokens": 284044089.0, "step": 7448 }, { "epoch": 0.9475893652207098, "ewc_loss": 0.04840827360749245, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019843819609377533, "grad_norm": 5.8541107177734375, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8563855886459351, "num_tokens": 284081172.0, "step": 7449 }, { "epoch": 0.9477165754993003, "ewc_loss": 0.048395536839962006, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019831082317978144, "grad_norm": 5.819681644439697, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.869557797908783, "num_tokens": 284113362.0, "step": 7450 }, { "epoch": 0.9478437857778909, "ewc_loss": 0.04837258160114288, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019808128126896918, "grad_norm": 5.81455659866333, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8711855411529541, "num_tokens": 284155272.0, "step": 7451 }, { "epoch": 0.9479709960564814, "ewc_loss": 0.048305533826351166, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001974108163267374, "grad_norm": 5.811000347137451, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8684276342391968, "num_tokens": 284188218.0, "step": 7452 }, { "epoch": 0.9480982063350719, "ewc_loss": 0.04843432083725929, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019747797341551632, "grad_norm": 9.296571731567383, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8653567433357239, "num_tokens": 284228838.0, "step": 7453 }, { "epoch": 0.9482254166136623, "ewc_loss": 0.052262015640735626, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00023697561118751764, "grad_norm": 6.335172653198242, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8595216274261475, "num_tokens": 284265570.0, "step": 7454 }, { "epoch": 0.9483526268922529, "ewc_loss": 0.04730813577771187, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00018743681721389294, "grad_norm": 5.62266206741333, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8530237078666687, "num_tokens": 284307062.0, "step": 7455 }, { "epoch": 0.9484798371708434, "ewc_loss": 0.04897955432534218, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020293031411711127, "grad_norm": 6.046032905578613, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8603565692901611, "num_tokens": 284340488.0, "step": 7456 }, { "epoch": 0.9486070474494339, "ewc_loss": 0.04847995191812515, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019915500888600945, "grad_norm": 5.744975566864014, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8609224557876587, "num_tokens": 284382399.0, "step": 7457 }, { "epoch": 0.9487342577280244, "ewc_loss": 0.048498306423425674, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001981178211281076, "grad_norm": 6.072700023651123, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8479886054992676, "num_tokens": 284416391.0, "step": 7458 }, { "epoch": 0.948861468006615, "ewc_loss": 0.048603303730487823, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019916782912332565, "grad_norm": 5.738593578338623, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8618689775466919, "num_tokens": 284457049.0, "step": 7459 }, { "epoch": 0.9489886782852054, "ewc_loss": 0.04852508008480072, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019838557636830956, "grad_norm": 5.870452880859375, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.859968900680542, "num_tokens": 284494426.0, "step": 7460 }, { "epoch": 0.9491158885637959, "ewc_loss": 0.048659853637218475, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019973331654909998, "grad_norm": 5.791933536529541, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8801099061965942, "num_tokens": 284533349.0, "step": 7461 }, { "epoch": 0.9492430988423864, "ewc_loss": 0.04853339493274689, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.000198468696908094, "grad_norm": 5.902132511138916, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8683153390884399, "num_tokens": 284567777.0, "step": 7462 }, { "epoch": 0.949370309120977, "ewc_loss": 0.04842786490917206, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001986341376323253, "grad_norm": 5.800708770751953, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8547215461730957, "num_tokens": 284609120.0, "step": 7463 }, { "epoch": 0.9494975193995675, "ewc_loss": 0.048474013805389404, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019787487690337002, "grad_norm": 5.8136887550354, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8535708785057068, "num_tokens": 284649221.0, "step": 7464 }, { "epoch": 0.949624729678158, "ewc_loss": 0.048352427780628204, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019787976634688675, "grad_norm": 5.838607311248779, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.867933988571167, "num_tokens": 284682595.0, "step": 7465 }, { "epoch": 0.9497519399567486, "ewc_loss": 0.04832589626312256, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019761441217269748, "grad_norm": 5.756738662719727, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8583414554595947, "num_tokens": 284722903.0, "step": 7466 }, { "epoch": 0.949879150235339, "ewc_loss": 0.04834125190973282, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019776797853410244, "grad_norm": 5.815323829650879, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8665271997451782, "num_tokens": 284759166.0, "step": 7467 }, { "epoch": 0.9500063605139295, "ewc_loss": 0.0483182892203331, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019753837841562927, "grad_norm": 5.762571334838867, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8580945730209351, "num_tokens": 284803192.0, "step": 7468 }, { "epoch": 0.95013357079252, "ewc_loss": 0.04844682663679123, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019760301802307367, "grad_norm": 5.80739164352417, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.863983690738678, "num_tokens": 284837088.0, "step": 7469 }, { "epoch": 0.9502607810711106, "ewc_loss": 0.04843129217624664, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019744769087992609, "grad_norm": 9.277422904968262, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.8425778150558472, "num_tokens": 284879641.0, "step": 7470 }, { "epoch": 0.9503879913497011, "ewc_loss": 0.05219502002000809, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00023508498270530254, "grad_norm": 6.2595133781433105, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8508248925209045, "num_tokens": 284917519.0, "step": 7471 }, { "epoch": 0.9505152016282916, "ewc_loss": 0.04746072739362717, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00018774201453197747, "grad_norm": 5.705997943878174, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8521199226379395, "num_tokens": 284950422.0, "step": 7472 }, { "epoch": 0.950642411906882, "ewc_loss": 0.04879136383533478, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0002022690896410495, "grad_norm": 5.960493087768555, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8684045076370239, "num_tokens": 284986392.0, "step": 7473 }, { "epoch": 0.9507696221854726, "ewc_loss": 0.04850427806377411, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019817757129203528, "grad_norm": 5.777152061462402, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8634361028671265, "num_tokens": 285027684.0, "step": 7474 }, { "epoch": 0.9508968324640631, "ewc_loss": 0.04848418012261391, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019797656568698585, "grad_norm": 5.905677795410156, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8530358672142029, "num_tokens": 285059551.0, "step": 7475 }, { "epoch": 0.9510240427426536, "ewc_loss": 0.04846058785915375, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019774065003730357, "grad_norm": 5.711887836456299, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8629640340805054, "num_tokens": 285100197.0, "step": 7476 }, { "epoch": 0.9511512530212441, "ewc_loss": 0.04847170412540436, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019785179756581783, "grad_norm": 5.939811706542969, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.862114429473877, "num_tokens": 285130115.0, "step": 7477 }, { "epoch": 0.9512784632998347, "ewc_loss": 0.04854759946465492, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019861076725646853, "grad_norm": 5.743386268615723, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8631608486175537, "num_tokens": 285169069.0, "step": 7478 }, { "epoch": 0.9514056735784251, "ewc_loss": 0.04845932126045227, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019772800442297012, "grad_norm": 5.811405181884766, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.8420267105102539, "num_tokens": 285209971.0, "step": 7479 }, { "epoch": 0.9515328838570156, "ewc_loss": 0.04847513884305954, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019788614008575678, "grad_norm": 5.7476487159729, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8571943044662476, "num_tokens": 285250024.0, "step": 7480 }, { "epoch": 0.9516600941356061, "ewc_loss": 0.04853520542383194, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001984868140425533, "grad_norm": 5.8393120765686035, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8580101728439331, "num_tokens": 285284900.0, "step": 7481 }, { "epoch": 0.9517873044141967, "ewc_loss": 0.04841964319348335, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.000197331202798523, "grad_norm": 5.751425266265869, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8667355179786682, "num_tokens": 285325657.0, "step": 7482 }, { "epoch": 0.9519145146927872, "ewc_loss": 0.04838148504495621, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001981702953344211, "grad_norm": 5.861469268798828, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8616242408752441, "num_tokens": 285356362.0, "step": 7483 }, { "epoch": 0.9520417249713777, "ewc_loss": 0.0483877956867218, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019823340699076653, "grad_norm": 5.81711483001709, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8618434071540833, "num_tokens": 285394321.0, "step": 7484 }, { "epoch": 0.9521689352499682, "ewc_loss": 0.04833314195275307, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019768688071053475, "grad_norm": 5.728796482086182, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8810145258903503, "num_tokens": 285435260.0, "step": 7485 }, { "epoch": 0.9522961455285587, "ewc_loss": 0.04847842454910278, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001979190274141729, "grad_norm": 5.706989765167236, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8692212700843811, "num_tokens": 285479449.0, "step": 7486 }, { "epoch": 0.9524233558071492, "ewc_loss": 0.04855320602655411, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019866680668201298, "grad_norm": 5.852484703063965, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8416595458984375, "num_tokens": 285520492.0, "step": 7487 }, { "epoch": 0.9525505660857397, "ewc_loss": 0.04850593954324722, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001981941459234804, "grad_norm": 5.783036708831787, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8584394454956055, "num_tokens": 285558410.0, "step": 7488 }, { "epoch": 0.9526777763643303, "ewc_loss": 0.04858625680208206, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001989973388845101, "grad_norm": 5.7958149909973145, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8420769572257996, "num_tokens": 285598871.0, "step": 7489 }, { "epoch": 0.9528049866429208, "ewc_loss": 0.04853196442127228, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019845442147925496, "grad_norm": 5.802489280700684, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8541881442070007, "num_tokens": 285641374.0, "step": 7490 }, { "epoch": 0.9529321969215112, "ewc_loss": 0.04854812100529671, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001986159768421203, "grad_norm": 5.8012189865112305, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8723733425140381, "num_tokens": 285675416.0, "step": 7491 }, { "epoch": 0.9530594072001017, "ewc_loss": 0.04847420006990433, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019787673954851925, "grad_norm": 5.798355579376221, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8671726584434509, "num_tokens": 285707491.0, "step": 7492 }, { "epoch": 0.9531866174786923, "ewc_loss": 0.04853762686252594, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019851104298140854, "grad_norm": 5.817159175872803, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8512450456619263, "num_tokens": 285747571.0, "step": 7493 }, { "epoch": 0.9533138277572828, "ewc_loss": 0.0483650267124176, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019800575682893395, "grad_norm": 5.7199177742004395, "learning_rate": 1e-06, "loss": 0.3827, "mean_token_accuracy": 0.8848263621330261, "num_tokens": 285786362.0, "step": 7494 }, { "epoch": 0.9534410380358733, "ewc_loss": 0.048390649259090424, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019826195784844458, "grad_norm": 5.753493785858154, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8689820170402527, "num_tokens": 285829401.0, "step": 7495 }, { "epoch": 0.9535682483144639, "ewc_loss": 0.048402875661849976, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001983842084882781, "grad_norm": 5.751089572906494, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8596140146255493, "num_tokens": 285867853.0, "step": 7496 }, { "epoch": 0.9536954585930543, "ewc_loss": 0.048530563712120056, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001984403788810596, "grad_norm": 5.77719783782959, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8725773096084595, "num_tokens": 285908497.0, "step": 7497 }, { "epoch": 0.9538226688716448, "ewc_loss": 0.04842820763587952, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019863757188431919, "grad_norm": 5.848422050476074, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8476800918579102, "num_tokens": 285940528.0, "step": 7498 }, { "epoch": 0.9539498791502353, "ewc_loss": 0.04841819778084755, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019853745470754802, "grad_norm": 5.772860527038574, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8620744943618774, "num_tokens": 285980206.0, "step": 7499 }, { "epoch": 0.9540770894288259, "ewc_loss": 0.04853755235671997, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019851031538564712, "grad_norm": 5.872913837432861, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8571408987045288, "num_tokens": 286019409.0, "step": 7500 }, { "epoch": 0.9542042997074164, "ewc_loss": 0.048531800508499146, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019845279166474938, "grad_norm": 5.7692389488220215, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8706612586975098, "num_tokens": 286050955.0, "step": 7501 }, { "epoch": 0.9543315099860069, "ewc_loss": 0.04849216341972351, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019805642659775913, "grad_norm": 5.794045448303223, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8676975965499878, "num_tokens": 286087567.0, "step": 7502 }, { "epoch": 0.9544587202645973, "ewc_loss": 0.048514533787965775, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019828010408673435, "grad_norm": 5.893421173095703, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.8407247066497803, "num_tokens": 286126893.0, "step": 7503 }, { "epoch": 0.9545859305431879, "ewc_loss": 0.04849716275930405, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019810636877082288, "grad_norm": 5.727884769439697, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.855872392654419, "num_tokens": 286165818.0, "step": 7504 }, { "epoch": 0.9547131408217784, "ewc_loss": 0.048385411500930786, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019820960005745292, "grad_norm": 5.770807266235352, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8637837171554565, "num_tokens": 286208026.0, "step": 7505 }, { "epoch": 0.9548403511003689, "ewc_loss": 0.04854381084442139, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019857285951729864, "grad_norm": 5.801395416259766, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8611449003219604, "num_tokens": 286245468.0, "step": 7506 }, { "epoch": 0.9549675613789594, "ewc_loss": 0.04843725636601448, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019872804114129394, "grad_norm": 5.833593368530273, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8655379414558411, "num_tokens": 286283955.0, "step": 7507 }, { "epoch": 0.95509477165755, "ewc_loss": 0.04848172515630722, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019917271856684238, "grad_norm": 5.7614545822143555, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8673977851867676, "num_tokens": 286326560.0, "step": 7508 }, { "epoch": 0.9552219819361404, "ewc_loss": 0.048539429903030396, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019852904370054603, "grad_norm": 5.804780006408691, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.86201012134552, "num_tokens": 286365791.0, "step": 7509 }, { "epoch": 0.9553491922147309, "ewc_loss": 0.04847715422511101, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019912701100111008, "grad_norm": 5.7826247215271, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8714101910591125, "num_tokens": 286404132.0, "step": 7510 }, { "epoch": 0.9554764024933214, "ewc_loss": 0.04855188727378845, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019865362264681607, "grad_norm": 5.789040565490723, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8520557880401611, "num_tokens": 286444828.0, "step": 7511 }, { "epoch": 0.955603612771912, "ewc_loss": 0.04839315637946129, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019828703079838306, "grad_norm": 5.77901029586792, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8667490482330322, "num_tokens": 286481725.0, "step": 7512 }, { "epoch": 0.9557308230505025, "ewc_loss": 0.04847497493028641, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019910524133592844, "grad_norm": 5.776468276977539, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8619797825813293, "num_tokens": 286520354.0, "step": 7513 }, { "epoch": 0.955858033329093, "ewc_loss": 0.048429980874061584, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019865525246132165, "grad_norm": 5.7868218421936035, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8596450090408325, "num_tokens": 286564314.0, "step": 7514 }, { "epoch": 0.9559852436076836, "ewc_loss": 0.04845210164785385, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001988764852285385, "grad_norm": 5.8168416023254395, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.860251784324646, "num_tokens": 286606955.0, "step": 7515 }, { "epoch": 0.956112453886274, "ewc_loss": 0.048504553735256195, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.000199401009012945, "grad_norm": 5.83554220199585, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8674125671386719, "num_tokens": 286640091.0, "step": 7516 }, { "epoch": 0.9562396641648645, "ewc_loss": 0.0485580638051033, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019871542463079095, "grad_norm": 5.809865474700928, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8611788749694824, "num_tokens": 286679090.0, "step": 7517 }, { "epoch": 0.956366874443455, "ewc_loss": 0.04859202355146408, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001990549935726449, "grad_norm": 5.850236892700195, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.865022599697113, "num_tokens": 286720260.0, "step": 7518 }, { "epoch": 0.9564940847220456, "ewc_loss": 0.04857547581195831, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019888953829649836, "grad_norm": 5.777541637420654, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.868401288986206, "num_tokens": 286763992.0, "step": 7519 }, { "epoch": 0.9566212950006361, "ewc_loss": 0.04842832684516907, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019863873603753746, "grad_norm": 5.79864501953125, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8425121903419495, "num_tokens": 286807068.0, "step": 7520 }, { "epoch": 0.9567485052792266, "ewc_loss": 0.048675790429115295, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019989268912468106, "grad_norm": 5.7654829025268555, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8685013055801392, "num_tokens": 286853020.0, "step": 7521 }, { "epoch": 0.956875715557817, "ewc_loss": 0.048423945903778076, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019859490566886961, "grad_norm": 5.813128471374512, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8517794609069824, "num_tokens": 286896098.0, "step": 7522 }, { "epoch": 0.9570029258364076, "ewc_loss": 0.048663556575775146, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019977033662144095, "grad_norm": 5.838623046875, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8458060026168823, "num_tokens": 286938873.0, "step": 7523 }, { "epoch": 0.9571301361149981, "ewc_loss": 0.04850153625011444, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019937082834076136, "grad_norm": 5.830790996551514, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8655455112457275, "num_tokens": 286972941.0, "step": 7524 }, { "epoch": 0.9572573463935886, "ewc_loss": 0.04861906170845032, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019932536815758795, "grad_norm": 5.81212043762207, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8510950803756714, "num_tokens": 287010818.0, "step": 7525 }, { "epoch": 0.9573845566721791, "ewc_loss": 0.04848307743668556, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019918623729608953, "grad_norm": 5.80810022354126, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.865562915802002, "num_tokens": 287053841.0, "step": 7526 }, { "epoch": 0.9575117669507697, "ewc_loss": 0.048655830323696136, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001996930513996631, "grad_norm": 5.868744850158691, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8671054840087891, "num_tokens": 287090900.0, "step": 7527 }, { "epoch": 0.9576389772293601, "ewc_loss": 0.04862310737371445, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019936583703383803, "grad_norm": 5.7704973220825195, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8591309785842896, "num_tokens": 287131203.0, "step": 7528 }, { "epoch": 0.9577661875079506, "ewc_loss": 0.04850466176867485, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001994020858546719, "grad_norm": 5.881110191345215, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8719730377197266, "num_tokens": 287165949.0, "step": 7529 }, { "epoch": 0.9578933977865411, "ewc_loss": 0.048489831387996674, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019925380183849484, "grad_norm": 5.814591884613037, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8611027002334595, "num_tokens": 287203376.0, "step": 7530 }, { "epoch": 0.9580206080651317, "ewc_loss": 0.04857092350721359, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020006469276268035, "grad_norm": 5.916475772857666, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8524873852729797, "num_tokens": 287240391.0, "step": 7531 }, { "epoch": 0.9581478183437222, "ewc_loss": 0.048501454293727875, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019937001343350857, "grad_norm": 5.799340724945068, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8592073917388916, "num_tokens": 287278485.0, "step": 7532 }, { "epoch": 0.9582750286223127, "ewc_loss": 0.0485234297811985, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019958976190537214, "grad_norm": 5.873396396636963, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8563511371612549, "num_tokens": 287313626.0, "step": 7533 }, { "epoch": 0.9584022389009031, "ewc_loss": 0.04850132763385773, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001993687474168837, "grad_norm": 5.781190872192383, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8773488998413086, "num_tokens": 287344532.0, "step": 7534 }, { "epoch": 0.9585294491794937, "ewc_loss": 0.04857451096177101, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0002001005777856335, "grad_norm": 5.936441898345947, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8434193730354309, "num_tokens": 287383069.0, "step": 7535 }, { "epoch": 0.9586566594580842, "ewc_loss": 0.048527780920267105, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001996332866838202, "grad_norm": 5.80708646774292, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8615212440490723, "num_tokens": 287418554.0, "step": 7536 }, { "epoch": 0.9587838697366747, "ewc_loss": 0.04856926202774048, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020004808902740479, "grad_norm": 5.950800895690918, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8613733053207397, "num_tokens": 287452449.0, "step": 7537 }, { "epoch": 0.9589110800152653, "ewc_loss": 0.04854559153318405, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019981138757430017, "grad_norm": 5.757895469665527, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8465576171875, "num_tokens": 287492247.0, "step": 7538 }, { "epoch": 0.9590382902938558, "ewc_loss": 0.04855251684784889, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019988064013887197, "grad_norm": 6.031830787658691, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8666983842849731, "num_tokens": 287528651.0, "step": 7539 }, { "epoch": 0.9591655005724462, "ewc_loss": 0.04858890175819397, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020024451077915728, "grad_norm": 5.7647857666015625, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8515437841415405, "num_tokens": 287565837.0, "step": 7540 }, { "epoch": 0.9592927108510367, "ewc_loss": 0.04853294789791107, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001996849459828809, "grad_norm": 5.979407787322998, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8620285987854004, "num_tokens": 287599469.0, "step": 7541 }, { "epoch": 0.9594199211296273, "ewc_loss": 0.048547323793172836, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019982870435342193, "grad_norm": 5.72162389755249, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8600284457206726, "num_tokens": 287637104.0, "step": 7542 }, { "epoch": 0.9595471314082178, "ewc_loss": 0.04849690943956375, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019932456780225039, "grad_norm": 5.803691864013672, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8624370694160461, "num_tokens": 287679134.0, "step": 7543 }, { "epoch": 0.9596743416868083, "ewc_loss": 0.04854234308004379, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019977890769951046, "grad_norm": 5.786088466644287, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8690208792686462, "num_tokens": 287720340.0, "step": 7544 }, { "epoch": 0.9598015519653988, "ewc_loss": 0.04852640628814697, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019961953512392938, "grad_norm": 5.784900188446045, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8658460378646851, "num_tokens": 287760931.0, "step": 7545 }, { "epoch": 0.9599287622439893, "ewc_loss": 0.048646554350852966, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020082099945284426, "grad_norm": 5.917918682098389, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.8475439548492432, "num_tokens": 287801723.0, "step": 7546 }, { "epoch": 0.9600559725225798, "ewc_loss": 0.048499539494514465, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019935087766498327, "grad_norm": 5.779459476470947, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8596399426460266, "num_tokens": 287837010.0, "step": 7547 }, { "epoch": 0.9601831828011703, "ewc_loss": 0.048688240349292755, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020001719531137496, "grad_norm": 5.856472015380859, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8638013601303101, "num_tokens": 287874381.0, "step": 7548 }, { "epoch": 0.9603103930797608, "ewc_loss": 0.04869562387466431, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020009100262541324, "grad_norm": 5.813133716583252, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8546924591064453, "num_tokens": 287916673.0, "step": 7549 }, { "epoch": 0.9604376033583514, "ewc_loss": 0.04856259375810623, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019998139759991318, "grad_norm": 5.804299831390381, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8569999933242798, "num_tokens": 287956764.0, "step": 7550 }, { "epoch": 0.9605648136369419, "ewc_loss": 0.048573583364486694, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020009132276754826, "grad_norm": 5.812915325164795, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8696544766426086, "num_tokens": 287996373.0, "step": 7551 }, { "epoch": 0.9606920239155323, "ewc_loss": 0.04856105148792267, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019996598712168634, "grad_norm": 5.868180751800537, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8569016456604004, "num_tokens": 288033671.0, "step": 7552 }, { "epoch": 0.9608192341941229, "ewc_loss": 0.048587024211883545, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020022572425659746, "grad_norm": 5.809973239898682, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8702882528305054, "num_tokens": 288068254.0, "step": 7553 }, { "epoch": 0.9609464444727134, "ewc_loss": 0.04868164658546448, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019995124603156, "grad_norm": 5.868037700653076, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8461823463439941, "num_tokens": 288102833.0, "step": 7554 }, { "epoch": 0.9610736547513039, "ewc_loss": 0.048670582473278046, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019984056416433305, "grad_norm": 5.868983268737793, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8744603395462036, "num_tokens": 288135927.0, "step": 7555 }, { "epoch": 0.9612008650298944, "ewc_loss": 0.048625580966472626, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019939058984164149, "grad_norm": 5.767087459564209, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.880929708480835, "num_tokens": 288173842.0, "step": 7556 }, { "epoch": 0.961328075308485, "ewc_loss": 0.04854714125394821, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001998268999159336, "grad_norm": 5.8148298263549805, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8658065795898438, "num_tokens": 288215007.0, "step": 7557 }, { "epoch": 0.9614552855870754, "ewc_loss": 0.04863220080733299, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001994567719521001, "grad_norm": 5.802872657775879, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.849796712398529, "num_tokens": 288251598.0, "step": 7558 }, { "epoch": 0.9615824958656659, "ewc_loss": 0.04867902398109436, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0001999249798245728, "grad_norm": 5.776918411254883, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8631552457809448, "num_tokens": 288292811.0, "step": 7559 }, { "epoch": 0.9617097061442564, "ewc_loss": 0.048586905002593994, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020022454555146396, "grad_norm": 5.856974124908447, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8653246760368347, "num_tokens": 288337790.0, "step": 7560 }, { "epoch": 0.961836916422847, "ewc_loss": 0.04859673231840134, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020032278553117067, "grad_norm": 5.815758228302002, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8585122227668762, "num_tokens": 288373347.0, "step": 7561 }, { "epoch": 0.9619641267014375, "ewc_loss": 0.04854617640376091, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019981723744422197, "grad_norm": 5.8103556632995605, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8820394277572632, "num_tokens": 288405650.0, "step": 7562 }, { "epoch": 0.962091336980028, "ewc_loss": 0.04858952760696411, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020025075355079025, "grad_norm": 5.845885753631592, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.868194580078125, "num_tokens": 288443849.0, "step": 7563 }, { "epoch": 0.9622185472586186, "ewc_loss": 0.04849062114953995, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00019926165987271816, "grad_norm": 5.742699146270752, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8417208790779114, "num_tokens": 288487557.0, "step": 7564 }, { "epoch": 0.962345757537209, "ewc_loss": 0.0487024188041687, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020015897462144494, "grad_norm": 5.791018009185791, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8519651889801025, "num_tokens": 288529932.0, "step": 7565 }, { "epoch": 0.9624729678157995, "ewc_loss": 0.04854993894696236, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0001998548541450873, "grad_norm": 5.810557842254639, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8569953441619873, "num_tokens": 288568276.0, "step": 7566 }, { "epoch": 0.96260017809439, "ewc_loss": 0.0486096516251564, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020045196288265288, "grad_norm": 5.780935764312744, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.864656388759613, "num_tokens": 288602387.0, "step": 7567 }, { "epoch": 0.9627273883729806, "ewc_loss": 0.04859248176217079, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020028029393870384, "grad_norm": 5.80480432510376, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8696795701980591, "num_tokens": 288638157.0, "step": 7568 }, { "epoch": 0.9628545986515711, "ewc_loss": 0.04857531189918518, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020010861044283956, "grad_norm": 5.775035858154297, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8577620983123779, "num_tokens": 288675054.0, "step": 7569 }, { "epoch": 0.9629818089301616, "ewc_loss": 0.04857394099235535, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0002000948879867792, "grad_norm": 5.778018951416016, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8666185140609741, "num_tokens": 288714222.0, "step": 7570 }, { "epoch": 0.963109019208752, "ewc_loss": 0.04859316349029541, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020028711878694594, "grad_norm": 5.827721118927002, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8516988754272461, "num_tokens": 288752772.0, "step": 7571 }, { "epoch": 0.9632362294873426, "ewc_loss": 0.04856904596090317, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020004590624012053, "grad_norm": 5.7829976081848145, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8458415269851685, "num_tokens": 288792067.0, "step": 7572 }, { "epoch": 0.9633634397659331, "ewc_loss": 0.048633985221385956, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020069531456101686, "grad_norm": 5.825063228607178, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8737845420837402, "num_tokens": 288826172.0, "step": 7573 }, { "epoch": 0.9634906500445236, "ewc_loss": 0.04869053512811661, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020004012912977487, "grad_norm": 5.784857273101807, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8727635145187378, "num_tokens": 288863774.0, "step": 7574 }, { "epoch": 0.9636178603231141, "ewc_loss": 0.04860687628388405, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0002004242269322276, "grad_norm": 5.803885459899902, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8556056618690491, "num_tokens": 288901892.0, "step": 7575 }, { "epoch": 0.9637450706017047, "ewc_loss": 0.04860088974237442, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0002003643603529781, "grad_norm": 5.770258903503418, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8512243628501892, "num_tokens": 288941470.0, "step": 7576 }, { "epoch": 0.9638722808802951, "ewc_loss": 0.04859654977917671, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020032096654176712, "grad_norm": 5.78543758392334, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8483743667602539, "num_tokens": 288988616.0, "step": 7577 }, { "epoch": 0.9639994911588856, "ewc_loss": 0.0485958456993103, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020031395251862705, "grad_norm": 5.774808406829834, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8784281015396118, "num_tokens": 289024589.0, "step": 7578 }, { "epoch": 0.9641267014374761, "ewc_loss": 0.04862756282091141, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0002006311115110293, "grad_norm": 5.810830116271973, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8642916679382324, "num_tokens": 289064334.0, "step": 7579 }, { "epoch": 0.9642539117160667, "ewc_loss": 0.04859309643507004, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0002002864348469302, "grad_norm": 5.80487585067749, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8512076139450073, "num_tokens": 289099385.0, "step": 7580 }, { "epoch": 0.9643811219946572, "ewc_loss": 0.04871276766061783, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020026245329063386, "grad_norm": 5.764499187469482, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8652785420417786, "num_tokens": 289139143.0, "step": 7581 }, { "epoch": 0.9645083322732477, "ewc_loss": 0.048751093447208405, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020064572163391858, "grad_norm": 5.841910362243652, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8551424741744995, "num_tokens": 289179303.0, "step": 7582 }, { "epoch": 0.9646355425518381, "ewc_loss": 0.04873887449502945, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020052351464983076, "grad_norm": 5.853469371795654, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8533152937889099, "num_tokens": 289214218.0, "step": 7583 }, { "epoch": 0.9647627528304287, "ewc_loss": 0.048741988837718964, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002005546703003347, "grad_norm": 5.882026672363281, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8579227924346924, "num_tokens": 289240991.0, "step": 7584 }, { "epoch": 0.9648899631090192, "ewc_loss": 0.04871101304888725, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020024488912895322, "grad_norm": 5.819318771362305, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8554561734199524, "num_tokens": 289273374.0, "step": 7585 }, { "epoch": 0.9650171733876097, "ewc_loss": 0.04870876669883728, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020022242097184062, "grad_norm": 5.779074668884277, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8479917049407959, "num_tokens": 289314904.0, "step": 7586 }, { "epoch": 0.9651443836662003, "ewc_loss": 0.04878976568579674, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002010324242291972, "grad_norm": 5.802369117736816, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8543860912322998, "num_tokens": 289358110.0, "step": 7587 }, { "epoch": 0.9652715939447908, "ewc_loss": 0.04874550178647041, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020058978407178074, "grad_norm": 5.79035758972168, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8521295189857483, "num_tokens": 289391581.0, "step": 7588 }, { "epoch": 0.9653988042233812, "ewc_loss": 0.048775918781757355, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020089397730771452, "grad_norm": 5.815587520599365, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8361302614212036, "num_tokens": 289432280.0, "step": 7589 }, { "epoch": 0.9655260145019717, "ewc_loss": 0.048796646296978, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020110124023631215, "grad_norm": 5.743413925170898, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8456903696060181, "num_tokens": 289473222.0, "step": 7590 }, { "epoch": 0.9656532247805623, "ewc_loss": 0.04882967472076416, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002014315250562504, "grad_norm": 5.839536190032959, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8568285703659058, "num_tokens": 289512898.0, "step": 7591 }, { "epoch": 0.9657804350591528, "ewc_loss": 0.04878608509898186, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020099562243558466, "grad_norm": 5.762015342712402, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8659490942955017, "num_tokens": 289554394.0, "step": 7592 }, { "epoch": 0.9659076453377433, "ewc_loss": 0.048849739134311676, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020163216686341912, "grad_norm": 5.789865970611572, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8577587604522705, "num_tokens": 289594871.0, "step": 7593 }, { "epoch": 0.9660348556163338, "ewc_loss": 0.04883544519543648, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020148922340013087, "grad_norm": 5.844108581542969, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8502435088157654, "num_tokens": 289629165.0, "step": 7594 }, { "epoch": 0.9661620658949243, "ewc_loss": 0.04883744567632675, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020150921773165464, "grad_norm": 5.759990215301514, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8597298264503479, "num_tokens": 289670131.0, "step": 7595 }, { "epoch": 0.9662892761735148, "ewc_loss": 0.048819780349731445, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020133258658461273, "grad_norm": 5.794429302215576, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8646276593208313, "num_tokens": 289705238.0, "step": 7596 }, { "epoch": 0.9664164864521053, "ewc_loss": 0.048830315470695496, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002014379424508661, "grad_norm": 5.819654941558838, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.863915205001831, "num_tokens": 289740146.0, "step": 7597 }, { "epoch": 0.9665436967306958, "ewc_loss": 0.048876162618398666, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020189638598822057, "grad_norm": 5.796997547149658, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8649208545684814, "num_tokens": 289781145.0, "step": 7598 }, { "epoch": 0.9666709070092864, "ewc_loss": 0.04895078018307686, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020142186258453876, "grad_norm": 6.7287163734436035, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8441566228866577, "num_tokens": 289816347.0, "step": 7599 }, { "epoch": 0.9667981172878769, "ewc_loss": 0.049182772636413574, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020496247452683747, "grad_norm": 5.728291988372803, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8476413488388062, "num_tokens": 289852974.0, "step": 7600 }, { "epoch": 0.9669253275664673, "ewc_loss": 0.04869148135185242, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020004955877084285, "grad_norm": 5.815532684326172, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8488576412200928, "num_tokens": 289894553.0, "step": 7601 }, { "epoch": 0.9670525378450578, "ewc_loss": 0.04885317385196686, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020166649483144283, "grad_norm": 5.798600196838379, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8427034020423889, "num_tokens": 289930641.0, "step": 7602 }, { "epoch": 0.9671797481236484, "ewc_loss": 0.048825379461050034, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020138856780249625, "grad_norm": 5.834945201873779, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8553150296211243, "num_tokens": 289972690.0, "step": 7603 }, { "epoch": 0.9673069584022389, "ewc_loss": 0.04882224649190903, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020135723752900958, "grad_norm": 5.816279888153076, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8546855449676514, "num_tokens": 290006518.0, "step": 7604 }, { "epoch": 0.9674341686808294, "ewc_loss": 0.04878600314259529, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020099479297641665, "grad_norm": 5.802593231201172, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8478337526321411, "num_tokens": 290045206.0, "step": 7605 }, { "epoch": 0.96756137895942, "ewc_loss": 0.04880137741565704, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020114854851271957, "grad_norm": 5.805566310882568, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8661023378372192, "num_tokens": 290086976.0, "step": 7606 }, { "epoch": 0.9676885892380104, "ewc_loss": 0.04881635308265686, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002012983022723347, "grad_norm": 5.793805122375488, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8735048770904541, "num_tokens": 290126894.0, "step": 7607 }, { "epoch": 0.9678157995166009, "ewc_loss": 0.04881216585636139, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020125642186030746, "grad_norm": 5.8442769050598145, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8747836351394653, "num_tokens": 290161684.0, "step": 7608 }, { "epoch": 0.9679430097951914, "ewc_loss": 0.048776935786008835, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002009041199926287, "grad_norm": 5.794399261474609, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8637265563011169, "num_tokens": 290203407.0, "step": 7609 }, { "epoch": 0.968070220073782, "ewc_loss": 0.048842985183000565, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020156461687292904, "grad_norm": 5.8520894050598145, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8675242066383362, "num_tokens": 290238645.0, "step": 7610 }, { "epoch": 0.9681974303523725, "ewc_loss": 0.04877345263957977, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020086926815565675, "grad_norm": 5.788396835327148, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8601480722427368, "num_tokens": 290280279.0, "step": 7611 }, { "epoch": 0.968324640630963, "ewc_loss": 0.04882770776748657, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020141182176303118, "grad_norm": 5.849447250366211, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8598013520240784, "num_tokens": 290317739.0, "step": 7612 }, { "epoch": 0.9684518509095535, "ewc_loss": 0.0487513542175293, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020064831187482923, "grad_norm": 5.779931545257568, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8592344522476196, "num_tokens": 290360384.0, "step": 7613 }, { "epoch": 0.968579061188144, "ewc_loss": 0.048817068338394165, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020130546181462705, "grad_norm": 5.848733901977539, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8673975467681885, "num_tokens": 290398706.0, "step": 7614 }, { "epoch": 0.9687062714667345, "ewc_loss": 0.04877389967441559, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002008737501455471, "grad_norm": 5.763576030731201, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8738549947738647, "num_tokens": 290439524.0, "step": 7615 }, { "epoch": 0.968833481745325, "ewc_loss": 0.04876554384827614, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020079020760022104, "grad_norm": 5.8686676025390625, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8562484979629517, "num_tokens": 290472125.0, "step": 7616 }, { "epoch": 0.9689606920239155, "ewc_loss": 0.04877978563308716, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002009326417464763, "grad_norm": 6.659633159637451, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8447797894477844, "num_tokens": 290514348.0, "step": 7617 }, { "epoch": 0.9690879023025061, "ewc_loss": 0.04893121495842934, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020244691404514015, "grad_norm": 5.744760990142822, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.855362057685852, "num_tokens": 290551699.0, "step": 7618 }, { "epoch": 0.9692151125810966, "ewc_loss": 0.04854712635278702, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019860603788401932, "grad_norm": 5.8584065437316895, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8621149063110352, "num_tokens": 290588644.0, "step": 7619 }, { "epoch": 0.969342322859687, "ewc_loss": 0.04866757243871689, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019981047080364078, "grad_norm": 5.755691051483154, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8628657460212708, "num_tokens": 290629546.0, "step": 7620 }, { "epoch": 0.9694695331382776, "ewc_loss": 0.04868992790579796, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002000340464292094, "grad_norm": 5.860102653503418, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8689414262771606, "num_tokens": 290665527.0, "step": 7621 }, { "epoch": 0.9695967434168681, "ewc_loss": 0.04866785556077957, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019981330842711031, "grad_norm": 5.726474761962891, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8520269393920898, "num_tokens": 290706667.0, "step": 7622 }, { "epoch": 0.9697239536954586, "ewc_loss": 0.04875470697879791, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020068185403943062, "grad_norm": 5.86403751373291, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8557845950126648, "num_tokens": 290745698.0, "step": 7623 }, { "epoch": 0.9698511639740491, "ewc_loss": 0.048787716776132584, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020101193513255566, "grad_norm": 5.846927642822266, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8665817379951477, "num_tokens": 290777983.0, "step": 7624 }, { "epoch": 0.9699783742526397, "ewc_loss": 0.04881912097334862, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020132598001509905, "grad_norm": 5.900912761688232, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8550728559494019, "num_tokens": 290814184.0, "step": 7625 }, { "epoch": 0.9701055845312301, "ewc_loss": 0.048787761479616165, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00019979166972916573, "grad_norm": 6.1990065574646, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.852513313293457, "num_tokens": 290854054.0, "step": 7626 }, { "epoch": 0.9702327948098206, "ewc_loss": 0.04869946464896202, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002001294051297009, "grad_norm": 5.746693134307861, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8600187301635742, "num_tokens": 290891325.0, "step": 7627 }, { "epoch": 0.9703600050884111, "ewc_loss": 0.04857204109430313, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019885518122464418, "grad_norm": 5.88898229598999, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8594300746917725, "num_tokens": 290930101.0, "step": 7628 }, { "epoch": 0.9704872153670017, "ewc_loss": 0.048626482486724854, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019939958292525262, "grad_norm": 5.778543949127197, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8595995903015137, "num_tokens": 290964691.0, "step": 7629 }, { "epoch": 0.9706144256455922, "ewc_loss": 0.04865255951881409, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019966033869422972, "grad_norm": 5.761880397796631, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8579099774360657, "num_tokens": 291008140.0, "step": 7630 }, { "epoch": 0.9707416359241827, "ewc_loss": 0.048676181584596634, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019989657448604703, "grad_norm": 5.768353462219238, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8549159169197083, "num_tokens": 291041484.0, "step": 7631 }, { "epoch": 0.9708688462027731, "ewc_loss": 0.0487632229924202, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020076701184734702, "grad_norm": 5.807100772857666, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8424655199050903, "num_tokens": 291082337.0, "step": 7632 }, { "epoch": 0.9709960564813637, "ewc_loss": 0.04878642410039902, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020099901303183287, "grad_norm": 5.837059497833252, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.85992431640625, "num_tokens": 291115921.0, "step": 7633 }, { "epoch": 0.9711232667599542, "ewc_loss": 0.048802293837070465, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020115770166739821, "grad_norm": 5.801516532897949, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8689161539077759, "num_tokens": 291154367.0, "step": 7634 }, { "epoch": 0.9712504770385447, "ewc_loss": 0.0488092303276062, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020122708519920707, "grad_norm": 5.761214733123779, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8602790832519531, "num_tokens": 291198919.0, "step": 7635 }, { "epoch": 0.9713776873171353, "ewc_loss": 0.0488038994371891, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020117376698181033, "grad_norm": 5.824720859527588, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8495780229568481, "num_tokens": 291237062.0, "step": 7636 }, { "epoch": 0.9715048975957258, "ewc_loss": 0.04882875829935074, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020142232824582607, "grad_norm": 5.787502765655518, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8726544380187988, "num_tokens": 291275676.0, "step": 7637 }, { "epoch": 0.9716321078743162, "ewc_loss": 0.04880573973059654, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.000201192160602659, "grad_norm": 5.781400203704834, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8532216548919678, "num_tokens": 291316785.0, "step": 7638 }, { "epoch": 0.9717593181529067, "ewc_loss": 0.048897504806518555, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020210981892887503, "grad_norm": 5.822022438049316, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8581223487854004, "num_tokens": 291360448.0, "step": 7639 }, { "epoch": 0.9718865284314973, "ewc_loss": 0.04885213077068329, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002016560611082241, "grad_norm": 5.819400310516357, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8580445647239685, "num_tokens": 291401650.0, "step": 7640 }, { "epoch": 0.9720137387100878, "ewc_loss": 0.048845358192920685, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020158833649475127, "grad_norm": 5.815167427062988, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8616372346878052, "num_tokens": 291439736.0, "step": 7641 }, { "epoch": 0.9721409489886783, "ewc_loss": 0.04884101450443268, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002015448990277946, "grad_norm": 5.841815948486328, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8662633299827576, "num_tokens": 291479077.0, "step": 7642 }, { "epoch": 0.9722681592672688, "ewc_loss": 0.048949893563985825, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020141300046816468, "grad_norm": 5.8096537590026855, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8706080913543701, "num_tokens": 291512470.0, "step": 7643 }, { "epoch": 0.9723953695458593, "ewc_loss": 0.04882507771253586, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020138554100412875, "grad_norm": 5.819123268127441, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.871107280254364, "num_tokens": 291552797.0, "step": 7644 }, { "epoch": 0.9725225798244498, "ewc_loss": 0.04882388189435005, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020137359388172626, "grad_norm": 5.827876091003418, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8643248081207275, "num_tokens": 291589885.0, "step": 7645 }, { "epoch": 0.9726497901030403, "ewc_loss": 0.048854779452085495, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020168256014585495, "grad_norm": 5.768880844116211, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8603883981704712, "num_tokens": 291633005.0, "step": 7646 }, { "epoch": 0.9727770003816308, "ewc_loss": 0.04883843660354614, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020151911303400993, "grad_norm": 5.877110004425049, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8567314147949219, "num_tokens": 291666083.0, "step": 7647 }, { "epoch": 0.9729042106602214, "ewc_loss": 0.048914264887571335, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020227741333656013, "grad_norm": 5.806247234344482, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8665584921836853, "num_tokens": 291706421.0, "step": 7648 }, { "epoch": 0.9730314209388119, "ewc_loss": 0.048875465989112854, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020188944472465664, "grad_norm": 5.8190131187438965, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8657531142234802, "num_tokens": 291744641.0, "step": 7649 }, { "epoch": 0.9731586312174023, "ewc_loss": 0.04885897785425186, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020172457152511925, "grad_norm": 5.813075065612793, "learning_rate": 1e-06, "loss": 0.3771, "mean_token_accuracy": 0.8868134021759033, "num_tokens": 291780586.0, "step": 7650 }, { "epoch": 0.9732858414959928, "ewc_loss": 0.04883332550525665, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020146803581155837, "grad_norm": 5.79121208190918, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8569586277008057, "num_tokens": 291820482.0, "step": 7651 }, { "epoch": 0.9734130517745834, "ewc_loss": 0.04885272681713104, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020166204194538295, "grad_norm": 5.801198959350586, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8704516887664795, "num_tokens": 291858568.0, "step": 7652 }, { "epoch": 0.9735402620531739, "ewc_loss": 0.048841990530490875, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002015546488109976, "grad_norm": 5.904763221740723, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8523274660110474, "num_tokens": 291889383.0, "step": 7653 }, { "epoch": 0.9736674723317644, "ewc_loss": 0.04890347272157669, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002021695108851418, "grad_norm": 5.823448181152344, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8697020411491394, "num_tokens": 291927541.0, "step": 7654 }, { "epoch": 0.973794682610355, "ewc_loss": 0.048813529312610626, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020127005700487643, "grad_norm": 5.842459201812744, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8448131084442139, "num_tokens": 291969074.0, "step": 7655 }, { "epoch": 0.9739218928889454, "ewc_loss": 0.048802897334098816, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020238445722498, "grad_norm": 5.809533596038818, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8580582141876221, "num_tokens": 292008311.0, "step": 7656 }, { "epoch": 0.9740491031675359, "ewc_loss": 0.04880784451961517, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002012132026720792, "grad_norm": 5.849048614501953, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8520881533622742, "num_tokens": 292051810.0, "step": 7657 }, { "epoch": 0.9741763134461264, "ewc_loss": 0.048886366188526154, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020199845312163234, "grad_norm": 5.804966449737549, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8676643371582031, "num_tokens": 292090411.0, "step": 7658 }, { "epoch": 0.974303523724717, "ewc_loss": 0.04880814999341965, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020121625857427716, "grad_norm": 5.8286614418029785, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8526074290275574, "num_tokens": 292124411.0, "step": 7659 }, { "epoch": 0.9744307340033075, "ewc_loss": 0.04882419854402542, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020137676619924605, "grad_norm": 5.76797342300415, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8605127930641174, "num_tokens": 292166428.0, "step": 7660 }, { "epoch": 0.974557944281898, "ewc_loss": 0.04889153689146042, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020205012697260827, "grad_norm": 5.851269721984863, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8624305725097656, "num_tokens": 292202687.0, "step": 7661 }, { "epoch": 0.9746851545604885, "ewc_loss": 0.04886116832494736, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002017464576056227, "grad_norm": 5.779898643493652, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.869908332824707, "num_tokens": 292246364.0, "step": 7662 }, { "epoch": 0.974812364839079, "ewc_loss": 0.048839762806892395, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002015323843806982, "grad_norm": 5.82388162612915, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8622623085975647, "num_tokens": 292285789.0, "step": 7663 }, { "epoch": 0.9749395751176695, "ewc_loss": 0.048838403075933456, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002015187928918749, "grad_norm": 5.8223185539245605, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8546534180641174, "num_tokens": 292320755.0, "step": 7664 }, { "epoch": 0.97506678539626, "ewc_loss": 0.04881267994642258, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002012615732382983, "grad_norm": 5.747255802154541, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.869436502456665, "num_tokens": 292362818.0, "step": 7665 }, { "epoch": 0.9751939956748505, "ewc_loss": 0.04890869930386543, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020222176681272686, "grad_norm": 5.845584392547607, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.852790117263794, "num_tokens": 292404016.0, "step": 7666 }, { "epoch": 0.9753212059534411, "ewc_loss": 0.04887467995285988, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002018815721385181, "grad_norm": 5.857615947723389, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.84749835729599, "num_tokens": 292442117.0, "step": 7667 }, { "epoch": 0.9754484162320316, "ewc_loss": 0.0488910898566246, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020204565953463316, "grad_norm": 5.8705735206604, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8675581216812134, "num_tokens": 292472523.0, "step": 7668 }, { "epoch": 0.975575626510622, "ewc_loss": 0.048866741359233856, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020180220599286258, "grad_norm": 5.871371746063232, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8655652403831482, "num_tokens": 292508571.0, "step": 7669 }, { "epoch": 0.9757028367892125, "ewc_loss": 0.0488143190741539, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002012779441429302, "grad_norm": 5.799705982208252, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8578030467033386, "num_tokens": 292545358.0, "step": 7670 }, { "epoch": 0.9758300470678031, "ewc_loss": 0.048898257315158844, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020211732771713287, "grad_norm": 5.875000476837158, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.845831036567688, "num_tokens": 292583962.0, "step": 7671 }, { "epoch": 0.9759572573463936, "ewc_loss": 0.04885295405983925, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020166431204415858, "grad_norm": 5.8252177238464355, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8606878519058228, "num_tokens": 292616186.0, "step": 7672 }, { "epoch": 0.9760844676249841, "ewc_loss": 0.04889790341258049, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020211379160173237, "grad_norm": 5.8520307540893555, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8567742705345154, "num_tokens": 292657378.0, "step": 7673 }, { "epoch": 0.9762116779035747, "ewc_loss": 0.04884197190403938, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002015544887399301, "grad_norm": 5.815097808837891, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8745703101158142, "num_tokens": 292692041.0, "step": 7674 }, { "epoch": 0.9763388881821651, "ewc_loss": 0.04887861758470535, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020192093506921083, "grad_norm": 5.854404926300049, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8581736087799072, "num_tokens": 292725062.0, "step": 7675 }, { "epoch": 0.9764660984607556, "ewc_loss": 0.04886972904205322, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020183203741908073, "grad_norm": 5.815791130065918, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8684499263763428, "num_tokens": 292766522.0, "step": 7676 }, { "epoch": 0.9765933087393461, "ewc_loss": 0.048913098871707916, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020226577180437744, "grad_norm": 5.796308517456055, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8599840998649597, "num_tokens": 292811371.0, "step": 7677 }, { "epoch": 0.9767205190179367, "ewc_loss": 0.04889383167028427, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020207308989483863, "grad_norm": 5.84515905380249, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8611927032470703, "num_tokens": 292850237.0, "step": 7678 }, { "epoch": 0.9768477292965272, "ewc_loss": 0.04891517758369446, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020228656649123877, "grad_norm": 5.8945770263671875, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.864945650100708, "num_tokens": 292887291.0, "step": 7679 }, { "epoch": 0.9769749395751177, "ewc_loss": 0.048865966498851776, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002017944207182154, "grad_norm": 5.816092491149902, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.873914361000061, "num_tokens": 292928396.0, "step": 7680 }, { "epoch": 0.9771021498537081, "ewc_loss": 0.048898957669734955, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002021243271883577, "grad_norm": 5.8616533279418945, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.864617645740509, "num_tokens": 292962748.0, "step": 7681 }, { "epoch": 0.9772293601322987, "ewc_loss": 0.04885733500123024, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020170811330899596, "grad_norm": 5.827065944671631, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8726438283920288, "num_tokens": 293002330.0, "step": 7682 }, { "epoch": 0.9773565704108892, "ewc_loss": 0.0488552525639534, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020168728951830417, "grad_norm": 5.849227428436279, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8616613149642944, "num_tokens": 293038288.0, "step": 7683 }, { "epoch": 0.9774837806894797, "ewc_loss": 0.0488642081618309, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020177687110845, "grad_norm": 5.8388495445251465, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8635972738265991, "num_tokens": 293078121.0, "step": 7684 }, { "epoch": 0.9776109909680702, "ewc_loss": 0.04886845126748085, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020181927538942546, "grad_norm": 5.865241050720215, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8574194312095642, "num_tokens": 293116846.0, "step": 7685 }, { "epoch": 0.9777382012466608, "ewc_loss": 0.048854902386665344, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002016837679548189, "grad_norm": 5.7730512619018555, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8571330308914185, "num_tokens": 293162237.0, "step": 7686 }, { "epoch": 0.9778654115252512, "ewc_loss": 0.04890771582722664, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020221192971803248, "grad_norm": 5.9502668380737305, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8572351932525635, "num_tokens": 293196631.0, "step": 7687 }, { "epoch": 0.9779926218038417, "ewc_loss": 0.048891581594944, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020205057808198035, "grad_norm": 5.833976745605469, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8711327314376831, "num_tokens": 293238883.0, "step": 7688 }, { "epoch": 0.9781198320824323, "ewc_loss": 0.0488613061606884, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020174782548565418, "grad_norm": 5.8980865478515625, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8515179753303528, "num_tokens": 293270802.0, "step": 7689 }, { "epoch": 0.9782470423610228, "ewc_loss": 0.0489158034324646, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020229280926287174, "grad_norm": 5.9185638427734375, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8478295803070068, "num_tokens": 293314910.0, "step": 7690 }, { "epoch": 0.9783742526396133, "ewc_loss": 0.04882095381617546, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020134430087637156, "grad_norm": 5.838046073913574, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8738117218017578, "num_tokens": 293353083.0, "step": 7691 }, { "epoch": 0.9785014629182038, "ewc_loss": 0.04879921302199364, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020112689526285976, "grad_norm": 5.894289016723633, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.862507700920105, "num_tokens": 293389473.0, "step": 7692 }, { "epoch": 0.9786286731967943, "ewc_loss": 0.04881248623132706, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020125960872974247, "grad_norm": 5.885563373565674, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8742354512214661, "num_tokens": 293426893.0, "step": 7693 }, { "epoch": 0.9787558834753848, "ewc_loss": 0.04876541346311569, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020078889792785048, "grad_norm": 5.815793514251709, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8663768172264099, "num_tokens": 293468116.0, "step": 7694 }, { "epoch": 0.9788830937539753, "ewc_loss": 0.04877443239092827, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002008790906984359, "grad_norm": 5.870217323303223, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8642485737800598, "num_tokens": 293503050.0, "step": 7695 }, { "epoch": 0.9790103040325658, "ewc_loss": 0.048802126199007034, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020115602819714695, "grad_norm": 5.887694358825684, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8510392904281616, "num_tokens": 293541673.0, "step": 7696 }, { "epoch": 0.9791375143111564, "ewc_loss": 0.04874444007873535, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002005791902774945, "grad_norm": 5.83721923828125, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8582711219787598, "num_tokens": 293578472.0, "step": 7697 }, { "epoch": 0.9792647245897469, "ewc_loss": 0.04880804196000099, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020121518173255026, "grad_norm": 5.923488140106201, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8625267744064331, "num_tokens": 293611204.0, "step": 7698 }, { "epoch": 0.9793919348683373, "ewc_loss": 0.04875607043504715, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020069546008016914, "grad_norm": 5.800431251525879, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8509432077407837, "num_tokens": 293651352.0, "step": 7699 }, { "epoch": 0.9795191451469278, "ewc_loss": 0.048816025257110596, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020129504264332354, "grad_norm": 5.915798664093018, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8721793293952942, "num_tokens": 293687396.0, "step": 7700 }, { "epoch": 0.9796463554255184, "ewc_loss": 0.04876488074660301, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002007835719268769, "grad_norm": 5.841913223266602, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8574813008308411, "num_tokens": 293725083.0, "step": 7701 }, { "epoch": 0.9797735657041089, "ewc_loss": 0.0488196462392807, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020133121870458126, "grad_norm": 5.8627238273620605, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8616104125976562, "num_tokens": 293766136.0, "step": 7702 }, { "epoch": 0.9799007759826994, "ewc_loss": 0.048790790140628815, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002010426833294332, "grad_norm": 5.923851490020752, "learning_rate": 1e-06, "loss": 0.5148, "mean_token_accuracy": 0.8443191647529602, "num_tokens": 293805810.0, "step": 7703 }, { "epoch": 0.98002798626129, "ewc_loss": 0.04880557954311371, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020119055989198387, "grad_norm": 5.919464111328125, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.853972852230072, "num_tokens": 293839041.0, "step": 7704 }, { "epoch": 0.9801551965398804, "ewc_loss": 0.04873618483543396, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020049662271048874, "grad_norm": 5.760217189788818, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8754318952560425, "num_tokens": 293876541.0, "step": 7705 }, { "epoch": 0.9802824068184709, "ewc_loss": 0.048772890120744705, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020086366566829383, "grad_norm": 5.83705472946167, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8752785921096802, "num_tokens": 293918141.0, "step": 7706 }, { "epoch": 0.9804096170970614, "ewc_loss": 0.04883577674627304, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020149254123680294, "grad_norm": 5.863001346588135, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8484583497047424, "num_tokens": 293958078.0, "step": 7707 }, { "epoch": 0.980536827375652, "ewc_loss": 0.048822589218616486, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020136067178100348, "grad_norm": 5.874109268188477, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8710476160049438, "num_tokens": 293990347.0, "step": 7708 }, { "epoch": 0.9806640376542425, "ewc_loss": 0.048832669854164124, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020146145834587514, "grad_norm": 5.844400882720947, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8663719892501831, "num_tokens": 294032136.0, "step": 7709 }, { "epoch": 0.980791247932833, "ewc_loss": 0.048816777765750885, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020130255143158138, "grad_norm": 5.9181437492370605, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8478675484657288, "num_tokens": 294065136.0, "step": 7710 }, { "epoch": 0.9809184582114235, "ewc_loss": 0.04884236305952072, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002015583886532113, "grad_norm": 5.812950611114502, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8750945329666138, "num_tokens": 294104539.0, "step": 7711 }, { "epoch": 0.981045668490014, "ewc_loss": 0.048824213445186615, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020137691171839833, "grad_norm": 5.850099563598633, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.859006404876709, "num_tokens": 294148962.0, "step": 7712 }, { "epoch": 0.9811728787686045, "ewc_loss": 0.04885152727365494, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020165002206340432, "grad_norm": 5.850246429443359, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8663436770439148, "num_tokens": 294181866.0, "step": 7713 }, { "epoch": 0.981300089047195, "ewc_loss": 0.04879677668213844, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020110253535676748, "grad_norm": 5.80160665512085, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8697166442871094, "num_tokens": 294223558.0, "step": 7714 }, { "epoch": 0.9814272993257855, "ewc_loss": 0.04884864389896393, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020162119471933693, "grad_norm": 5.928678035736084, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.865642786026001, "num_tokens": 294260487.0, "step": 7715 }, { "epoch": 0.9815545096043761, "ewc_loss": 0.04884496331214905, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020158437837380916, "grad_norm": 5.772400379180908, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8622710704803467, "num_tokens": 294302642.0, "step": 7716 }, { "epoch": 0.9816817198829666, "ewc_loss": 0.048807885497808456, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020121362467762083, "grad_norm": 5.989431381225586, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8510738611221313, "num_tokens": 294338376.0, "step": 7717 }, { "epoch": 0.981808930161557, "ewc_loss": 0.048884373158216476, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020197850244585425, "grad_norm": 5.839016914367676, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.855456531047821, "num_tokens": 294372089.0, "step": 7718 }, { "epoch": 0.9819361404401475, "ewc_loss": 0.04887460172176361, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.000201880800887011, "grad_norm": 5.904412746429443, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8660641312599182, "num_tokens": 294403801.0, "step": 7719 }, { "epoch": 0.9820633507187381, "ewc_loss": 0.04886038601398468, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020173864322714508, "grad_norm": 5.837069034576416, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8643383979797363, "num_tokens": 294440301.0, "step": 7720 }, { "epoch": 0.9821905609973286, "ewc_loss": 0.048885151743888855, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020198625861667097, "grad_norm": 5.867495059967041, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8604216575622559, "num_tokens": 294472967.0, "step": 7721 }, { "epoch": 0.9823177712759191, "ewc_loss": 0.04889176785945892, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020205246983096004, "grad_norm": 5.813827991485596, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.851192831993103, "num_tokens": 294518248.0, "step": 7722 }, { "epoch": 0.9824449815545097, "ewc_loss": 0.04890327900648117, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002021675609285012, "grad_norm": 5.840002536773682, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8598600029945374, "num_tokens": 294553899.0, "step": 7723 }, { "epoch": 0.9825721918331001, "ewc_loss": 0.048933595418930054, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020247072097845376, "grad_norm": 5.8026323318481445, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8391631841659546, "num_tokens": 294591382.0, "step": 7724 }, { "epoch": 0.9826994021116906, "ewc_loss": 0.048962078988552094, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020275553106330335, "grad_norm": 5.845051288604736, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.861907958984375, "num_tokens": 294629606.0, "step": 7725 }, { "epoch": 0.9828266123902811, "ewc_loss": 0.04900582879781723, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020319306349847466, "grad_norm": 5.770942687988281, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8645113706588745, "num_tokens": 294669417.0, "step": 7726 }, { "epoch": 0.9829538226688717, "ewc_loss": 0.04901156201958656, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002032503834925592, "grad_norm": 5.851736068725586, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.871182918548584, "num_tokens": 294709876.0, "step": 7727 }, { "epoch": 0.9830810329474622, "ewc_loss": 0.04901757836341858, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002033105556620285, "grad_norm": 5.819851398468018, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8528940677642822, "num_tokens": 294750565.0, "step": 7728 }, { "epoch": 0.9832082432260527, "ewc_loss": 0.049060139805078506, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020373615552671254, "grad_norm": 5.845263481140137, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8621472716331482, "num_tokens": 294790586.0, "step": 7729 }, { "epoch": 0.9833354535046431, "ewc_loss": 0.04902826249599457, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020341738127171993, "grad_norm": 5.829738616943359, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8595713973045349, "num_tokens": 294830176.0, "step": 7730 }, { "epoch": 0.9834626637832337, "ewc_loss": 0.04899541288614273, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020308891544118524, "grad_norm": 5.819910526275635, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8492882251739502, "num_tokens": 294874199.0, "step": 7731 }, { "epoch": 0.9835898740618242, "ewc_loss": 0.04903235286474228, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020345831580925733, "grad_norm": 5.795773029327393, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8681373000144958, "num_tokens": 294914219.0, "step": 7732 }, { "epoch": 0.9837170843404147, "ewc_loss": 0.04901035130023956, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020323829085100442, "grad_norm": 5.8793535232543945, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8390697836875916, "num_tokens": 294950726.0, "step": 7733 }, { "epoch": 0.9838442946190052, "ewc_loss": 0.04902410879731178, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020337585010565817, "grad_norm": 5.815770626068115, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8617130517959595, "num_tokens": 294993815.0, "step": 7734 }, { "epoch": 0.9839715048975958, "ewc_loss": 0.048970215022563934, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002028369199251756, "grad_norm": 5.8184638023376465, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8693537712097168, "num_tokens": 295029275.0, "step": 7735 }, { "epoch": 0.9840987151761862, "ewc_loss": 0.049009907990694046, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020323383796494454, "grad_norm": 5.824791431427002, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8611131906509399, "num_tokens": 295066166.0, "step": 7736 }, { "epoch": 0.9842259254547767, "ewc_loss": 0.04902712628245354, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002034060307778418, "grad_norm": 5.898629665374756, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8391817808151245, "num_tokens": 295100436.0, "step": 7737 }, { "epoch": 0.9843531357333672, "ewc_loss": 0.04897547885775566, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002028895542025566, "grad_norm": 5.761434555053711, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8612686991691589, "num_tokens": 295141756.0, "step": 7738 }, { "epoch": 0.9844803460119578, "ewc_loss": 0.049028828740119934, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020342302741482854, "grad_norm": 5.860250473022461, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8785827159881592, "num_tokens": 295178894.0, "step": 7739 }, { "epoch": 0.9846075562905483, "ewc_loss": 0.04899048060178757, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020303956989664584, "grad_norm": 5.805539608001709, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8508104681968689, "num_tokens": 295219133.0, "step": 7740 }, { "epoch": 0.9847347665691388, "ewc_loss": 0.048981793224811554, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002029526949627325, "grad_norm": 5.860413074493408, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8617497086524963, "num_tokens": 295260322.0, "step": 7741 }, { "epoch": 0.9848619768477292, "ewc_loss": 0.04901104420423508, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020324518845882267, "grad_norm": 5.835089206695557, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8554941415786743, "num_tokens": 295299675.0, "step": 7742 }, { "epoch": 0.9849891871263198, "ewc_loss": 0.04893919825553894, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002025267604039982, "grad_norm": 5.8134307861328125, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8673489689826965, "num_tokens": 295336281.0, "step": 7743 }, { "epoch": 0.9851163974049103, "ewc_loss": 0.049021076411008835, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020334552391432226, "grad_norm": 5.828913688659668, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8591830730438232, "num_tokens": 295376932.0, "step": 7744 }, { "epoch": 0.9852436076835008, "ewc_loss": 0.04898979514837265, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020303271594457328, "grad_norm": 5.817639350891113, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.857958197593689, "num_tokens": 295421904.0, "step": 7745 }, { "epoch": 0.9853708179620914, "ewc_loss": 0.04900575429201126, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.000203192321350798, "grad_norm": 5.898089408874512, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8615095615386963, "num_tokens": 295456688.0, "step": 7746 }, { "epoch": 0.9854980282406819, "ewc_loss": 0.048977166414260864, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020290643442422152, "grad_norm": 5.781014442443848, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8785219192504883, "num_tokens": 295500022.0, "step": 7747 }, { "epoch": 0.9856252385192723, "ewc_loss": 0.04900012165307999, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020313600543886423, "grad_norm": 5.847408294677734, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8811690807342529, "num_tokens": 295538106.0, "step": 7748 }, { "epoch": 0.9857524487978628, "ewc_loss": 0.048996321856975555, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002030979812843725, "grad_norm": 5.8665032386779785, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8563536405563354, "num_tokens": 295572465.0, "step": 7749 }, { "epoch": 0.9858796590764534, "ewc_loss": 0.048997193574905396, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020310671243350953, "grad_norm": 5.810748100280762, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8667200803756714, "num_tokens": 295614577.0, "step": 7750 }, { "epoch": 0.9860068693550439, "ewc_loss": 0.04900708049535751, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020320557814557105, "grad_norm": 5.820578098297119, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8644018173217773, "num_tokens": 295654780.0, "step": 7751 }, { "epoch": 0.9861340796336344, "ewc_loss": 0.04897285997867584, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020286337530706078, "grad_norm": 5.798308849334717, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8608811497688293, "num_tokens": 295695880.0, "step": 7752 }, { "epoch": 0.986261289912225, "ewc_loss": 0.049030229449272156, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020343705546110868, "grad_norm": 5.885445594787598, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8615500926971436, "num_tokens": 295738176.0, "step": 7753 }, { "epoch": 0.9863885001908154, "ewc_loss": 0.048960767686367035, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020274244889151305, "grad_norm": 5.853438854217529, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8500562906265259, "num_tokens": 295777562.0, "step": 7754 }, { "epoch": 0.9865157104694059, "ewc_loss": 0.048958804458379745, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020272281835786998, "grad_norm": 5.811585903167725, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8774731755256653, "num_tokens": 295815013.0, "step": 7755 }, { "epoch": 0.9866429207479964, "ewc_loss": 0.04898906499147415, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020302541088312864, "grad_norm": 5.837432861328125, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8502801656723022, "num_tokens": 295857751.0, "step": 7756 }, { "epoch": 0.986770131026587, "ewc_loss": 0.04898989945650101, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020303377823438495, "grad_norm": 5.842268466949463, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8667702674865723, "num_tokens": 295896033.0, "step": 7757 }, { "epoch": 0.9868973413051775, "ewc_loss": 0.04901555925607681, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002032903430517763, "grad_norm": 5.828630447387695, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8636731505393982, "num_tokens": 295936838.0, "step": 7758 }, { "epoch": 0.987024551583768, "ewc_loss": 0.049002695828676224, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020316173322498798, "grad_norm": 5.827977180480957, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.867261528968811, "num_tokens": 295972307.0, "step": 7759 }, { "epoch": 0.9871517618623584, "ewc_loss": 0.049062374979257584, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020375852182041854, "grad_norm": 5.906802654266357, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8589656352996826, "num_tokens": 296009966.0, "step": 7760 }, { "epoch": 0.987278972140949, "ewc_loss": 0.04899747669696808, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020310950640123338, "grad_norm": 5.876675128936768, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8578029870986938, "num_tokens": 296047874.0, "step": 7761 }, { "epoch": 0.9874061824195395, "ewc_loss": 0.0490453839302063, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020358858455438167, "grad_norm": 5.858377933502197, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8694337010383606, "num_tokens": 296090591.0, "step": 7762 }, { "epoch": 0.98753339269813, "ewc_loss": 0.04898908734321594, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020302561460994184, "grad_norm": 5.849045753479004, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8560271263122559, "num_tokens": 296133024.0, "step": 7763 }, { "epoch": 0.9876606029767205, "ewc_loss": 0.04898529499769211, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002029877359746024, "grad_norm": 5.83376932144165, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8495210409164429, "num_tokens": 296177347.0, "step": 7764 }, { "epoch": 0.9877878132553111, "ewc_loss": 0.04904010146856308, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002035357611021027, "grad_norm": 5.840099811553955, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8516196012496948, "num_tokens": 296215532.0, "step": 7765 }, { "epoch": 0.9879150235339016, "ewc_loss": 0.049005813896656036, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020319291797932237, "grad_norm": 5.934475898742676, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8499623537063599, "num_tokens": 296249494.0, "step": 7766 }, { "epoch": 0.988042233812492, "ewc_loss": 0.049010008573532104, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020323487115092576, "grad_norm": 5.838513374328613, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8523443341255188, "num_tokens": 296289370.0, "step": 7767 }, { "epoch": 0.9881694440910825, "ewc_loss": 0.049019042402505875, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.000203325180336833, "grad_norm": 5.867129802703857, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8620099425315857, "num_tokens": 296329376.0, "step": 7768 }, { "epoch": 0.9882966543696731, "ewc_loss": 0.04901653528213501, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020330012193880975, "grad_norm": 5.8163042068481445, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8677628040313721, "num_tokens": 296368875.0, "step": 7769 }, { "epoch": 0.9884238646482636, "ewc_loss": 0.049008384346961975, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002032186312135309, "grad_norm": 5.885040283203125, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8598934412002563, "num_tokens": 296404766.0, "step": 7770 }, { "epoch": 0.9885510749268541, "ewc_loss": 0.04904927313327789, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002036274818237871, "grad_norm": 5.846181869506836, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8597210645675659, "num_tokens": 296441705.0, "step": 7771 }, { "epoch": 0.9886782852054447, "ewc_loss": 0.04896971583366394, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020283192861825228, "grad_norm": 5.827526569366455, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8683536052703857, "num_tokens": 296481075.0, "step": 7772 }, { "epoch": 0.9888054954840351, "ewc_loss": 0.04903370514512062, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020347181998658925, "grad_norm": 5.931447982788086, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8575153350830078, "num_tokens": 296511404.0, "step": 7773 }, { "epoch": 0.9889327057626256, "ewc_loss": 0.04900109022855759, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020314565335866064, "grad_norm": 5.889761447906494, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8516445755958557, "num_tokens": 296546080.0, "step": 7774 }, { "epoch": 0.9890599160412161, "ewc_loss": 0.04903228208422661, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002034575882134959, "grad_norm": 5.840913772583008, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8635379672050476, "num_tokens": 296586119.0, "step": 7775 }, { "epoch": 0.9891871263198067, "ewc_loss": 0.04906231164932251, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020375789608806372, "grad_norm": 5.851629257202148, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8596305251121521, "num_tokens": 296629602.0, "step": 7776 }, { "epoch": 0.9893143365983972, "ewc_loss": 0.049047693610191345, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002036116784438491, "grad_norm": 5.885305881500244, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8539397716522217, "num_tokens": 296666709.0, "step": 7777 }, { "epoch": 0.9894415468769877, "ewc_loss": 0.04904479905962944, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002035827492363751, "grad_norm": 5.830063819885254, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8593930006027222, "num_tokens": 296702611.0, "step": 7778 }, { "epoch": 0.9895687571555781, "ewc_loss": 0.049082737416028976, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020396214677020907, "grad_norm": 5.9392571449279785, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8663209676742554, "num_tokens": 296737649.0, "step": 7779 }, { "epoch": 0.9896959674341687, "ewc_loss": 0.049000538885593414, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020314015273470432, "grad_norm": 5.792259216308594, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8627045154571533, "num_tokens": 296776915.0, "step": 7780 }, { "epoch": 0.9898231777127592, "ewc_loss": 0.04903741180896759, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020350889826659113, "grad_norm": 5.89675760269165, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8680628538131714, "num_tokens": 296811491.0, "step": 7781 }, { "epoch": 0.9899503879913497, "ewc_loss": 0.049096591770648956, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002041006664512679, "grad_norm": 5.923338413238525, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8653287887573242, "num_tokens": 296847602.0, "step": 7782 }, { "epoch": 0.9900775982699402, "ewc_loss": 0.04898662120103836, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020300097821746022, "grad_norm": 5.832040309906006, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8451639413833618, "num_tokens": 296884737.0, "step": 7783 }, { "epoch": 0.9902048085485308, "ewc_loss": 0.04908040910959244, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020393884915392846, "grad_norm": 5.891307353973389, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8493971228599548, "num_tokens": 296927780.0, "step": 7784 }, { "epoch": 0.9903320188271212, "ewc_loss": 0.04908096045255661, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020394434977788478, "grad_norm": 5.907236576080322, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8754760026931763, "num_tokens": 296964200.0, "step": 7785 }, { "epoch": 0.9904592291057117, "ewc_loss": 0.049067482352256775, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020380958449095488, "grad_norm": 5.820209503173828, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8527262806892395, "num_tokens": 297009927.0, "step": 7786 }, { "epoch": 0.9905864393843022, "ewc_loss": 0.04907079041004181, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020384267554618418, "grad_norm": 5.959294319152832, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8487449288368225, "num_tokens": 297050788.0, "step": 7787 }, { "epoch": 0.9907136496628928, "ewc_loss": 0.04904213175177574, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020355609012767673, "grad_norm": 5.825775623321533, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8669640421867371, "num_tokens": 297092398.0, "step": 7788 }, { "epoch": 0.9908408599414833, "ewc_loss": 0.049025341868400574, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020338816102594137, "grad_norm": 5.848908424377441, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8586610555648804, "num_tokens": 297130728.0, "step": 7789 }, { "epoch": 0.9909680702200738, "ewc_loss": 0.049025438725948334, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002033891505561769, "grad_norm": 5.844511032104492, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8840682506561279, "num_tokens": 297169124.0, "step": 7790 }, { "epoch": 0.9910952804986642, "ewc_loss": 0.048990800976753235, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002030427858699113, "grad_norm": 5.935964584350586, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8705830574035645, "num_tokens": 297204914.0, "step": 7791 }, { "epoch": 0.9912224907772548, "ewc_loss": 0.04903274402022362, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002034622011706233, "grad_norm": 5.860494613647461, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.857661783695221, "num_tokens": 297240551.0, "step": 7792 }, { "epoch": 0.9913497010558453, "ewc_loss": 0.04895469918847084, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020268175285309553, "grad_norm": 5.800393581390381, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8521316051483154, "num_tokens": 297287160.0, "step": 7793 }, { "epoch": 0.9914769113344358, "ewc_loss": 0.04909144341945648, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020404919632710516, "grad_norm": 5.878233432769775, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8628891706466675, "num_tokens": 297323394.0, "step": 7794 }, { "epoch": 0.9916041216130264, "ewc_loss": 0.048972856253385544, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002028633316513151, "grad_norm": 5.930917263031006, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8398618698120117, "num_tokens": 297357808.0, "step": 7795 }, { "epoch": 0.9917313318916169, "ewc_loss": 0.04905802011489868, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002037149533862248, "grad_norm": 5.884810924530029, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8566339612007141, "num_tokens": 297397300.0, "step": 7796 }, { "epoch": 0.9918585421702073, "ewc_loss": 0.048970434814691544, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002028391172643751, "grad_norm": 5.821740627288818, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8697484731674194, "num_tokens": 297434699.0, "step": 7797 }, { "epoch": 0.9919857524487978, "ewc_loss": 0.04905228689312935, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020365763339214027, "grad_norm": 5.850226402282715, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8612991571426392, "num_tokens": 297473939.0, "step": 7798 }, { "epoch": 0.9921129627273884, "ewc_loss": 0.04902166873216629, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020335146109573543, "grad_norm": 5.870213508605957, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8539462089538574, "num_tokens": 297511154.0, "step": 7799 }, { "epoch": 0.9922401730059789, "ewc_loss": 0.049070172011852264, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002038364764302969, "grad_norm": 5.801812171936035, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8525211811065674, "num_tokens": 297556092.0, "step": 7800 }, { "epoch": 0.9923673832845694, "ewc_loss": 0.049117621034383774, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020431097073014826, "grad_norm": 5.898002624511719, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8656259775161743, "num_tokens": 297598160.0, "step": 7801 }, { "epoch": 0.9924945935631599, "ewc_loss": 0.049053043127059937, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020366517128422856, "grad_norm": 5.806299209594727, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8778910636901855, "num_tokens": 297638706.0, "step": 7802 }, { "epoch": 0.9926218038417504, "ewc_loss": 0.04913146048784256, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020444937399588525, "grad_norm": 5.895273208618164, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8659683465957642, "num_tokens": 297675955.0, "step": 7803 }, { "epoch": 0.9927490141203409, "ewc_loss": 0.049069177359342575, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020382653747219592, "grad_norm": 5.834592819213867, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8564857244491577, "num_tokens": 297715362.0, "step": 7804 }, { "epoch": 0.9928762243989314, "ewc_loss": 0.049092940986156464, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020406417024787515, "grad_norm": 5.9213948249816895, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8468677997589111, "num_tokens": 297750931.0, "step": 7805 }, { "epoch": 0.993003434677522, "ewc_loss": 0.04909303039312363, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020406505791470408, "grad_norm": 5.860998630523682, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8533979654312134, "num_tokens": 297786107.0, "step": 7806 }, { "epoch": 0.9931306449561125, "ewc_loss": 0.04907047748565674, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020383951778057963, "grad_norm": 5.877895355224609, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8602949976921082, "num_tokens": 297824782.0, "step": 7807 }, { "epoch": 0.993257855234703, "ewc_loss": 0.049082037061452866, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.000203955132747069, "grad_norm": 5.888908386230469, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8573817014694214, "num_tokens": 297861748.0, "step": 7808 }, { "epoch": 0.9933850655132934, "ewc_loss": 0.04899951070547104, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020312986453063786, "grad_norm": 5.828101634979248, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8610669374465942, "num_tokens": 297902722.0, "step": 7809 }, { "epoch": 0.993512275791884, "ewc_loss": 0.0490366630256176, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020350140403024852, "grad_norm": 5.814698696136475, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8624717593193054, "num_tokens": 297948709.0, "step": 7810 }, { "epoch": 0.9936394860704745, "ewc_loss": 0.049076080322265625, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002038955717580393, "grad_norm": 5.856333255767822, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8629381656646729, "num_tokens": 297986446.0, "step": 7811 }, { "epoch": 0.993766696349065, "ewc_loss": 0.04907860606908798, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002039208193309605, "grad_norm": 5.858335018157959, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8543961644172668, "num_tokens": 298024004.0, "step": 7812 }, { "epoch": 0.9938939066276555, "ewc_loss": 0.04913193732500076, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020445413247216493, "grad_norm": 5.881438732147217, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8511655926704407, "num_tokens": 298065518.0, "step": 7813 }, { "epoch": 0.9940211169062461, "ewc_loss": 0.049033619463443756, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020347096142359078, "grad_norm": 5.874460220336914, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8539919257164001, "num_tokens": 298108138.0, "step": 7814 }, { "epoch": 0.9941483271848366, "ewc_loss": 0.0490502268075943, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002036370278801769, "grad_norm": 5.845443248748779, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8606293797492981, "num_tokens": 298149348.0, "step": 7815 }, { "epoch": 0.994275537463427, "ewc_loss": 0.0491144061088562, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020427884010132402, "grad_norm": 5.872908115386963, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.857895016670227, "num_tokens": 298186347.0, "step": 7816 }, { "epoch": 0.9944027477420175, "ewc_loss": 0.04911668971180916, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020430167205631733, "grad_norm": 5.886931896209717, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8403419256210327, "num_tokens": 298223322.0, "step": 7817 }, { "epoch": 0.9945299580206081, "ewc_loss": 0.0491531640291214, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020466641581151634, "grad_norm": 5.896425247192383, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8550480604171753, "num_tokens": 298258794.0, "step": 7818 }, { "epoch": 0.9946571682991986, "ewc_loss": 0.04914446175098419, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020457936625462025, "grad_norm": 5.893875598907471, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8542571067810059, "num_tokens": 298297510.0, "step": 7819 }, { "epoch": 0.9947843785777891, "ewc_loss": 0.04916347563266754, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002047695015789941, "grad_norm": 5.901682376861572, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8651202917098999, "num_tokens": 298333784.0, "step": 7820 }, { "epoch": 0.9949115888563796, "ewc_loss": 0.0491270050406456, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020440483058337122, "grad_norm": 5.836562156677246, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8660227060317993, "num_tokens": 298371241.0, "step": 7821 }, { "epoch": 0.9950387991349701, "ewc_loss": 0.04901234805583954, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020447897259145975, "grad_norm": 5.8880157470703125, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8464539647102356, "num_tokens": 298407902.0, "step": 7822 }, { "epoch": 0.9951660094135606, "ewc_loss": 0.04911132901906967, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020424806280061603, "grad_norm": 5.825649261474609, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8353874683380127, "num_tokens": 298452339.0, "step": 7823 }, { "epoch": 0.9952932196921511, "ewc_loss": 0.04912800341844559, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.0002056354860542342, "grad_norm": 5.86076021194458, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.851087212562561, "num_tokens": 298495854.0, "step": 7824 }, { "epoch": 0.9954204299707416, "ewc_loss": 0.04894708842039108, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020382636284921318, "grad_norm": 5.890114784240723, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8550362586975098, "num_tokens": 298529642.0, "step": 7825 }, { "epoch": 0.9955476402493322, "ewc_loss": 0.049073703587055206, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020509252499323338, "grad_norm": 5.805517673492432, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8619768023490906, "num_tokens": 298574396.0, "step": 7826 }, { "epoch": 0.9956748505279227, "ewc_loss": 0.04916742816567421, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002048090536845848, "grad_norm": 5.871307373046875, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8468250036239624, "num_tokens": 298612457.0, "step": 7827 }, { "epoch": 0.9958020608065131, "ewc_loss": 0.049152713268995285, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020466190471779555, "grad_norm": 5.838287353515625, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.860562801361084, "num_tokens": 298645061.0, "step": 7828 }, { "epoch": 0.9959292710851037, "ewc_loss": 0.049216948449611664, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020530425535980612, "grad_norm": 5.832152366638184, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8574532866477966, "num_tokens": 298688014.0, "step": 7829 }, { "epoch": 0.9960564813636942, "ewc_loss": 0.049205146729946136, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020518623932730407, "grad_norm": 5.8629150390625, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8639036417007446, "num_tokens": 298727815.0, "step": 7830 }, { "epoch": 0.9961836916422847, "ewc_loss": 0.0490751713514328, "ewc_loss_diag": 2.86102294921875e-05, "ewc_loss_parallel": 0.00020510716421995312, "grad_norm": 5.8475165367126465, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8796232342720032, "num_tokens": 298765663.0, "step": 7831 }, { "epoch": 0.9963109019208752, "ewc_loss": 0.0491800457239151, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020493521878961474, "grad_norm": 5.899591445922852, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8642162084579468, "num_tokens": 298799360.0, "step": 7832 }, { "epoch": 0.9964381121994658, "ewc_loss": 0.04916306585073471, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002047653979388997, "grad_norm": 5.833085060119629, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8601906299591064, "num_tokens": 298835669.0, "step": 7833 }, { "epoch": 0.9965653224780562, "ewc_loss": 0.04917643964290619, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020489915914367884, "grad_norm": 5.9018144607543945, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8576220273971558, "num_tokens": 298866871.0, "step": 7834 }, { "epoch": 0.9966925327566467, "ewc_loss": 0.049131568521261215, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020445045083761215, "grad_norm": 5.82416296005249, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8519147634506226, "num_tokens": 298904136.0, "step": 7835 }, { "epoch": 0.9968197430352372, "ewc_loss": 0.04918655753135681, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002050003531621769, "grad_norm": 5.853996753692627, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8720973134040833, "num_tokens": 298937069.0, "step": 7836 }, { "epoch": 0.9969469533138278, "ewc_loss": 0.04919104278087616, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002050451876129955, "grad_norm": 5.863656997680664, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8533788919448853, "num_tokens": 298976778.0, "step": 7837 }, { "epoch": 0.9970741635924183, "ewc_loss": 0.049193866550922394, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020507341832853854, "grad_norm": 5.857112884521484, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8694429397583008, "num_tokens": 299010808.0, "step": 7838 }, { "epoch": 0.9972013738710088, "ewc_loss": 0.049202810972929, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020516288350336254, "grad_norm": 5.863629341125488, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8491785526275635, "num_tokens": 299047611.0, "step": 7839 }, { "epoch": 0.9973285841495992, "ewc_loss": 0.049206770956516266, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002052024647127837, "grad_norm": 5.855597972869873, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8656952381134033, "num_tokens": 299088668.0, "step": 7840 }, { "epoch": 0.9974557944281898, "ewc_loss": 0.049216128885746, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020529606263153255, "grad_norm": 5.857256889343262, "learning_rate": 1e-06, "loss": 0.5238, "mean_token_accuracy": 0.8401963114738464, "num_tokens": 299126572.0, "step": 7841 }, { "epoch": 0.9975830047067803, "ewc_loss": 0.04917972534894943, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002049320173682645, "grad_norm": 5.838727951049805, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8720299601554871, "num_tokens": 299164806.0, "step": 7842 }, { "epoch": 0.9977102149853708, "ewc_loss": 0.04920606315135956, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020519540703389794, "grad_norm": 5.810154914855957, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8564826250076294, "num_tokens": 299207039.0, "step": 7843 }, { "epoch": 0.9978374252639614, "ewc_loss": 0.04921693354845047, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020530412439256907, "grad_norm": 5.849133491516113, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8692812919616699, "num_tokens": 299243242.0, "step": 7844 }, { "epoch": 0.9979646355425519, "ewc_loss": 0.04924389719963074, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002055737131740898, "grad_norm": 5.847748279571533, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8662247657775879, "num_tokens": 299282326.0, "step": 7845 }, { "epoch": 0.9980918458211423, "ewc_loss": 0.04920356720685959, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020517042139545083, "grad_norm": 5.852190971374512, "learning_rate": 1e-06, "loss": 0.3664, "mean_token_accuracy": 0.8890668153762817, "num_tokens": 299316645.0, "step": 7846 }, { "epoch": 0.9982190560997328, "ewc_loss": 0.04913458973169327, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.000204480646061711, "grad_norm": 5.887043476104736, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8628910779953003, "num_tokens": 299349932.0, "step": 7847 }, { "epoch": 0.9983462663783234, "ewc_loss": 0.04923069849610329, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020544175640679896, "grad_norm": 5.889265537261963, "learning_rate": 1e-06, "loss": 0.5178, "mean_token_accuracy": 0.8393739461898804, "num_tokens": 299392452.0, "step": 7848 }, { "epoch": 0.9984734766569139, "ewc_loss": 0.049178920686244965, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002049239701591432, "grad_norm": 5.842020511627197, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8666785955429077, "num_tokens": 299431479.0, "step": 7849 }, { "epoch": 0.9986006869355044, "ewc_loss": 0.049205340445041656, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020518817473202944, "grad_norm": 5.869433879852295, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8576522469520569, "num_tokens": 299475585.0, "step": 7850 }, { "epoch": 0.9987278972140949, "ewc_loss": 0.049198903143405914, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020512378250714391, "grad_norm": 5.859828948974609, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.87611323595047, "num_tokens": 299514026.0, "step": 7851 }, { "epoch": 0.9988551074926854, "ewc_loss": 0.0491349995136261, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020448473514989018, "grad_norm": 5.892106533050537, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8499226570129395, "num_tokens": 299550858.0, "step": 7852 }, { "epoch": 0.9989823177712759, "ewc_loss": 0.049172841012477875, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020486320136114955, "grad_norm": 5.8741254806518555, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8538377285003662, "num_tokens": 299586831.0, "step": 7853 }, { "epoch": 0.9991095280498664, "ewc_loss": 0.049170203506946564, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020483678963501006, "grad_norm": 5.95889139175415, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8710256814956665, "num_tokens": 299623116.0, "step": 7854 }, { "epoch": 0.9992367383284569, "ewc_loss": 0.04915284365415573, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020466318528633565, "grad_norm": 5.881097316741943, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.863337516784668, "num_tokens": 299662165.0, "step": 7855 }, { "epoch": 0.9993639486070475, "ewc_loss": 0.049122825264930725, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020436299382708967, "grad_norm": 5.8933844566345215, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8577489852905273, "num_tokens": 299702581.0, "step": 7856 }, { "epoch": 0.999491158885638, "ewc_loss": 0.04915304481983185, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020466522255446762, "grad_norm": 5.890746593475342, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8468194603919983, "num_tokens": 299742559.0, "step": 7857 }, { "epoch": 0.9996183691642284, "ewc_loss": 0.049161531031131744, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020475007477216423, "grad_norm": 5.924749374389648, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.875360369682312, "num_tokens": 299778637.0, "step": 7858 }, { "epoch": 0.9997455794428189, "ewc_loss": 0.04909266531467438, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.000204061419935897, "grad_norm": 5.875918865203857, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8704904913902283, "num_tokens": 299812808.0, "step": 7859 }, { "epoch": 0.9998727897214095, "ewc_loss": 0.04913689196109772, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002045036671916023, "grad_norm": 5.92742919921875, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8577578067779541, "num_tokens": 299848987.0, "step": 7860 }, { "epoch": 1.0, "ewc_loss": 0.049079954624176025, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020393433806020766, "grad_norm": 5.898753643035889, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8434956073760986, "num_tokens": 299886286.0, "step": 7861 }, { "epoch": 1.0001272102785905, "ewc_loss": 0.049068741500377655, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020382220100145787, "grad_norm": 5.861715793609619, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.856979250907898, "num_tokens": 299925456.0, "step": 7862 }, { "epoch": 1.000254420557181, "ewc_loss": 0.04910333827137947, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020416814368218184, "grad_norm": 5.91588830947876, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8572303652763367, "num_tokens": 299965936.0, "step": 7863 }, { "epoch": 1.0003816308357716, "ewc_loss": 0.049133554100990295, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020447031420189887, "grad_norm": 5.909143924713135, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8739720582962036, "num_tokens": 300003181.0, "step": 7864 }, { "epoch": 1.0005088411143621, "ewc_loss": 0.04910298436880112, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020416459301486611, "grad_norm": 5.879015922546387, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8592112064361572, "num_tokens": 300040502.0, "step": 7865 }, { "epoch": 1.0006360513929526, "ewc_loss": 0.04908809810876846, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020401574147399515, "grad_norm": 5.905529975891113, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8769294023513794, "num_tokens": 300078795.0, "step": 7866 }, { "epoch": 1.0007632616715432, "ewc_loss": 0.04916226118803024, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020475737983360887, "grad_norm": 5.933679580688477, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8708310127258301, "num_tokens": 300115188.0, "step": 7867 }, { "epoch": 1.0008904719501335, "ewc_loss": 0.04907885566353798, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002039233222603798, "grad_norm": 5.834857940673828, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8638948202133179, "num_tokens": 300158583.0, "step": 7868 }, { "epoch": 1.001017682228724, "ewc_loss": 0.04918031394481659, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020493788179010153, "grad_norm": 6.014787197113037, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.866463303565979, "num_tokens": 300194415.0, "step": 7869 }, { "epoch": 1.0011448925073145, "ewc_loss": 0.04913307726383209, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020446554117370397, "grad_norm": 5.904092311859131, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8566740155220032, "num_tokens": 300235066.0, "step": 7870 }, { "epoch": 1.001272102785905, "ewc_loss": 0.04911373555660248, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020427214622031897, "grad_norm": 5.910978317260742, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8640758991241455, "num_tokens": 300272167.0, "step": 7871 }, { "epoch": 1.0013993130644956, "ewc_loss": 0.04906675964593887, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020380233763717115, "grad_norm": 5.959534645080566, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8531163334846497, "num_tokens": 300306032.0, "step": 7872 }, { "epoch": 1.0015265233430861, "ewc_loss": 0.049049012362957, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020362490613479167, "grad_norm": 5.912470817565918, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.859932541847229, "num_tokens": 300338433.0, "step": 7873 }, { "epoch": 1.0016537336216766, "ewc_loss": 0.04908115044236183, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020394627063069493, "grad_norm": 5.927628040313721, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8514341115951538, "num_tokens": 300376527.0, "step": 7874 }, { "epoch": 1.0017809439002672, "ewc_loss": 0.049094006419181824, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020407485135365278, "grad_norm": 5.87916898727417, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8793812990188599, "num_tokens": 300417309.0, "step": 7875 }, { "epoch": 1.0019081541788577, "ewc_loss": 0.04907675087451935, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002039022947428748, "grad_norm": 5.965091705322266, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.863432765007019, "num_tokens": 300459438.0, "step": 7876 }, { "epoch": 1.0020353644574482, "ewc_loss": 0.04908301681280136, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020396495528984815, "grad_norm": 5.8765997886657715, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8397602438926697, "num_tokens": 300501443.0, "step": 7877 }, { "epoch": 1.0021625747360388, "ewc_loss": 0.04913672059774399, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002045019791694358, "grad_norm": 5.981764793395996, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8704307079315186, "num_tokens": 300531918.0, "step": 7878 }, { "epoch": 1.0022897850146293, "ewc_loss": 0.04905601218342781, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002036948862951249, "grad_norm": 5.863528251647949, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8560875654220581, "num_tokens": 300575084.0, "step": 7879 }, { "epoch": 1.0024169952932196, "ewc_loss": 0.049132704734802246, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020446178677957505, "grad_norm": 6.007826328277588, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8619693517684937, "num_tokens": 300608737.0, "step": 7880 }, { "epoch": 1.0025442055718101, "ewc_loss": 0.04910917580127716, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020422652596607804, "grad_norm": 5.850431442260742, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8494973182678223, "num_tokens": 300647855.0, "step": 7881 }, { "epoch": 1.0026714158504006, "ewc_loss": 0.04917197674512863, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002048545575235039, "grad_norm": 5.915597915649414, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8618600964546204, "num_tokens": 300689763.0, "step": 7882 }, { "epoch": 1.0027986261289912, "ewc_loss": 0.04912463575601578, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020438109640963376, "grad_norm": 5.875196933746338, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8726073503494263, "num_tokens": 300725838.0, "step": 7883 }, { "epoch": 1.0029258364075817, "ewc_loss": 0.049304455518722534, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020495863282121718, "grad_norm": 9.383200645446777, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8678040504455566, "num_tokens": 300760423.0, "step": 7884 }, { "epoch": 1.0030530466861722, "ewc_loss": 0.05322393774986267, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002453741617500782, "grad_norm": 6.400051116943359, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.867951512336731, "num_tokens": 300796236.0, "step": 7885 }, { "epoch": 1.0031802569647628, "ewc_loss": 0.04827631264925003, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00019589791190810502, "grad_norm": 5.734708786010742, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8672226667404175, "num_tokens": 300835213.0, "step": 7886 }, { "epoch": 1.0033074672433533, "ewc_loss": 0.04983772709965706, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002115120441885665, "grad_norm": 6.13244104385376, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8693963289260864, "num_tokens": 300873363.0, "step": 7887 }, { "epoch": 1.0034346775219438, "ewc_loss": 0.049358807504177094, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.000206722819712013, "grad_norm": 5.9455885887146, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8575763702392578, "num_tokens": 300910233.0, "step": 7888 }, { "epoch": 1.0035618878005343, "ewc_loss": 0.049383074045181274, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002069655165541917, "grad_norm": 6.002673149108887, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8697978854179382, "num_tokens": 300946986.0, "step": 7889 }, { "epoch": 1.0036890980791249, "ewc_loss": 0.0492553636431694, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.000205688425921835, "grad_norm": 5.9162774085998535, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8638023138046265, "num_tokens": 300985750.0, "step": 7890 }, { "epoch": 1.0038163083577154, "ewc_loss": 0.04926705360412598, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020580532145686448, "grad_norm": 5.924901485443115, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8613866567611694, "num_tokens": 301032820.0, "step": 7891 }, { "epoch": 1.0039435186363057, "ewc_loss": 0.049284931272268295, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020598407718352973, "grad_norm": 5.949373722076416, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.851691722869873, "num_tokens": 301070825.0, "step": 7892 }, { "epoch": 1.0040707289148962, "ewc_loss": 0.049166902899742126, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020480378589127213, "grad_norm": 5.905764102935791, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8588672280311584, "num_tokens": 301110764.0, "step": 7893 }, { "epoch": 1.0041979391934868, "ewc_loss": 0.049236610531806946, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020550089539028704, "grad_norm": 5.93115234375, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8646533489227295, "num_tokens": 301152421.0, "step": 7894 }, { "epoch": 1.0043251494720773, "ewc_loss": 0.04916060343384743, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020474080520216376, "grad_norm": 5.900182247161865, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8631312847137451, "num_tokens": 301192780.0, "step": 7895 }, { "epoch": 1.0044523597506678, "ewc_loss": 0.049224480986595154, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020537959062494338, "grad_norm": 6.03313684463501, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8586479425430298, "num_tokens": 301228147.0, "step": 7896 }, { "epoch": 1.0045795700292584, "ewc_loss": 0.04914556443691254, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020459041115827858, "grad_norm": 5.90405797958374, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8715400099754333, "num_tokens": 301259568.0, "step": 7897 }, { "epoch": 1.0047067803078489, "ewc_loss": 0.0491781160235405, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020491592295002192, "grad_norm": 5.988809108734131, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8626989722251892, "num_tokens": 301296483.0, "step": 7898 }, { "epoch": 1.0048339905864394, "ewc_loss": 0.049138955771923065, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020452433091122657, "grad_norm": 5.8977556228637695, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8641217350959778, "num_tokens": 301337437.0, "step": 7899 }, { "epoch": 1.00496120086503, "ewc_loss": 0.049158304929733276, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020471781317610294, "grad_norm": 5.9991350173950195, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8620845079421997, "num_tokens": 301368777.0, "step": 7900 }, { "epoch": 1.0050884111436205, "ewc_loss": 0.04915819317102432, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020471667812671512, "grad_norm": 5.932465076446533, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8597455620765686, "num_tokens": 301404885.0, "step": 7901 }, { "epoch": 1.005215621422211, "ewc_loss": 0.04916592687368393, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020479403610806912, "grad_norm": 5.930006504058838, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8521141409873962, "num_tokens": 301445308.0, "step": 7902 }, { "epoch": 1.0053428317008015, "ewc_loss": 0.04912218824028969, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.0002043566491920501, "grad_norm": 5.930187702178955, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.860646665096283, "num_tokens": 301484845.0, "step": 7903 }, { "epoch": 1.0054700419793918, "ewc_loss": 0.04913433641195297, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020447811402846128, "grad_norm": 5.88435697555542, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.860543966293335, "num_tokens": 301524498.0, "step": 7904 }, { "epoch": 1.0055972522579824, "ewc_loss": 0.04932069778442383, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.0002051210612989962, "grad_norm": 5.901200771331787, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8675252199172974, "num_tokens": 301565522.0, "step": 7905 }, { "epoch": 1.0057244625365729, "ewc_loss": 0.04917287826538086, "ewc_loss_diag": 2.872943878173828e-05, "ewc_loss_parallel": 0.00020486356515903026, "grad_norm": 5.900145053863525, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.857403039932251, "num_tokens": 301603724.0, "step": 7906 }, { "epoch": 1.0058516728151634, "ewc_loss": 0.049323052167892456, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.0002051446062978357, "grad_norm": 6.005895137786865, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8596296906471252, "num_tokens": 301636650.0, "step": 7907 }, { "epoch": 1.005978883093754, "ewc_loss": 0.049339160323143005, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020530565234366804, "grad_norm": 5.904788970947266, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8520879745483398, "num_tokens": 301675328.0, "step": 7908 }, { "epoch": 1.0061060933723445, "ewc_loss": 0.04933205991983414, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020523466810118407, "grad_norm": 5.950097560882568, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8557190895080566, "num_tokens": 301710058.0, "step": 7909 }, { "epoch": 1.006233303650935, "ewc_loss": 0.049320921301841736, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020512327319011092, "grad_norm": 5.849965572357178, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8564863204956055, "num_tokens": 301753567.0, "step": 7910 }, { "epoch": 1.0063605139295255, "ewc_loss": 0.04937567934393883, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020567084720823914, "grad_norm": 9.34241771697998, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8689489960670471, "num_tokens": 301792895.0, "step": 7911 }, { "epoch": 1.006487724208116, "ewc_loss": 0.05358269065618515, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.0002465202414896339, "grad_norm": 6.422257423400879, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8746395111083984, "num_tokens": 301833001.0, "step": 7912 }, { "epoch": 1.0066149344867066, "ewc_loss": 0.0485854297876358, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00019654764037113637, "grad_norm": 5.750888824462891, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.865288257598877, "num_tokens": 301871623.0, "step": 7913 }, { "epoch": 1.006742144765297, "ewc_loss": 0.04994747042655945, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.000211388774914667, "grad_norm": 6.128913879394531, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8734515905380249, "num_tokens": 301903750.0, "step": 7914 }, { "epoch": 1.0068693550438876, "ewc_loss": 0.04960803687572479, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.0002079944097204134, "grad_norm": 5.840500354766846, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8631232976913452, "num_tokens": 301943079.0, "step": 7915 }, { "epoch": 1.0069965653224782, "ewc_loss": 0.049543872475624084, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020735280122607946, "grad_norm": 6.0257248878479, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8683327436447144, "num_tokens": 301980031.0, "step": 7916 }, { "epoch": 1.0071237756010685, "ewc_loss": 0.049612857401371, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020804266387131065, "grad_norm": 5.888277530670166, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8622496128082275, "num_tokens": 302020454.0, "step": 7917 }, { "epoch": 1.007250985879659, "ewc_loss": 0.0495728999376297, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.0002076430682791397, "grad_norm": 5.9561333656311035, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8567701578140259, "num_tokens": 302064391.0, "step": 7918 }, { "epoch": 1.0073781961582495, "ewc_loss": 0.04963619261980057, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020827598928008229, "grad_norm": 5.913792133331299, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.862338125705719, "num_tokens": 302101598.0, "step": 7919 }, { "epoch": 1.00750540643684, "ewc_loss": 0.04958433285355568, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020775738812517375, "grad_norm": 5.936538219451904, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8666912317276001, "num_tokens": 302138803.0, "step": 7920 }, { "epoch": 1.0076326167154306, "ewc_loss": 0.04958787187933922, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020779277838300914, "grad_norm": 5.923971652984619, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8602873682975769, "num_tokens": 302173682.0, "step": 7921 }, { "epoch": 1.0077598269940211, "ewc_loss": 0.04961242154240608, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.0002080382837448269, "grad_norm": 5.919399261474609, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8592004776000977, "num_tokens": 302207390.0, "step": 7922 }, { "epoch": 1.0078870372726116, "ewc_loss": 0.049560997635126114, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020752403361257166, "grad_norm": 5.960257053375244, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8672775626182556, "num_tokens": 302244440.0, "step": 7923 }, { "epoch": 1.0080142475512022, "ewc_loss": 0.04953497648239136, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020726384536828846, "grad_norm": 5.934086322784424, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8566082715988159, "num_tokens": 302281058.0, "step": 7924 }, { "epoch": 1.0081414578297927, "ewc_loss": 0.04958173632621765, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020773144206032157, "grad_norm": 5.899924278259277, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8782618641853333, "num_tokens": 302319586.0, "step": 7925 }, { "epoch": 1.0082686681083832, "ewc_loss": 0.049539484083652496, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020730889809783548, "grad_norm": 5.932143688201904, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8569802045822144, "num_tokens": 302361356.0, "step": 7926 }, { "epoch": 1.0083958783869738, "ewc_loss": 0.04956726357340813, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.000207586694159545, "grad_norm": 5.925848007202148, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8726656436920166, "num_tokens": 302399974.0, "step": 7927 }, { "epoch": 1.0085230886655643, "ewc_loss": 0.0495319589972496, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020723366469610482, "grad_norm": 5.913569450378418, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8656407594680786, "num_tokens": 302435709.0, "step": 7928 }, { "epoch": 1.0086502989441546, "ewc_loss": 0.04955337941646576, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020744785433635116, "grad_norm": 5.941542625427246, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8584293127059937, "num_tokens": 302474151.0, "step": 7929 }, { "epoch": 1.0087775092227451, "ewc_loss": 0.049562111496925354, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020753516582772136, "grad_norm": 5.932754993438721, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8652020692825317, "num_tokens": 302509877.0, "step": 7930 }, { "epoch": 1.0089047195013356, "ewc_loss": 0.04950425773859024, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.0002069566398859024, "grad_norm": 5.890595436096191, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8750364780426025, "num_tokens": 302547689.0, "step": 7931 }, { "epoch": 1.0090319297799262, "ewc_loss": 0.04952468350529671, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.000207160905119963, "grad_norm": 5.895013809204102, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8608405590057373, "num_tokens": 302590728.0, "step": 7932 }, { "epoch": 1.0091591400585167, "ewc_loss": 0.04948515072464943, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020676557323895395, "grad_norm": 5.9414963722229, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8398722410202026, "num_tokens": 302625709.0, "step": 7933 }, { "epoch": 1.0092863503371072, "ewc_loss": 0.04969248175621033, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020761816995218396, "grad_norm": 5.939395427703857, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8702928423881531, "num_tokens": 302662086.0, "step": 7934 }, { "epoch": 1.0094135606156978, "ewc_loss": 0.04949124902486801, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.0002068265457637608, "grad_norm": 5.857359886169434, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8799453377723694, "num_tokens": 302703680.0, "step": 7935 }, { "epoch": 1.0095407708942883, "ewc_loss": 0.04954749345779419, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020738902094308287, "grad_norm": 5.954464912414551, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8472150564193726, "num_tokens": 302740426.0, "step": 7936 }, { "epoch": 1.0096679811728788, "ewc_loss": 0.04950587451457977, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020697282161563635, "grad_norm": 5.845801830291748, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.874972939491272, "num_tokens": 302780778.0, "step": 7937 }, { "epoch": 1.0097951914514693, "ewc_loss": 0.049705713987350464, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020775051962118596, "grad_norm": 5.939991474151611, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8664514422416687, "num_tokens": 302816012.0, "step": 7938 }, { "epoch": 1.0099224017300599, "ewc_loss": 0.049696385860443115, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020765721274074167, "grad_norm": 5.911255836486816, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8413366675376892, "num_tokens": 302858325.0, "step": 7939 }, { "epoch": 1.0100496120086504, "ewc_loss": 0.049539096653461456, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020730504184029996, "grad_norm": 5.889192581176758, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.863994836807251, "num_tokens": 302898210.0, "step": 7940 }, { "epoch": 1.0101768222872407, "ewc_loss": 0.04965865612030029, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020850061264354736, "grad_norm": 5.944636344909668, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.862759530544281, "num_tokens": 302936072.0, "step": 7941 }, { "epoch": 1.0103040325658312, "ewc_loss": 0.049568064510822296, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.0002075946977129206, "grad_norm": 5.861670017242432, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.853978157043457, "num_tokens": 302978941.0, "step": 7942 }, { "epoch": 1.0104312428444218, "ewc_loss": 0.04969106614589691, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020760401093866676, "grad_norm": 5.961203575134277, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8546391725540161, "num_tokens": 303016664.0, "step": 7943 }, { "epoch": 1.0105584531230123, "ewc_loss": 0.049561139196157455, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.0002075254451483488, "grad_norm": 5.817009449005127, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8582431674003601, "num_tokens": 303063313.0, "step": 7944 }, { "epoch": 1.0106856634016028, "ewc_loss": 0.04956386983394623, "ewc_loss_diag": 2.8848648071289062e-05, "ewc_loss_parallel": 0.00020755277364514768, "grad_norm": 5.964027404785156, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8515480756759644, "num_tokens": 303103786.0, "step": 7945 }, { "epoch": 1.0108128736801933, "ewc_loss": 0.0497414730489254, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.0002081080892821774, "grad_norm": 5.9157233238220215, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8691143989562988, "num_tokens": 303136928.0, "step": 7946 }, { "epoch": 1.0109400839587839, "ewc_loss": 0.04967726767063141, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020746601512655616, "grad_norm": 5.891178607940674, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8701404333114624, "num_tokens": 303174854.0, "step": 7947 }, { "epoch": 1.0110672942373744, "ewc_loss": 0.04971848800778389, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020787824178114533, "grad_norm": 9.38224983215332, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8595860600471497, "num_tokens": 303214234.0, "step": 7948 }, { "epoch": 1.011194504515965, "ewc_loss": 0.05374370142817497, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00024690967984497547, "grad_norm": 6.3359599113464355, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8754370808601379, "num_tokens": 303251830.0, "step": 7949 }, { "epoch": 1.0113217147945555, "ewc_loss": 0.04895240068435669, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020021737145725638, "grad_norm": 5.82194185256958, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.852614164352417, "num_tokens": 303291564.0, "step": 7950 }, { "epoch": 1.011448925073146, "ewc_loss": 0.050358690321445465, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00021305953850969672, "grad_norm": 6.099468231201172, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8594156503677368, "num_tokens": 303328642.0, "step": 7951 }, { "epoch": 1.0115761353517365, "ewc_loss": 0.050026535987854004, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.0002097380202030763, "grad_norm": 5.8936004638671875, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8604708313941956, "num_tokens": 303373693.0, "step": 7952 }, { "epoch": 1.0117033456303268, "ewc_loss": 0.049833253026008606, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020902587857563049, "grad_norm": 6.070733547210693, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.866070032119751, "num_tokens": 303407124.0, "step": 7953 }, { "epoch": 1.0118305559089174, "ewc_loss": 0.04978358373045921, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020852919260505587, "grad_norm": 5.864519119262695, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8743877410888672, "num_tokens": 303441586.0, "step": 7954 }, { "epoch": 1.0119577661875079, "ewc_loss": 0.04984305799007416, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.000209123914828524, "grad_norm": 6.053918838500977, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8641659021377563, "num_tokens": 303473593.0, "step": 7955 }, { "epoch": 1.0120849764660984, "ewc_loss": 0.04997016862034798, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020917433721479028, "grad_norm": 5.94857931137085, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8592749834060669, "num_tokens": 303510239.0, "step": 7956 }, { "epoch": 1.012212186744689, "ewc_loss": 0.04973568022251129, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020805018721148372, "grad_norm": 5.973227024078369, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8611804842948914, "num_tokens": 303544709.0, "step": 7957 }, { "epoch": 1.0123393970232795, "ewc_loss": 0.049886345863342285, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.0002083361177938059, "grad_norm": 5.903562068939209, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.871799886226654, "num_tokens": 303581990.0, "step": 7958 }, { "epoch": 1.01246660730187, "ewc_loss": 0.049708280712366104, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020777617464773357, "grad_norm": 5.877313137054443, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8657221794128418, "num_tokens": 303625167.0, "step": 7959 }, { "epoch": 1.0125938175804605, "ewc_loss": 0.049870654940605164, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020817920449189842, "grad_norm": 5.9092326164245605, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8636504411697388, "num_tokens": 303664751.0, "step": 7960 }, { "epoch": 1.012721027859051, "ewc_loss": 0.04987906664609909, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.0002082633291138336, "grad_norm": 5.929784297943115, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8601009845733643, "num_tokens": 303702190.0, "step": 7961 }, { "epoch": 1.0128482381376416, "ewc_loss": 0.049947358667850494, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.0002089462213916704, "grad_norm": 5.855215549468994, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8763813972473145, "num_tokens": 303742872.0, "step": 7962 }, { "epoch": 1.012975448416232, "ewc_loss": 0.04987326264381409, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020820526697207242, "grad_norm": 5.947030067443848, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8553154468536377, "num_tokens": 303781265.0, "step": 7963 }, { "epoch": 1.0131026586948226, "ewc_loss": 0.04985702037811279, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020804286759812385, "grad_norm": 5.918160438537598, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8623659610748291, "num_tokens": 303820686.0, "step": 7964 }, { "epoch": 1.0132298689734132, "ewc_loss": 0.049900121986866, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020847386622335762, "grad_norm": 5.931455612182617, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8651326894760132, "num_tokens": 303863428.0, "step": 7965 }, { "epoch": 1.0133570792520035, "ewc_loss": 0.04986780881881714, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.0002081507263937965, "grad_norm": 5.880814075469971, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8589992523193359, "num_tokens": 303903847.0, "step": 7966 }, { "epoch": 1.013484289530594, "ewc_loss": 0.04990878701210022, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020856050832662731, "grad_norm": 5.973571300506592, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8542340993881226, "num_tokens": 303943083.0, "step": 7967 }, { "epoch": 1.0136114998091845, "ewc_loss": 0.04988224059343338, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020829503773711622, "grad_norm": 5.912689208984375, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8727394938468933, "num_tokens": 303978773.0, "step": 7968 }, { "epoch": 1.013738710087775, "ewc_loss": 0.0498223677277565, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020769635739270598, "grad_norm": 5.916646957397461, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8695405721664429, "num_tokens": 304017058.0, "step": 7969 }, { "epoch": 1.0138659203663656, "ewc_loss": 0.04988626390695572, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020833528833463788, "grad_norm": 5.91891622543335, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8536074757575989, "num_tokens": 304054104.0, "step": 7970 }, { "epoch": 1.013993130644956, "ewc_loss": 0.04983217269182205, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020779436454176903, "grad_norm": 5.918705940246582, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8774580955505371, "num_tokens": 304091100.0, "step": 7971 }, { "epoch": 1.0141203409235466, "ewc_loss": 0.04986366629600525, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.0002081093261949718, "grad_norm": 5.903810024261475, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8633549213409424, "num_tokens": 304127747.0, "step": 7972 }, { "epoch": 1.0142475512021372, "ewc_loss": 0.04973747208714485, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.0002080680860672146, "grad_norm": 5.889782905578613, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8632747530937195, "num_tokens": 304171089.0, "step": 7973 }, { "epoch": 1.0143747614807277, "ewc_loss": 0.049773208796978, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020842545200139284, "grad_norm": 5.977686882019043, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8530633449554443, "num_tokens": 304210207.0, "step": 7974 }, { "epoch": 1.0145019717593182, "ewc_loss": 0.04972762614488602, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020796962780877948, "grad_norm": 5.883064270019531, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8790422081947327, "num_tokens": 304249423.0, "step": 7975 }, { "epoch": 1.0146291820379088, "ewc_loss": 0.04971848800778389, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020787824178114533, "grad_norm": 5.917912483215332, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8672745227813721, "num_tokens": 304286872.0, "step": 7976 }, { "epoch": 1.0147563923164993, "ewc_loss": 0.04976901412010193, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020838348427787423, "grad_norm": 5.963385105133057, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8670345544815063, "num_tokens": 304323033.0, "step": 7977 }, { "epoch": 1.0148836025950896, "ewc_loss": 0.049751438200473785, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020820775534957647, "grad_norm": 5.903487682342529, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8660749197006226, "num_tokens": 304360096.0, "step": 7978 }, { "epoch": 1.0150108128736801, "ewc_loss": 0.049725551158189774, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020794887677766383, "grad_norm": 5.89401388168335, "learning_rate": 1e-06, "loss": 0.3801, "mean_token_accuracy": 0.8834809064865112, "num_tokens": 304396364.0, "step": 7979 }, { "epoch": 1.0151380231522706, "ewc_loss": 0.04977476969361305, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020844105165451765, "grad_norm": 5.962088584899902, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.86556077003479, "num_tokens": 304431583.0, "step": 7980 }, { "epoch": 1.0152652334308612, "ewc_loss": 0.0499047115445137, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.0002085197629639879, "grad_norm": 5.9350738525390625, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8576599955558777, "num_tokens": 304470531.0, "step": 7981 }, { "epoch": 1.0153924437094517, "ewc_loss": 0.04976347088813782, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020832805603276938, "grad_norm": 5.912865161895752, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8683121800422668, "num_tokens": 304505187.0, "step": 7982 }, { "epoch": 1.0155196539880422, "ewc_loss": 0.049764715135097504, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.0002083404833683744, "grad_norm": 5.926383018493652, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8407894372940063, "num_tokens": 304542685.0, "step": 7983 }, { "epoch": 1.0156468642666328, "ewc_loss": 0.04982141777873039, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.0002089075424009934, "grad_norm": 5.885733127593994, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.862619161605835, "num_tokens": 304579204.0, "step": 7984 }, { "epoch": 1.0157740745452233, "ewc_loss": 0.049801234155893326, "ewc_loss_diag": 2.8967857360839844e-05, "ewc_loss_parallel": 0.00020870569278486073, "grad_norm": 5.916832447052002, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8695598840713501, "num_tokens": 304615224.0, "step": 7985 }, { "epoch": 1.0159012848238138, "ewc_loss": 0.04991651326417923, "ewc_loss_diag": 2.9087066650390625e-05, "ewc_loss_parallel": 0.00020863777899648994, "grad_norm": 5.901581287384033, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8579819202423096, "num_tokens": 304653513.0, "step": 7986 }, { "epoch": 1.0160284951024043, "ewc_loss": 0.05000900477170944, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020834201131947339, "grad_norm": 5.927107334136963, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8682793378829956, "num_tokens": 304686744.0, "step": 7987 }, { "epoch": 1.0161557053809949, "ewc_loss": 0.05008995160460472, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020915147615596652, "grad_norm": 5.9448676109313965, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8622373342514038, "num_tokens": 304727262.0, "step": 7988 }, { "epoch": 1.0162829156595854, "ewc_loss": 0.05006885528564453, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020894053159281611, "grad_norm": 5.956918716430664, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8611722588539124, "num_tokens": 304763418.0, "step": 7989 }, { "epoch": 1.0164101259381757, "ewc_loss": 0.05005703493952751, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020882229728158563, "grad_norm": 5.932272434234619, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8660046458244324, "num_tokens": 304800763.0, "step": 7990 }, { "epoch": 1.0165373362167662, "ewc_loss": 0.049998968839645386, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020824161765631288, "grad_norm": 5.96028995513916, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8815154433250427, "num_tokens": 304833136.0, "step": 7991 }, { "epoch": 1.0166645464953568, "ewc_loss": 0.05000588297843933, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020831076835747808, "grad_norm": 5.922175407409668, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8714234828948975, "num_tokens": 304873208.0, "step": 7992 }, { "epoch": 1.0167917567739473, "ewc_loss": 0.050017599016427994, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002084279403788969, "grad_norm": 5.957822799682617, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8638452291488647, "num_tokens": 304914237.0, "step": 7993 }, { "epoch": 1.0169189670525378, "ewc_loss": 0.049956344068050385, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020781539205927402, "grad_norm": 5.902065277099609, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8785624504089355, "num_tokens": 304955453.0, "step": 7994 }, { "epoch": 1.0170461773311283, "ewc_loss": 0.04997668415307999, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020801879873033613, "grad_norm": 5.978997230529785, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8626154661178589, "num_tokens": 304990533.0, "step": 7995 }, { "epoch": 1.0171733876097189, "ewc_loss": 0.04998795688152313, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020813154696952552, "grad_norm": 5.988353252410889, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8619801998138428, "num_tokens": 305024849.0, "step": 7996 }, { "epoch": 1.0173005978883094, "ewc_loss": 0.05001344531774521, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020838642376475036, "grad_norm": 5.943181037902832, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8606710433959961, "num_tokens": 305061615.0, "step": 7997 }, { "epoch": 1.0174278081669, "ewc_loss": 0.04995130002498627, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002077649551210925, "grad_norm": 5.925120830535889, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8435147404670715, "num_tokens": 305102251.0, "step": 7998 }, { "epoch": 1.0175550184454905, "ewc_loss": 0.04993521422147751, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020760411280207336, "grad_norm": 6.030028343200684, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.872528076171875, "num_tokens": 305135405.0, "step": 7999 }, { "epoch": 1.017682228724081, "ewc_loss": 0.049992214888334274, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002081741113215685, "grad_norm": 5.936273574829102, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8622968792915344, "num_tokens": 305169005.0, "step": 8000 }, { "epoch": 1.0178094390026715, "ewc_loss": 0.04991259053349495, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002073778596241027, "grad_norm": 6.004144191741943, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8590140342712402, "num_tokens": 305209050.0, "step": 8001 }, { "epoch": 1.0179366492812618, "ewc_loss": 0.04996465519070625, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020789851259905845, "grad_norm": 6.001421928405762, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8644716143608093, "num_tokens": 305238250.0, "step": 8002 }, { "epoch": 1.0180638595598523, "ewc_loss": 0.04997171461582184, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020796910393983126, "grad_norm": 5.884369373321533, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.867358922958374, "num_tokens": 305283544.0, "step": 8003 }, { "epoch": 1.0181910698384429, "ewc_loss": 0.04995833337306976, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020783531363122165, "grad_norm": 5.979459762573242, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8455692529678345, "num_tokens": 305320548.0, "step": 8004 }, { "epoch": 1.0183182801170334, "ewc_loss": 0.05002976208925247, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002085495798382908, "grad_norm": 5.946019172668457, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8703690767288208, "num_tokens": 305359560.0, "step": 8005 }, { "epoch": 1.018445490395624, "ewc_loss": 0.04998549073934555, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002081068669212982, "grad_norm": 5.971632480621338, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8601028919219971, "num_tokens": 305396680.0, "step": 8006 }, { "epoch": 1.0185727006742145, "ewc_loss": 0.050029635429382324, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020854831382166594, "grad_norm": 5.916659832000732, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8739484548568726, "num_tokens": 305432856.0, "step": 8007 }, { "epoch": 1.018699910952805, "ewc_loss": 0.05001920461654663, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020844397658947855, "grad_norm": 5.9460883140563965, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8647550344467163, "num_tokens": 305469973.0, "step": 8008 }, { "epoch": 1.0188271212313955, "ewc_loss": 0.049983445554971695, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020808640692848712, "grad_norm": 5.948709487915039, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8686258792877197, "num_tokens": 305505576.0, "step": 8009 }, { "epoch": 1.018954331509986, "ewc_loss": 0.050005510449409485, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020830705761909485, "grad_norm": 5.9623847007751465, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8442753553390503, "num_tokens": 305542037.0, "step": 8010 }, { "epoch": 1.0190815417885766, "ewc_loss": 0.050043705850839615, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020868901629000902, "grad_norm": 5.895440101623535, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.871103823184967, "num_tokens": 305587784.0, "step": 8011 }, { "epoch": 1.019208752067167, "ewc_loss": 0.0500313863158226, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002085657906718552, "grad_norm": 5.95463752746582, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8566535711288452, "num_tokens": 305621888.0, "step": 8012 }, { "epoch": 1.0193359623457576, "ewc_loss": 0.05003643035888672, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002086162567138672, "grad_norm": 5.826420783996582, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8822351098060608, "num_tokens": 305663844.0, "step": 8013 }, { "epoch": 1.0194631726243482, "ewc_loss": 0.050059251487255096, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020884447440039366, "grad_norm": 5.983952522277832, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8779513835906982, "num_tokens": 305695913.0, "step": 8014 }, { "epoch": 1.0195903829029385, "ewc_loss": 0.05005357414484024, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020878767827525735, "grad_norm": 5.948617935180664, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8612973690032959, "num_tokens": 305733632.0, "step": 8015 }, { "epoch": 1.019717593181529, "ewc_loss": 0.050051093101501465, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020876286725979298, "grad_norm": 5.97782039642334, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8474191427230835, "num_tokens": 305766917.0, "step": 8016 }, { "epoch": 1.0198448034601195, "ewc_loss": 0.04998067393898964, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002080587000818923, "grad_norm": 5.870533466339111, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8584539294242859, "num_tokens": 305805768.0, "step": 8017 }, { "epoch": 1.01997201373871, "ewc_loss": 0.05009782686829567, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020923023112118244, "grad_norm": 5.960071563720703, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.86586594581604, "num_tokens": 305843412.0, "step": 8018 }, { "epoch": 1.0200992240173006, "ewc_loss": 0.05003156512975693, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020856762421317399, "grad_norm": 5.866039276123047, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8780900835990906, "num_tokens": 305880523.0, "step": 8019 }, { "epoch": 1.020226434295891, "ewc_loss": 0.05009174346923828, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002091693750116974, "grad_norm": 6.035818099975586, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8638834953308105, "num_tokens": 305915379.0, "step": 8020 }, { "epoch": 1.0203536445744816, "ewc_loss": 0.05011117458343506, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020936370128765702, "grad_norm": 5.919312477111816, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8778362274169922, "num_tokens": 305952471.0, "step": 8021 }, { "epoch": 1.0204808548530722, "ewc_loss": 0.05005333572626114, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020878533541690558, "grad_norm": 5.906803607940674, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8630402088165283, "num_tokens": 305994023.0, "step": 8022 }, { "epoch": 1.0206080651316627, "ewc_loss": 0.05005232244729996, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002087751927319914, "grad_norm": 5.933366298675537, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8688168525695801, "num_tokens": 306026563.0, "step": 8023 }, { "epoch": 1.0207352754102532, "ewc_loss": 0.05005611479282379, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020881308591924608, "grad_norm": 5.874240875244141, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8585407733917236, "num_tokens": 306067913.0, "step": 8024 }, { "epoch": 1.0208624856888437, "ewc_loss": 0.05009818822145462, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002092338545480743, "grad_norm": 5.932721138000488, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8618906736373901, "num_tokens": 306108692.0, "step": 8025 }, { "epoch": 1.0209896959674343, "ewc_loss": 0.050140105187892914, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002096530224662274, "grad_norm": 5.937569618225098, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8634449243545532, "num_tokens": 306148082.0, "step": 8026 }, { "epoch": 1.0211169062460246, "ewc_loss": 0.05008760094642639, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020912798936478794, "grad_norm": 5.925638198852539, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8702552914619446, "num_tokens": 306182800.0, "step": 8027 }, { "epoch": 1.021244116524615, "ewc_loss": 0.050125602632761, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020950798352714628, "grad_norm": 5.8867926597595215, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8612753748893738, "num_tokens": 306226741.0, "step": 8028 }, { "epoch": 1.0213713268032056, "ewc_loss": 0.050123006105422974, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002094820374622941, "grad_norm": 5.985129356384277, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8621405363082886, "num_tokens": 306265993.0, "step": 8029 }, { "epoch": 1.0214985370817962, "ewc_loss": 0.050104010850191116, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020929206220898777, "grad_norm": 5.915905952453613, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8664048910140991, "num_tokens": 306306511.0, "step": 8030 }, { "epoch": 1.0216257473603867, "ewc_loss": 0.05011875182390213, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020943947311025113, "grad_norm": 5.971217155456543, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8787699937820435, "num_tokens": 306337687.0, "step": 8031 }, { "epoch": 1.0217529576389772, "ewc_loss": 0.050111934542655945, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020937129738740623, "grad_norm": 5.931710720062256, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8291892409324646, "num_tokens": 306379646.0, "step": 8032 }, { "epoch": 1.0218801679175677, "ewc_loss": 0.050107963383197784, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020933159976266325, "grad_norm": 5.935419082641602, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8636658191680908, "num_tokens": 306415389.0, "step": 8033 }, { "epoch": 1.0220073781961583, "ewc_loss": 0.05019696056842804, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00021022153669036925, "grad_norm": 6.004403114318848, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8590601086616516, "num_tokens": 306452035.0, "step": 8034 }, { "epoch": 1.0221345884747488, "ewc_loss": 0.05008436739444733, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020909564045723528, "grad_norm": 5.886855125427246, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8627297878265381, "num_tokens": 306496277.0, "step": 8035 }, { "epoch": 1.0222617987533393, "ewc_loss": 0.0500822551548481, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002090745110763237, "grad_norm": 5.984131336212158, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8688077330589294, "num_tokens": 306528212.0, "step": 8036 }, { "epoch": 1.0223890090319299, "ewc_loss": 0.05015883594751358, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002098403056152165, "grad_norm": 6.005263805389404, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8672524690628052, "num_tokens": 306562047.0, "step": 8037 }, { "epoch": 1.0225162193105204, "ewc_loss": 0.05009075999259949, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020915953791700304, "grad_norm": 6.055276870727539, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8535103797912598, "num_tokens": 306601298.0, "step": 8038 }, { "epoch": 1.0226434295891107, "ewc_loss": 0.050034672021865845, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020859864889644086, "grad_norm": 5.874206066131592, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8634605407714844, "num_tokens": 306640961.0, "step": 8039 }, { "epoch": 1.0227706398677012, "ewc_loss": 0.05006055533885956, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002088575274683535, "grad_norm": 5.94075345993042, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.881461501121521, "num_tokens": 306679507.0, "step": 8040 }, { "epoch": 1.0228978501462918, "ewc_loss": 0.05010834336280823, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002093353687087074, "grad_norm": 5.951140880584717, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8537408709526062, "num_tokens": 306717337.0, "step": 8041 }, { "epoch": 1.0230250604248823, "ewc_loss": 0.05008057504892349, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020905770361423492, "grad_norm": 5.960644721984863, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8375449180603027, "num_tokens": 306754152.0, "step": 8042 }, { "epoch": 1.0231522707034728, "ewc_loss": 0.05014895647764206, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.0002097415126627311, "grad_norm": 5.967195987701416, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8560141324996948, "num_tokens": 306794887.0, "step": 8043 }, { "epoch": 1.0232794809820633, "ewc_loss": 0.05010874941945076, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020933944324497133, "grad_norm": 5.9703521728515625, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8661330938339233, "num_tokens": 306828374.0, "step": 8044 }, { "epoch": 1.0234066912606539, "ewc_loss": 0.05012780800461769, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020953002967871726, "grad_norm": 5.933568477630615, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8673553466796875, "num_tokens": 306865285.0, "step": 8045 }, { "epoch": 1.0235339015392444, "ewc_loss": 0.05010228604078293, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020927480363752693, "grad_norm": 5.973707675933838, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8689734935760498, "num_tokens": 306897195.0, "step": 8046 }, { "epoch": 1.023661111817835, "ewc_loss": 0.05014631152153015, "ewc_loss_diag": 2.9206275939941406e-05, "ewc_loss_parallel": 0.00020971507183276117, "grad_norm": 5.940145015716553, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8629840016365051, "num_tokens": 306935352.0, "step": 8047 }, { "epoch": 1.0237883220964255, "ewc_loss": 0.05017152428627014, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020874649635516107, "grad_norm": 5.957478046417236, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8686248064041138, "num_tokens": 306964621.0, "step": 8048 }, { "epoch": 1.023915532375016, "ewc_loss": 0.050247110426425934, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020950236648786813, "grad_norm": 5.931519508361816, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8618059158325195, "num_tokens": 307003451.0, "step": 8049 }, { "epoch": 1.0240427426536065, "ewc_loss": 0.05022931098937988, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020932436746079475, "grad_norm": 5.920681953430176, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8633817434310913, "num_tokens": 307044695.0, "step": 8050 }, { "epoch": 1.0241699529321968, "ewc_loss": 0.0502566322684288, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.0002095975651172921, "grad_norm": 5.8935227394104, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8698229193687439, "num_tokens": 307080603.0, "step": 8051 }, { "epoch": 1.0242971632107873, "ewc_loss": 0.050258245319128036, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020961370319128036, "grad_norm": 5.925121784210205, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8564372658729553, "num_tokens": 307122300.0, "step": 8052 }, { "epoch": 1.0244243734893779, "ewc_loss": 0.050268154591321945, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020971280173398554, "grad_norm": 5.943155765533447, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8695233464241028, "num_tokens": 307158685.0, "step": 8053 }, { "epoch": 1.0245515837679684, "ewc_loss": 0.05029093101620674, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.0002099405537592247, "grad_norm": 5.899877071380615, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.868310809135437, "num_tokens": 307199104.0, "step": 8054 }, { "epoch": 1.024678794046559, "ewc_loss": 0.050273992121219635, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.0002097711549140513, "grad_norm": 5.987033367156982, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8576999306678772, "num_tokens": 307235840.0, "step": 8055 }, { "epoch": 1.0248060043251495, "ewc_loss": 0.05026254802942276, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020965674775652587, "grad_norm": 5.8850531578063965, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8524191379547119, "num_tokens": 307273089.0, "step": 8056 }, { "epoch": 1.02493321460374, "ewc_loss": 0.05032722279429436, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00021030347852502018, "grad_norm": 5.967485427856445, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8638628721237183, "num_tokens": 307318825.0, "step": 8057 }, { "epoch": 1.0250604248823305, "ewc_loss": 0.0504658967256546, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021046950132586062, "grad_norm": 5.948662757873535, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8700573444366455, "num_tokens": 307357006.0, "step": 8058 }, { "epoch": 1.025187635160921, "ewc_loss": 0.05024876445531845, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020951889746356755, "grad_norm": 5.97937536239624, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8611594438552856, "num_tokens": 307390274.0, "step": 8059 }, { "epoch": 1.0253148454395116, "ewc_loss": 0.05030461400747299, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00021007739997003227, "grad_norm": 5.941005229949951, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8713011741638184, "num_tokens": 307427157.0, "step": 8060 }, { "epoch": 1.025442055718102, "ewc_loss": 0.05021284520626068, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.000209159727091901, "grad_norm": 5.915989398956299, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8589165210723877, "num_tokens": 307466555.0, "step": 8061 }, { "epoch": 1.0255692659966926, "ewc_loss": 0.05027896910905838, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020982093701604754, "grad_norm": 6.016040802001953, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8553157448768616, "num_tokens": 307503375.0, "step": 8062 }, { "epoch": 1.0256964762752832, "ewc_loss": 0.05023563280701637, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020938758098054677, "grad_norm": 5.911919116973877, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8727871775627136, "num_tokens": 307543170.0, "step": 8063 }, { "epoch": 1.0258236865538735, "ewc_loss": 0.05024928227066994, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020952406339347363, "grad_norm": 5.958267688751221, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8597801923751831, "num_tokens": 307582890.0, "step": 8064 }, { "epoch": 1.025950896832464, "ewc_loss": 0.0502873919904232, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020990516350138932, "grad_norm": 5.984820365905762, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8557197451591492, "num_tokens": 307617643.0, "step": 8065 }, { "epoch": 1.0260781071110545, "ewc_loss": 0.050225771963596344, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020928896265104413, "grad_norm": 5.947316646575928, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8607171773910522, "num_tokens": 307652258.0, "step": 8066 }, { "epoch": 1.026205317389645, "ewc_loss": 0.05027419328689575, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.0002097731630783528, "grad_norm": 5.948070049285889, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8597517013549805, "num_tokens": 307693245.0, "step": 8067 }, { "epoch": 1.0263325276682356, "ewc_loss": 0.05025182291865349, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020954948558937758, "grad_norm": 5.925491809844971, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8655949831008911, "num_tokens": 307728751.0, "step": 8068 }, { "epoch": 1.026459737946826, "ewc_loss": 0.050282493233680725, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00020985619630664587, "grad_norm": 5.97744607925415, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8524316549301147, "num_tokens": 307767869.0, "step": 8069 }, { "epoch": 1.0265869482254166, "ewc_loss": 0.050522394478321075, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00020981380657758564, "grad_norm": 5.888460159301758, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8726440072059631, "num_tokens": 307809438.0, "step": 8070 }, { "epoch": 1.0267141585040072, "ewc_loss": 0.05038101226091385, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020962067355867475, "grad_norm": 5.952829360961914, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8437505960464478, "num_tokens": 307848752.0, "step": 8071 }, { "epoch": 1.0268413687825977, "ewc_loss": 0.05045654997229576, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021037604892626405, "grad_norm": 5.994511604309082, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8580297231674194, "num_tokens": 307886080.0, "step": 8072 }, { "epoch": 1.0269685790611882, "ewc_loss": 0.050426334142684937, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021007386385463178, "grad_norm": 5.917911529541016, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8759770393371582, "num_tokens": 307922854.0, "step": 8073 }, { "epoch": 1.0270957893397787, "ewc_loss": 0.05042421072721481, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002100526326103136, "grad_norm": 5.94221305847168, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8602737784385681, "num_tokens": 307960032.0, "step": 8074 }, { "epoch": 1.0272229996183693, "ewc_loss": 0.05044381320476532, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021024869056418538, "grad_norm": 6.043178558349609, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8617254495620728, "num_tokens": 307996074.0, "step": 8075 }, { "epoch": 1.0273502098969596, "ewc_loss": 0.05039443075656891, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020975485676899552, "grad_norm": 5.931058406829834, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.876036524772644, "num_tokens": 308030833.0, "step": 8076 }, { "epoch": 1.02747742017555, "ewc_loss": 0.050396569073200226, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020977621898055077, "grad_norm": 5.920863151550293, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8722028732299805, "num_tokens": 308079620.0, "step": 8077 }, { "epoch": 1.0276046304541406, "ewc_loss": 0.05037657916545868, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002095763193210587, "grad_norm": 5.9473042488098145, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8566991090774536, "num_tokens": 308118555.0, "step": 8078 }, { "epoch": 1.0277318407327312, "ewc_loss": 0.050466299057006836, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002104735467582941, "grad_norm": 6.031434535980225, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8472033143043518, "num_tokens": 308154410.0, "step": 8079 }, { "epoch": 1.0278590510113217, "ewc_loss": 0.05036746710538864, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020948522433172911, "grad_norm": 5.948115348815918, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8651272058486938, "num_tokens": 308191990.0, "step": 8080 }, { "epoch": 1.0279862612899122, "ewc_loss": 0.05043863505125046, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021019691484980285, "grad_norm": 5.98445463180542, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.843314528465271, "num_tokens": 308231047.0, "step": 8081 }, { "epoch": 1.0281134715685027, "ewc_loss": 0.05069423466920853, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.00021031151118222624, "grad_norm": 5.971236228942871, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8780218958854675, "num_tokens": 308269487.0, "step": 8082 }, { "epoch": 1.0282406818470933, "ewc_loss": 0.0504181794822216, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020999234402552247, "grad_norm": 5.965763092041016, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8629531860351562, "num_tokens": 308304974.0, "step": 8083 }, { "epoch": 1.0283678921256838, "ewc_loss": 0.050363894551992416, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020944948482792825, "grad_norm": 5.936224937438965, "learning_rate": 1e-06, "loss": 0.3838, "mean_token_accuracy": 0.883285403251648, "num_tokens": 308342072.0, "step": 8084 }, { "epoch": 1.0284951024042743, "ewc_loss": 0.050470057874917984, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021051113435532898, "grad_norm": 6.020407676696777, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8596018552780151, "num_tokens": 308374293.0, "step": 8085 }, { "epoch": 1.0286223126828649, "ewc_loss": 0.0504198893904686, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021000945707783103, "grad_norm": 5.937511444091797, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8665603399276733, "num_tokens": 308415872.0, "step": 8086 }, { "epoch": 1.0287495229614554, "ewc_loss": 0.050431057810783386, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021012109937146306, "grad_norm": 5.938223838806152, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8569869995117188, "num_tokens": 308451198.0, "step": 8087 }, { "epoch": 1.0288767332400457, "ewc_loss": 0.050495363771915436, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002107642067130655, "grad_norm": 5.9594197273254395, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8709504008293152, "num_tokens": 308490554.0, "step": 8088 }, { "epoch": 1.0290039435186362, "ewc_loss": 0.05046190321445465, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021042959997430444, "grad_norm": 5.970229148864746, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8577412366867065, "num_tokens": 308530221.0, "step": 8089 }, { "epoch": 1.0291311537972267, "ewc_loss": 0.05048729479312897, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021068348723929375, "grad_norm": 6.005160331726074, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8482086658477783, "num_tokens": 308563634.0, "step": 8090 }, { "epoch": 1.0292583640758173, "ewc_loss": 0.05039696767926216, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020978022075723857, "grad_norm": 5.906437397003174, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8629688024520874, "num_tokens": 308605033.0, "step": 8091 }, { "epoch": 1.0293855743544078, "ewc_loss": 0.05047526955604553, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.000210563259315677, "grad_norm": 5.99807071685791, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.864189088344574, "num_tokens": 308642603.0, "step": 8092 }, { "epoch": 1.0295127846329983, "ewc_loss": 0.050424814224243164, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021005867165513337, "grad_norm": 5.8894548416137695, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8604234457015991, "num_tokens": 308685189.0, "step": 8093 }, { "epoch": 1.0296399949115889, "ewc_loss": 0.05051034688949585, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002109140041284263, "grad_norm": 5.980458736419678, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.863758385181427, "num_tokens": 308726052.0, "step": 8094 }, { "epoch": 1.0297672051901794, "ewc_loss": 0.05047527700662613, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002105633175233379, "grad_norm": 5.977358341217041, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8500919342041016, "num_tokens": 308764828.0, "step": 8095 }, { "epoch": 1.02989441546877, "ewc_loss": 0.050467394292354584, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002104844752466306, "grad_norm": 5.979236125946045, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.873288631439209, "num_tokens": 308799712.0, "step": 8096 }, { "epoch": 1.0300216257473604, "ewc_loss": 0.0504608228802681, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002104187587974593, "grad_norm": 5.931797981262207, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8627593517303467, "num_tokens": 308839879.0, "step": 8097 }, { "epoch": 1.030148836025951, "ewc_loss": 0.050625599920749664, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021084582840558141, "grad_norm": 5.989504337310791, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8555346131324768, "num_tokens": 308880362.0, "step": 8098 }, { "epoch": 1.0302760463045415, "ewc_loss": 0.05045842379331589, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002103948063449934, "grad_norm": 6.037126541137695, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8725310564041138, "num_tokens": 308921395.0, "step": 8099 }, { "epoch": 1.0304032565831318, "ewc_loss": 0.05039273202419281, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020973788923583925, "grad_norm": 5.968156814575195, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8717938661575317, "num_tokens": 308959976.0, "step": 8100 }, { "epoch": 1.0305304668617223, "ewc_loss": 0.050416000187397, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020997054525651038, "grad_norm": 5.963964462280273, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8752118349075317, "num_tokens": 309000702.0, "step": 8101 }, { "epoch": 1.0306576771403129, "ewc_loss": 0.050372034311294556, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020953090279363096, "grad_norm": 5.9385294914245605, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8823304176330566, "num_tokens": 309036540.0, "step": 8102 }, { "epoch": 1.0307848874189034, "ewc_loss": 0.05038672313094139, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020967777527403086, "grad_norm": 5.985005855560303, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8616415858268738, "num_tokens": 309072408.0, "step": 8103 }, { "epoch": 1.030912097697494, "ewc_loss": 0.05041342228651047, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020994477381464094, "grad_norm": 6.037296772003174, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8488616347312927, "num_tokens": 309110366.0, "step": 8104 }, { "epoch": 1.0310393079760845, "ewc_loss": 0.05037796497344971, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020959018729627132, "grad_norm": 5.8961405754089355, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8658825755119324, "num_tokens": 309155194.0, "step": 8105 }, { "epoch": 1.031166518254675, "ewc_loss": 0.05035966634750366, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020940722606610507, "grad_norm": 5.985982894897461, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8717272877693176, "num_tokens": 309193747.0, "step": 8106 }, { "epoch": 1.0312937285332655, "ewc_loss": 0.05039115622639656, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020972210040781647, "grad_norm": 5.903001308441162, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8510552644729614, "num_tokens": 309234161.0, "step": 8107 }, { "epoch": 1.031420938811856, "ewc_loss": 0.0503121092915535, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00021015234233345836, "grad_norm": 6.002081871032715, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8525635004043579, "num_tokens": 309272574.0, "step": 8108 }, { "epoch": 1.0315481490904466, "ewc_loss": 0.050263963639736176, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.0002096708631142974, "grad_norm": 5.963565349578857, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8662985563278198, "num_tokens": 309306567.0, "step": 8109 }, { "epoch": 1.031675359369037, "ewc_loss": 0.05035196989774704, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.0002105509483953938, "grad_norm": 5.974902629852295, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8684215545654297, "num_tokens": 309347218.0, "step": 8110 }, { "epoch": 1.0318025696476276, "ewc_loss": 0.05041208863258362, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00020993145881220698, "grad_norm": 5.90713357925415, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8768309354782104, "num_tokens": 309385701.0, "step": 8111 }, { "epoch": 1.0319297799262181, "ewc_loss": 0.05034216493368149, "ewc_loss_diag": 2.9325485229492188e-05, "ewc_loss_parallel": 0.00021045288303866982, "grad_norm": 5.964075088500977, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8610342741012573, "num_tokens": 309424600.0, "step": 8112 }, { "epoch": 1.0320569902048085, "ewc_loss": 0.05047603324055672, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021057088451925665, "grad_norm": 5.964747905731201, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8432974815368652, "num_tokens": 309459929.0, "step": 8113 }, { "epoch": 1.032184200483399, "ewc_loss": 0.05048191547393799, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021062971791252494, "grad_norm": 5.951976299285889, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8550223708152771, "num_tokens": 309505085.0, "step": 8114 }, { "epoch": 1.0323114107619895, "ewc_loss": 0.050419941544532776, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021000996639486402, "grad_norm": 5.96657657623291, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8659883737564087, "num_tokens": 309544746.0, "step": 8115 }, { "epoch": 1.03243862104058, "ewc_loss": 0.05047949403524399, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002106054889736697, "grad_norm": 5.965700149536133, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.850111722946167, "num_tokens": 309584301.0, "step": 8116 }, { "epoch": 1.0325658313191706, "ewc_loss": 0.05049930140376091, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021080356964375824, "grad_norm": 5.975492477416992, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8663619160652161, "num_tokens": 309622570.0, "step": 8117 }, { "epoch": 1.032693041597761, "ewc_loss": 0.050476446747779846, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021057500271126628, "grad_norm": 5.927180767059326, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8475857973098755, "num_tokens": 309666417.0, "step": 8118 }, { "epoch": 1.0328202518763516, "ewc_loss": 0.050539322197437286, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021120376186445355, "grad_norm": 5.979310512542725, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8533096313476562, "num_tokens": 309705706.0, "step": 8119 }, { "epoch": 1.0329474621549422, "ewc_loss": 0.05062130093574524, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021080285659991205, "grad_norm": 5.928391933441162, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8719128370285034, "num_tokens": 309744288.0, "step": 8120 }, { "epoch": 1.0330746724335327, "ewc_loss": 0.05050383135676384, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021084885520394892, "grad_norm": 6.014017105102539, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8503114581108093, "num_tokens": 309780406.0, "step": 8121 }, { "epoch": 1.0332018827121232, "ewc_loss": 0.05044884979724884, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.000210299069294706, "grad_norm": 5.993434906005859, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8311095237731934, "num_tokens": 309811135.0, "step": 8122 }, { "epoch": 1.0333290929907137, "ewc_loss": 0.05046231299638748, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002104336890624836, "grad_norm": 5.937802791595459, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8666882514953613, "num_tokens": 309847037.0, "step": 8123 }, { "epoch": 1.0334563032693043, "ewc_loss": 0.0505194328725338, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021100486628711224, "grad_norm": 5.969780921936035, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8636189699172974, "num_tokens": 309884274.0, "step": 8124 }, { "epoch": 1.0335835135478946, "ewc_loss": 0.05054159462451935, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.0002112265065079555, "grad_norm": 5.974473476409912, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8758445382118225, "num_tokens": 309915572.0, "step": 8125 }, { "epoch": 1.033710723826485, "ewc_loss": 0.05056729540228844, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021148349333088845, "grad_norm": 5.904720783233643, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8675829172134399, "num_tokens": 309954502.0, "step": 8126 }, { "epoch": 1.0338379341050756, "ewc_loss": 0.050553373992443085, "ewc_loss_diag": 2.944469451904297e-05, "ewc_loss_parallel": 0.00021134430426172912, "grad_norm": 5.927373886108398, "learning_rate": 1e-06, "loss": 0.3823, "mean_token_accuracy": 0.8844445943832397, "num_tokens": 309993015.0, "step": 8127 }, { "epoch": 1.0339651443836662, "ewc_loss": 0.05064549297094345, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021104478219058365, "grad_norm": 5.9591898918151855, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8528594970703125, "num_tokens": 310030840.0, "step": 8128 }, { "epoch": 1.0340923546622567, "ewc_loss": 0.05075937137007713, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021218355686869472, "grad_norm": 5.967993259429932, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.864260196685791, "num_tokens": 310067586.0, "step": 8129 }, { "epoch": 1.0342195649408472, "ewc_loss": 0.05070626735687256, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021165251382626593, "grad_norm": 5.921411991119385, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8716806173324585, "num_tokens": 310104496.0, "step": 8130 }, { "epoch": 1.0343467752194377, "ewc_loss": 0.0507364422082901, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021195427689235657, "grad_norm": 5.955053329467773, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8462629318237305, "num_tokens": 310141162.0, "step": 8131 }, { "epoch": 1.0344739854980283, "ewc_loss": 0.05073954910039902, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021198534523136914, "grad_norm": 5.929055213928223, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.861508846282959, "num_tokens": 310182965.0, "step": 8132 }, { "epoch": 1.0346011957766188, "ewc_loss": 0.05071219801902771, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021171182743273675, "grad_norm": 5.962056636810303, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8457856774330139, "num_tokens": 310219572.0, "step": 8133 }, { "epoch": 1.0347284060552093, "ewc_loss": 0.05075792595744133, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021216910681687295, "grad_norm": 6.051763534545898, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8510406017303467, "num_tokens": 310248711.0, "step": 8134 }, { "epoch": 1.0348556163337999, "ewc_loss": 0.050790101289749146, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021249084966257215, "grad_norm": 5.927243709564209, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8472871780395508, "num_tokens": 310287915.0, "step": 8135 }, { "epoch": 1.0349828266123904, "ewc_loss": 0.050769127905368805, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021228109835647047, "grad_norm": 6.034677028656006, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8608165383338928, "num_tokens": 310325619.0, "step": 8136 }, { "epoch": 1.0351100368909807, "ewc_loss": 0.050711601972579956, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021170589025132358, "grad_norm": 5.933035850524902, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8695072531700134, "num_tokens": 310362456.0, "step": 8137 }, { "epoch": 1.0352372471695712, "ewc_loss": 0.05073331296443939, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021192296117078513, "grad_norm": 5.986627578735352, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8576344847679138, "num_tokens": 310399584.0, "step": 8138 }, { "epoch": 1.0353644574481617, "ewc_loss": 0.05070115998387337, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.0002116014511557296, "grad_norm": 6.06095552444458, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8572544455528259, "num_tokens": 310437435.0, "step": 8139 }, { "epoch": 1.0354916677267523, "ewc_loss": 0.05069391429424286, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021152899716980755, "grad_norm": 5.925057411193848, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8683230876922607, "num_tokens": 310476320.0, "step": 8140 }, { "epoch": 1.0356188780053428, "ewc_loss": 0.05069759488105774, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.0002115657989634201, "grad_norm": 5.9411444664001465, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8777188062667847, "num_tokens": 310512956.0, "step": 8141 }, { "epoch": 1.0357460882839333, "ewc_loss": 0.050720468163490295, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021179452596697956, "grad_norm": 6.023556232452393, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8600186109542847, "num_tokens": 310547605.0, "step": 8142 }, { "epoch": 1.0358732985625239, "ewc_loss": 0.05068618059158325, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.00021145165374036878, "grad_norm": 5.908532619476318, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8609632253646851, "num_tokens": 310586991.0, "step": 8143 }, { "epoch": 1.0360005088411144, "ewc_loss": 0.05077923834323883, "ewc_loss_diag": 2.956390380859375e-05, "ewc_loss_parallel": 0.0002123822196153924, "grad_norm": 5.962270259857178, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8617129325866699, "num_tokens": 310621271.0, "step": 8144 }, { "epoch": 1.036127719119705, "ewc_loss": 0.0509587861597538, "ewc_loss_diag": 2.9802322387695312e-05, "ewc_loss_parallel": 0.00021173630375415087, "grad_norm": 5.93135404586792, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.865182638168335, "num_tokens": 310662154.0, "step": 8145 }, { "epoch": 1.0362549293982954, "ewc_loss": 0.05103962868452072, "ewc_loss_diag": 2.9802322387695312e-05, "ewc_loss_parallel": 0.00021254474995657802, "grad_norm": 5.9888739585876465, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8725835084915161, "num_tokens": 310701924.0, "step": 8146 }, { "epoch": 1.036382139676886, "ewc_loss": 0.051091890782117844, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021184665092732757, "grad_norm": 5.929323673248291, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8689429759979248, "num_tokens": 310739688.0, "step": 8147 }, { "epoch": 1.0365093499554765, "ewc_loss": 0.05090147256851196, "ewc_loss_diag": 2.9802322387695312e-05, "ewc_loss_parallel": 0.00021116317657288164, "grad_norm": 5.961818218231201, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8568422794342041, "num_tokens": 310773317.0, "step": 8148 }, { "epoch": 1.0366365602340668, "ewc_loss": 0.05099859833717346, "ewc_loss_diag": 2.9802322387695312e-05, "ewc_loss_parallel": 0.0002121344004990533, "grad_norm": 5.998025417327881, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8600001931190491, "num_tokens": 310810854.0, "step": 8149 }, { "epoch": 1.0367637705126573, "ewc_loss": 0.0509747639298439, "ewc_loss_diag": 2.9802322387695312e-05, "ewc_loss_parallel": 0.00021189609833527356, "grad_norm": 6.013406276702881, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8558565378189087, "num_tokens": 310846080.0, "step": 8150 }, { "epoch": 1.0368909807912479, "ewc_loss": 0.050936851650476456, "ewc_loss_diag": 2.9802322387695312e-05, "ewc_loss_parallel": 0.0002115169627359137, "grad_norm": 5.933794975280762, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8397548198699951, "num_tokens": 310885032.0, "step": 8151 }, { "epoch": 1.0370181910698384, "ewc_loss": 0.051109958440065384, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021202731295488775, "grad_norm": 6.022695541381836, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8679077625274658, "num_tokens": 310924518.0, "step": 8152 }, { "epoch": 1.037145401348429, "ewc_loss": 0.05104155093431473, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021134324197191745, "grad_norm": 5.967113971710205, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8642297983169556, "num_tokens": 310961734.0, "step": 8153 }, { "epoch": 1.0372726116270194, "ewc_loss": 0.05107003450393677, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021162806660868227, "grad_norm": 5.90878438949585, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8653419017791748, "num_tokens": 311004541.0, "step": 8154 }, { "epoch": 1.03739982190561, "ewc_loss": 0.05113016068935394, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021222932264208794, "grad_norm": 5.977082252502441, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.87186598777771, "num_tokens": 311043947.0, "step": 8155 }, { "epoch": 1.0375270321842005, "ewc_loss": 0.05106320232152939, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002115597453666851, "grad_norm": 5.924106597900391, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8588775396347046, "num_tokens": 311083154.0, "step": 8156 }, { "epoch": 1.037654242462791, "ewc_loss": 0.05105527490377426, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021148046653252095, "grad_norm": 5.929001331329346, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.874722421169281, "num_tokens": 311121429.0, "step": 8157 }, { "epoch": 1.0377814527413816, "ewc_loss": 0.051054731011390686, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021147506777197123, "grad_norm": 5.894674301147461, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8469942808151245, "num_tokens": 311166578.0, "step": 8158 }, { "epoch": 1.037908663019972, "ewc_loss": 0.051109857857227325, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021202629432082176, "grad_norm": 5.945970058441162, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8652928471565247, "num_tokens": 311209645.0, "step": 8159 }, { "epoch": 1.0380358732985626, "ewc_loss": 0.05108284205198288, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021175613801460713, "grad_norm": 5.939358234405518, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8668626546859741, "num_tokens": 311248428.0, "step": 8160 }, { "epoch": 1.0381630835771531, "ewc_loss": 0.05110512673854828, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002119790151482448, "grad_norm": 6.014575958251953, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8634116053581238, "num_tokens": 311282040.0, "step": 8161 }, { "epoch": 1.0382902938557435, "ewc_loss": 0.051033154129981995, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021125926286913455, "grad_norm": 5.902926921844482, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8693691492080688, "num_tokens": 311320233.0, "step": 8162 }, { "epoch": 1.038417504134334, "ewc_loss": 0.05104856193065643, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002114133385475725, "grad_norm": 5.965762138366699, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8477966785430908, "num_tokens": 311357586.0, "step": 8163 }, { "epoch": 1.0385447144129245, "ewc_loss": 0.05111192911863327, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002120470453519374, "grad_norm": 5.9462480545043945, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.862301230430603, "num_tokens": 311395545.0, "step": 8164 }, { "epoch": 1.038671924691515, "ewc_loss": 0.05108156055212021, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002117433468811214, "grad_norm": 5.988049507141113, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.862495481967926, "num_tokens": 311428785.0, "step": 8165 }, { "epoch": 1.0387991349701056, "ewc_loss": 0.0511336587369442, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021226431999821216, "grad_norm": 5.985929012298584, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8507307767868042, "num_tokens": 311469546.0, "step": 8166 }, { "epoch": 1.038926345248696, "ewc_loss": 0.05106203258037567, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021154804562684149, "grad_norm": 6.043336868286133, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8551622033119202, "num_tokens": 311504064.0, "step": 8167 }, { "epoch": 1.0390535555272866, "ewc_loss": 0.05111575126647949, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021208527323324233, "grad_norm": 5.9727396965026855, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.844546377658844, "num_tokens": 311536600.0, "step": 8168 }, { "epoch": 1.0391807658058771, "ewc_loss": 0.05109979957342148, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021192572603467852, "grad_norm": 5.977133274078369, "learning_rate": 1e-06, "loss": 0.3909, "mean_token_accuracy": 0.8795431852340698, "num_tokens": 311573638.0, "step": 8169 }, { "epoch": 1.0393079760844677, "ewc_loss": 0.05105924606323242, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021152017870917916, "grad_norm": 5.947683334350586, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8548644185066223, "num_tokens": 311611085.0, "step": 8170 }, { "epoch": 1.0394351863630582, "ewc_loss": 0.05106649920344353, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021159272000659257, "grad_norm": 5.959596157073975, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8560953140258789, "num_tokens": 311655497.0, "step": 8171 }, { "epoch": 1.0395623966416487, "ewc_loss": 0.05115348845720291, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021246263349894434, "grad_norm": 6.0210089683532715, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.848016083240509, "num_tokens": 311688167.0, "step": 8172 }, { "epoch": 1.0396896069202393, "ewc_loss": 0.05108922719955444, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002118199918186292, "grad_norm": 5.972721576690674, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8616982102394104, "num_tokens": 311724926.0, "step": 8173 }, { "epoch": 1.0398168171988296, "ewc_loss": 0.05108802020549774, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021180791372898966, "grad_norm": 6.020125865936279, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8436112999916077, "num_tokens": 311760729.0, "step": 8174 }, { "epoch": 1.03994402747742, "ewc_loss": 0.05111108720302582, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021203859068918973, "grad_norm": 5.939023971557617, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.86310875415802, "num_tokens": 311802138.0, "step": 8175 }, { "epoch": 1.0400712377560106, "ewc_loss": 0.05115009471774101, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021242868388071656, "grad_norm": 5.986499309539795, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8696420192718506, "num_tokens": 311838510.0, "step": 8176 }, { "epoch": 1.0401984480346012, "ewc_loss": 0.05111929029226303, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002121206489391625, "grad_norm": 6.025432586669922, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8562372922897339, "num_tokens": 311870875.0, "step": 8177 }, { "epoch": 1.0403256583131917, "ewc_loss": 0.05111666023731232, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021209430997259915, "grad_norm": 5.966920852661133, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8758972883224487, "num_tokens": 311905854.0, "step": 8178 }, { "epoch": 1.0404528685917822, "ewc_loss": 0.05107295513153076, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021165730140637606, "grad_norm": 6.021214962005615, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.859247088432312, "num_tokens": 311936381.0, "step": 8179 }, { "epoch": 1.0405800788703727, "ewc_loss": 0.051091574132442474, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021184346405789256, "grad_norm": 6.006627082824707, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8565055131912231, "num_tokens": 311972529.0, "step": 8180 }, { "epoch": 1.0407072891489633, "ewc_loss": 0.05112922936677933, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.000212220023968257, "grad_norm": 6.011518955230713, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8809279203414917, "num_tokens": 312007883.0, "step": 8181 }, { "epoch": 1.0408344994275538, "ewc_loss": 0.05100446939468384, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021097240096423775, "grad_norm": 6.008668899536133, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.857402503490448, "num_tokens": 312041000.0, "step": 8182 }, { "epoch": 1.0409617097061443, "ewc_loss": 0.0510648638010025, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021157637820579112, "grad_norm": 5.976128578186035, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8647620677947998, "num_tokens": 312076011.0, "step": 8183 }, { "epoch": 1.0410889199847349, "ewc_loss": 0.0510684996843338, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021161272889003158, "grad_norm": 5.981237411499023, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8554064035415649, "num_tokens": 312115614.0, "step": 8184 }, { "epoch": 1.0412161302633254, "ewc_loss": 0.05101872235536575, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021111496607773006, "grad_norm": 6.068045139312744, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8564287424087524, "num_tokens": 312154111.0, "step": 8185 }, { "epoch": 1.0413433405419157, "ewc_loss": 0.05100202187895775, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002109479537466541, "grad_norm": 5.977722644805908, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8562113642692566, "num_tokens": 312191571.0, "step": 8186 }, { "epoch": 1.0414705508205062, "ewc_loss": 0.05099731683731079, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021090090740472078, "grad_norm": 5.927579402923584, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8554239273071289, "num_tokens": 312229205.0, "step": 8187 }, { "epoch": 1.0415977610990967, "ewc_loss": 0.050960980355739594, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021053754608146846, "grad_norm": 5.9423933029174805, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8777748346328735, "num_tokens": 312266604.0, "step": 8188 }, { "epoch": 1.0417249713776873, "ewc_loss": 0.051033273339271545, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021126045612618327, "grad_norm": 5.947689056396484, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8505817651748657, "num_tokens": 312305222.0, "step": 8189 }, { "epoch": 1.0418521816562778, "ewc_loss": 0.051020096987485886, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021112870308570564, "grad_norm": 5.925617218017578, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8401485681533813, "num_tokens": 312345823.0, "step": 8190 }, { "epoch": 1.0419793919348683, "ewc_loss": 0.05100429430603981, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021097066928632557, "grad_norm": 5.969625949859619, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8531118035316467, "num_tokens": 312383569.0, "step": 8191 }, { "epoch": 1.0421066022134589, "ewc_loss": 0.05107099935412407, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002116377290803939, "grad_norm": 5.940698146820068, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8650999069213867, "num_tokens": 312423068.0, "step": 8192 }, { "epoch": 1.0422338124920494, "ewc_loss": 0.051072053611278534, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021164827921893448, "grad_norm": 5.981690883636475, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8569443821907043, "num_tokens": 312460569.0, "step": 8193 }, { "epoch": 1.04236102277064, "ewc_loss": 0.0510236993432045, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021116471907589585, "grad_norm": 5.970516681671143, "learning_rate": 1e-06, "loss": 0.5281, "mean_token_accuracy": 0.8406916856765747, "num_tokens": 312501654.0, "step": 8194 }, { "epoch": 1.0424882330492304, "ewc_loss": 0.05109213665127754, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021184911020100117, "grad_norm": 5.973773956298828, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8539915084838867, "num_tokens": 312543630.0, "step": 8195 }, { "epoch": 1.042615443327821, "ewc_loss": 0.05104586482048035, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021138638840056956, "grad_norm": 5.98714542388916, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8593549728393555, "num_tokens": 312579428.0, "step": 8196 }, { "epoch": 1.0427426536064115, "ewc_loss": 0.05101215839385986, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002110493223881349, "grad_norm": 5.9156575202941895, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8758442997932434, "num_tokens": 312621388.0, "step": 8197 }, { "epoch": 1.0428698638850018, "ewc_loss": 0.050994183868169785, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002108695771312341, "grad_norm": 5.953124046325684, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8622201085090637, "num_tokens": 312665303.0, "step": 8198 }, { "epoch": 1.0429970741635923, "ewc_loss": 0.051086895167827606, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002117966942023486, "grad_norm": 6.011146068572998, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8514523506164551, "num_tokens": 312702354.0, "step": 8199 }, { "epoch": 1.0431242844421829, "ewc_loss": 0.051035165786743164, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021127937361598015, "grad_norm": 5.985190391540527, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8667992353439331, "num_tokens": 312737223.0, "step": 8200 }, { "epoch": 1.0432514947207734, "ewc_loss": 0.05109097808599472, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021183754142839462, "grad_norm": 5.991179466247559, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8632249236106873, "num_tokens": 312773353.0, "step": 8201 }, { "epoch": 1.043378704999364, "ewc_loss": 0.05102701485157013, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021119786833878607, "grad_norm": 5.979180335998535, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8442956805229187, "num_tokens": 312809230.0, "step": 8202 }, { "epoch": 1.0435059152779544, "ewc_loss": 0.05110083520412445, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021193610155023634, "grad_norm": 5.979655742645264, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8793045282363892, "num_tokens": 312841278.0, "step": 8203 }, { "epoch": 1.043633125556545, "ewc_loss": 0.051072798669338226, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021165570069570094, "grad_norm": 5.952246189117432, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8671316504478455, "num_tokens": 312888469.0, "step": 8204 }, { "epoch": 1.0437603358351355, "ewc_loss": 0.05117115378379822, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021263925009407103, "grad_norm": 6.006117343902588, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8696233630180359, "num_tokens": 312925446.0, "step": 8205 }, { "epoch": 1.043887546113726, "ewc_loss": 0.0511065348982811, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021199305774644017, "grad_norm": 5.934166431427002, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8602466583251953, "num_tokens": 312965425.0, "step": 8206 }, { "epoch": 1.0440147563923166, "ewc_loss": 0.05112104490399361, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021213818399701267, "grad_norm": 6.041479110717773, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8550329208374023, "num_tokens": 313000637.0, "step": 8207 }, { "epoch": 1.044141966670907, "ewc_loss": 0.05112651363015175, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021219287009444088, "grad_norm": 5.976901531219482, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8619680404663086, "num_tokens": 313035986.0, "step": 8208 }, { "epoch": 1.0442691769494976, "ewc_loss": 0.05111920088529587, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021211971761658788, "grad_norm": 6.012531757354736, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.849888026714325, "num_tokens": 313077027.0, "step": 8209 }, { "epoch": 1.0443963872280881, "ewc_loss": 0.05108731985092163, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002118009579135105, "grad_norm": 5.933530807495117, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8716747760772705, "num_tokens": 313121733.0, "step": 8210 }, { "epoch": 1.0445235975066784, "ewc_loss": 0.05112228915095329, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021215062588453293, "grad_norm": 5.9458909034729, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8560025095939636, "num_tokens": 313165542.0, "step": 8211 }, { "epoch": 1.044650807785269, "ewc_loss": 0.051132798194885254, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021225569071248174, "grad_norm": 6.00392484664917, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8502130508422852, "num_tokens": 313201082.0, "step": 8212 }, { "epoch": 1.0447780180638595, "ewc_loss": 0.05113670974969864, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002122948062606156, "grad_norm": 5.942529201507568, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.872664749622345, "num_tokens": 313241765.0, "step": 8213 }, { "epoch": 1.04490522834245, "ewc_loss": 0.05115489661693573, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002124767197528854, "grad_norm": 6.019337177276611, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8597517609596252, "num_tokens": 313282471.0, "step": 8214 }, { "epoch": 1.0450324386210406, "ewc_loss": 0.05116385594010353, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.000212566286791116, "grad_norm": 5.927909851074219, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8574889898300171, "num_tokens": 313331505.0, "step": 8215 }, { "epoch": 1.045159648899631, "ewc_loss": 0.05119410157203674, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021286877745296806, "grad_norm": 6.041505336761475, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8618797063827515, "num_tokens": 313369823.0, "step": 8216 }, { "epoch": 1.0452868591782216, "ewc_loss": 0.05116376280784607, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002125653700204566, "grad_norm": 5.967188358306885, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8618901968002319, "num_tokens": 313404255.0, "step": 8217 }, { "epoch": 1.0454140694568121, "ewc_loss": 0.05097341910004616, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.00021310332522261888, "grad_norm": 6.004731178283691, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8734213709831238, "num_tokens": 313439368.0, "step": 8218 }, { "epoch": 1.0455412797354027, "ewc_loss": 0.051140133291482925, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021232906146906316, "grad_norm": 5.945241928100586, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.853850245475769, "num_tokens": 313479475.0, "step": 8219 }, { "epoch": 1.0456684900139932, "ewc_loss": 0.05123496800661087, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002132774388883263, "grad_norm": 6.099437713623047, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8522761464118958, "num_tokens": 313514604.0, "step": 8220 }, { "epoch": 1.0457957002925837, "ewc_loss": 0.05111955851316452, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021212331193964928, "grad_norm": 5.953538417816162, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.857752799987793, "num_tokens": 313548907.0, "step": 8221 }, { "epoch": 1.0459229105711743, "ewc_loss": 0.051144734025001526, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021237507462501526, "grad_norm": 5.992334365844727, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8602022528648376, "num_tokens": 313586615.0, "step": 8222 }, { "epoch": 1.0460501208497646, "ewc_loss": 0.05115576833486557, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002124853926943615, "grad_norm": 5.970875263214111, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8513561487197876, "num_tokens": 313625202.0, "step": 8223 }, { "epoch": 1.046177331128355, "ewc_loss": 0.05114245414733887, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021235228632576764, "grad_norm": 5.965081691741943, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8555150032043457, "num_tokens": 313663257.0, "step": 8224 }, { "epoch": 1.0463045414069456, "ewc_loss": 0.050942663103342056, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.00021279577049426734, "grad_norm": 5.939329624176025, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8663553595542908, "num_tokens": 313701430.0, "step": 8225 }, { "epoch": 1.0464317516855361, "ewc_loss": 0.05118555575609207, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021278327039908618, "grad_norm": 5.968777656555176, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8734216094017029, "num_tokens": 313741286.0, "step": 8226 }, { "epoch": 1.0465589619641267, "ewc_loss": 0.0509805828332901, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.00021317496430128813, "grad_norm": 6.019873142242432, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8462350964546204, "num_tokens": 313779115.0, "step": 8227 }, { "epoch": 1.0466861722427172, "ewc_loss": 0.050920404493808746, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.00021257318439893425, "grad_norm": 5.907535552978516, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8663077354431152, "num_tokens": 313822331.0, "step": 8228 }, { "epoch": 1.0468133825213077, "ewc_loss": 0.050941966474056244, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.00021278878557495773, "grad_norm": 5.937594890594482, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8557885885238647, "num_tokens": 313863545.0, "step": 8229 }, { "epoch": 1.0469405927998983, "ewc_loss": 0.05096691846847534, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.000213038336369209, "grad_norm": 5.990762710571289, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8814443349838257, "num_tokens": 313894956.0, "step": 8230 }, { "epoch": 1.0470678030784888, "ewc_loss": 0.05099732056260109, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.0002133423404302448, "grad_norm": 6.03081750869751, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.870400071144104, "num_tokens": 313927920.0, "step": 8231 }, { "epoch": 1.0471950133570793, "ewc_loss": 0.050968237221241, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.00021305149130057544, "grad_norm": 5.949066162109375, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.870246410369873, "num_tokens": 313963230.0, "step": 8232 }, { "epoch": 1.0473222236356698, "ewc_loss": 0.050993822515010834, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.00021330734307412058, "grad_norm": 6.075741291046143, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8532570004463196, "num_tokens": 313995951.0, "step": 8233 }, { "epoch": 1.0474494339142604, "ewc_loss": 0.05120570957660675, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021298481442499906, "grad_norm": 5.934829235076904, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8626900911331177, "num_tokens": 314040928.0, "step": 8234 }, { "epoch": 1.0475766441928507, "ewc_loss": 0.05115871876478195, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002125149330822751, "grad_norm": 5.993408203125, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8531280159950256, "num_tokens": 314080329.0, "step": 8235 }, { "epoch": 1.0477038544714412, "ewc_loss": 0.0509214773774147, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.00021258390916045755, "grad_norm": 5.96189022064209, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8626466989517212, "num_tokens": 314117107.0, "step": 8236 }, { "epoch": 1.0478310647500317, "ewc_loss": 0.05120377987623215, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002129655476892367, "grad_norm": 6.0541486740112305, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8589376211166382, "num_tokens": 314148077.0, "step": 8237 }, { "epoch": 1.0479582750286223, "ewc_loss": 0.05115678161382675, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002124955499311909, "grad_norm": 6.000547885894775, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8542189002037048, "num_tokens": 314182064.0, "step": 8238 }, { "epoch": 1.0480854853072128, "ewc_loss": 0.05111783742904663, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021210609702393413, "grad_norm": 5.98699951171875, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8529269695281982, "num_tokens": 314221667.0, "step": 8239 }, { "epoch": 1.0482126955858033, "ewc_loss": 0.05112519860267639, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002121797006111592, "grad_norm": 6.012693881988525, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8591513633728027, "num_tokens": 314255225.0, "step": 8240 }, { "epoch": 1.0483399058643939, "ewc_loss": 0.051026228815317154, "ewc_loss_diag": 2.9802322387695312e-05, "ewc_loss_parallel": 0.00021241072681732476, "grad_norm": 6.030838966369629, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8615907430648804, "num_tokens": 314288011.0, "step": 8241 }, { "epoch": 1.0484671161429844, "ewc_loss": 0.051148176193237305, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002124095190083608, "grad_norm": 5.9161057472229, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8575351238250732, "num_tokens": 314332461.0, "step": 8242 }, { "epoch": 1.048594326421575, "ewc_loss": 0.05121183395385742, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021304606343619525, "grad_norm": 5.987581253051758, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8564587831497192, "num_tokens": 314367967.0, "step": 8243 }, { "epoch": 1.0487215367001654, "ewc_loss": 0.051457345485687256, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021305977134034038, "grad_norm": 6.031407356262207, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8701959848403931, "num_tokens": 314404487.0, "step": 8244 }, { "epoch": 1.048848746978756, "ewc_loss": 0.051468633115291595, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021317267965059727, "grad_norm": 5.954111576080322, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8603873252868652, "num_tokens": 314444731.0, "step": 8245 }, { "epoch": 1.0489759572573465, "ewc_loss": 0.05146891251206398, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002131754590664059, "grad_norm": 6.048933982849121, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8632238507270813, "num_tokens": 314479720.0, "step": 8246 }, { "epoch": 1.0491031675359368, "ewc_loss": 0.0512186661362648, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021311439923010767, "grad_norm": 5.957546234130859, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8792990446090698, "num_tokens": 314515613.0, "step": 8247 }, { "epoch": 1.0492303778145273, "ewc_loss": 0.05119858682155609, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021291359735187143, "grad_norm": 5.990571975708008, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8597838878631592, "num_tokens": 314556519.0, "step": 8248 }, { "epoch": 1.0493575880931179, "ewc_loss": 0.051260173320770264, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021230879065115005, "grad_norm": 5.98613166809082, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.871684193611145, "num_tokens": 314590803.0, "step": 8249 }, { "epoch": 1.0494847983717084, "ewc_loss": 0.051242388784885406, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021335161000024527, "grad_norm": 5.995727062225342, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8625115156173706, "num_tokens": 314631347.0, "step": 8250 }, { "epoch": 1.049612008650299, "ewc_loss": 0.051275622099637985, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021246325923129916, "grad_norm": 5.968262195587158, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8598470687866211, "num_tokens": 314673739.0, "step": 8251 }, { "epoch": 1.0497392189288894, "ewc_loss": 0.05122082680463791, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021313599427230656, "grad_norm": 5.981121063232422, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8620696067810059, "num_tokens": 314718498.0, "step": 8252 }, { "epoch": 1.04986642920748, "ewc_loss": 0.051181480288505554, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002127425541402772, "grad_norm": 5.987866401672363, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8674218058586121, "num_tokens": 314758755.0, "step": 8253 }, { "epoch": 1.0499936394860705, "ewc_loss": 0.05117570981383324, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002126848412444815, "grad_norm": 6.0077595710754395, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8654797077178955, "num_tokens": 314793644.0, "step": 8254 }, { "epoch": 1.050120849764661, "ewc_loss": 0.051234371960163116, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021327142894733697, "grad_norm": 6.020416259765625, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.858052134513855, "num_tokens": 314828045.0, "step": 8255 }, { "epoch": 1.0502480600432516, "ewc_loss": 0.0511188730597496, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021211647253949195, "grad_norm": 5.983099460601807, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8640390634536743, "num_tokens": 314866088.0, "step": 8256 }, { "epoch": 1.050375270321842, "ewc_loss": 0.051198624074459076, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021291396114975214, "grad_norm": 6.011192798614502, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8529919385910034, "num_tokens": 314904757.0, "step": 8257 }, { "epoch": 1.0505024806004326, "ewc_loss": 0.05111066251993179, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021203434152994305, "grad_norm": 5.98423433303833, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8535511493682861, "num_tokens": 314941410.0, "step": 8258 }, { "epoch": 1.0506296908790231, "ewc_loss": 0.05132142826914787, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021292130986694247, "grad_norm": 5.990329742431641, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8694566488265991, "num_tokens": 314982060.0, "step": 8259 }, { "epoch": 1.0507569011576134, "ewc_loss": 0.05125705897808075, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021227763500064611, "grad_norm": 5.980278968811035, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8533165454864502, "num_tokens": 315021812.0, "step": 8260 }, { "epoch": 1.050884111436204, "ewc_loss": 0.05119626596570015, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021289040159899741, "grad_norm": 5.965091228485107, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.85810387134552, "num_tokens": 315059520.0, "step": 8261 }, { "epoch": 1.0510113217147945, "ewc_loss": 0.0512482151389122, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002134099049726501, "grad_norm": 6.012448310852051, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8561673760414124, "num_tokens": 315098392.0, "step": 8262 }, { "epoch": 1.051138531993385, "ewc_loss": 0.051223643124103546, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021316415222827345, "grad_norm": 6.043956756591797, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.85094153881073, "num_tokens": 315137375.0, "step": 8263 }, { "epoch": 1.0512657422719756, "ewc_loss": 0.05122463405132294, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021317406208254397, "grad_norm": 6.015517234802246, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8648827075958252, "num_tokens": 315171308.0, "step": 8264 }, { "epoch": 1.051392952550566, "ewc_loss": 0.05119325593113899, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021286029368638992, "grad_norm": 5.984719753265381, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8628774285316467, "num_tokens": 315212176.0, "step": 8265 }, { "epoch": 1.0515201628291566, "ewc_loss": 0.05122586339712143, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002131863875547424, "grad_norm": 6.010015487670898, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8491230010986328, "num_tokens": 315251321.0, "step": 8266 }, { "epoch": 1.0516473731077471, "ewc_loss": 0.05120566487312317, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021298436331562698, "grad_norm": 6.021700859069824, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.845935046672821, "num_tokens": 315288780.0, "step": 8267 }, { "epoch": 1.0517745833863377, "ewc_loss": 0.051189035177230835, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002128181076841429, "grad_norm": 5.9771904945373535, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8841685652732849, "num_tokens": 315325630.0, "step": 8268 }, { "epoch": 1.0519017936649282, "ewc_loss": 0.05117672309279442, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021269496937748045, "grad_norm": 5.948618412017822, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8558228611946106, "num_tokens": 315362887.0, "step": 8269 }, { "epoch": 1.0520290039435187, "ewc_loss": 0.051435112953186035, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002128374471794814, "grad_norm": 5.992539405822754, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.874333381652832, "num_tokens": 315397552.0, "step": 8270 }, { "epoch": 1.0521562142221093, "ewc_loss": 0.05147796869277954, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021326603018678725, "grad_norm": 5.981205940246582, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8712036609649658, "num_tokens": 315436785.0, "step": 8271 }, { "epoch": 1.0522834245006996, "ewc_loss": 0.051457688212394714, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002130632201442495, "grad_norm": 6.045844554901123, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8714072704315186, "num_tokens": 315467849.0, "step": 8272 }, { "epoch": 1.05241063477929, "ewc_loss": 0.05116572603583336, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021258500055409968, "grad_norm": 5.994576930999756, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8702872395515442, "num_tokens": 315501496.0, "step": 8273 }, { "epoch": 1.0525378450578806, "ewc_loss": 0.051442719995975494, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002129135245922953, "grad_norm": 5.952780723571777, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8708862066268921, "num_tokens": 315540878.0, "step": 8274 }, { "epoch": 1.0526650553364711, "ewc_loss": 0.05141746252775192, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002126609324477613, "grad_norm": 6.019349575042725, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8754178285598755, "num_tokens": 315578139.0, "step": 8275 }, { "epoch": 1.0527922656150617, "ewc_loss": 0.051153432577848434, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002124620514223352, "grad_norm": 5.93427848815918, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8630943298339844, "num_tokens": 315615845.0, "step": 8276 }, { "epoch": 1.0529194758936522, "ewc_loss": 0.051197148859500885, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021289923461154103, "grad_norm": 6.071868896484375, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8530402183532715, "num_tokens": 315645407.0, "step": 8277 }, { "epoch": 1.0530466861722427, "ewc_loss": 0.05123282968997955, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021325601846911013, "grad_norm": 5.955655097961426, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8686169981956482, "num_tokens": 315687922.0, "step": 8278 }, { "epoch": 1.0531738964508333, "ewc_loss": 0.051186129450798035, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021278901840560138, "grad_norm": 6.025665760040283, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8641228675842285, "num_tokens": 315723730.0, "step": 8279 }, { "epoch": 1.0533011067294238, "ewc_loss": 0.05141879618167877, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021267429110594094, "grad_norm": 5.896640300750732, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8736661076545715, "num_tokens": 315765662.0, "step": 8280 }, { "epoch": 1.0534283170080143, "ewc_loss": 0.051469553261995316, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021318186190910637, "grad_norm": 5.987306594848633, "learning_rate": 1e-06, "loss": 0.5303, "mean_token_accuracy": 0.8376774787902832, "num_tokens": 315808668.0, "step": 8281 }, { "epoch": 1.0535555272866048, "ewc_loss": 0.05127765238285065, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021248353004921228, "grad_norm": 5.9768147468566895, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8564989566802979, "num_tokens": 315844878.0, "step": 8282 }, { "epoch": 1.0536827375651954, "ewc_loss": 0.05119452625513077, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002128729975083843, "grad_norm": 5.951938152313232, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8757146596908569, "num_tokens": 315879472.0, "step": 8283 }, { "epoch": 1.0538099478437857, "ewc_loss": 0.05132880061864853, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021299505897331983, "grad_norm": 6.00662088394165, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8581613302230835, "num_tokens": 315915496.0, "step": 8284 }, { "epoch": 1.0539371581223762, "ewc_loss": 0.0514509379863739, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021299569925758988, "grad_norm": 5.964752197265625, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8640857934951782, "num_tokens": 315954317.0, "step": 8285 }, { "epoch": 1.0540643684009667, "ewc_loss": 0.05119816213846207, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021290936274453998, "grad_norm": 5.967841625213623, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8573470115661621, "num_tokens": 315995518.0, "step": 8286 }, { "epoch": 1.0541915786795573, "ewc_loss": 0.051208481192588806, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021301255037542433, "grad_norm": 5.921227931976318, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.87493896484375, "num_tokens": 316040475.0, "step": 8287 }, { "epoch": 1.0543187889581478, "ewc_loss": 0.051369559019804, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.0002134026144631207, "grad_norm": 5.998263359069824, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8605084419250488, "num_tokens": 316083003.0, "step": 8288 }, { "epoch": 1.0544459992367383, "ewc_loss": 0.05132710188627243, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.0002129780623363331, "grad_norm": 5.94204568862915, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.863379955291748, "num_tokens": 316124687.0, "step": 8289 }, { "epoch": 1.0545732095153288, "ewc_loss": 0.051281243562698364, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002137401606887579, "grad_norm": 6.005422592163086, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8496209383010864, "num_tokens": 316170052.0, "step": 8290 }, { "epoch": 1.0547004197939194, "ewc_loss": 0.051534079015254974, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002138271083822474, "grad_norm": 5.973721504211426, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8487082123756409, "num_tokens": 316211941.0, "step": 8291 }, { "epoch": 1.05482763007251, "ewc_loss": 0.05150541663169861, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002135405084118247, "grad_norm": 6.021139621734619, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8660026788711548, "num_tokens": 316252592.0, "step": 8292 }, { "epoch": 1.0549548403511004, "ewc_loss": 0.051362693309783936, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021333398763090372, "grad_norm": 5.970166206359863, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8700467348098755, "num_tokens": 316287250.0, "step": 8293 }, { "epoch": 1.055082050629691, "ewc_loss": 0.05139724165201187, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021367945009842515, "grad_norm": 5.986971855163574, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8659546375274658, "num_tokens": 316326308.0, "step": 8294 }, { "epoch": 1.0552092609082815, "ewc_loss": 0.05139676108956337, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021367463341448456, "grad_norm": 5.954421520233154, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8654053807258606, "num_tokens": 316364519.0, "step": 8295 }, { "epoch": 1.0553364711868718, "ewc_loss": 0.051399677991867065, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.0002137037954526022, "grad_norm": 5.976786136627197, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8676440715789795, "num_tokens": 316401165.0, "step": 8296 }, { "epoch": 1.0554636814654623, "ewc_loss": 0.05140436440706253, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021375066717155278, "grad_norm": 5.969709396362305, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8567119836807251, "num_tokens": 316440245.0, "step": 8297 }, { "epoch": 1.0555908917440529, "ewc_loss": 0.05146586894989014, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.0002143657475244254, "grad_norm": 5.9801530838012695, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8714156150817871, "num_tokens": 316484969.0, "step": 8298 }, { "epoch": 1.0557181020226434, "ewc_loss": 0.05140992999076843, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021380631369538605, "grad_norm": 5.984907150268555, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8798790574073792, "num_tokens": 316520510.0, "step": 8299 }, { "epoch": 1.055845312301234, "ewc_loss": 0.051423847675323486, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021394550276454538, "grad_norm": 5.971446514129639, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.846135139465332, "num_tokens": 316561946.0, "step": 8300 }, { "epoch": 1.0559725225798244, "ewc_loss": 0.051592953503131866, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021441584976855665, "grad_norm": 5.980325698852539, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8798571825027466, "num_tokens": 316605489.0, "step": 8301 }, { "epoch": 1.056099732858415, "ewc_loss": 0.05146825686097145, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021438959811348468, "grad_norm": 5.966752529144287, "learning_rate": 1e-06, "loss": 0.5109, "mean_token_accuracy": 0.8461619019508362, "num_tokens": 316648590.0, "step": 8302 }, { "epoch": 1.0562269431370055, "ewc_loss": 0.05136658251285553, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021459355775732547, "grad_norm": 5.983586311340332, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8749420046806335, "num_tokens": 316688169.0, "step": 8303 }, { "epoch": 1.056354153415596, "ewc_loss": 0.051467522978782654, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021438226394820958, "grad_norm": 6.041961193084717, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.8408191204071045, "num_tokens": 316724031.0, "step": 8304 }, { "epoch": 1.0564813636941865, "ewc_loss": 0.05136127024888992, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.0002145404287148267, "grad_norm": 5.930323600769043, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.861098051071167, "num_tokens": 316769762.0, "step": 8305 }, { "epoch": 1.056608573972777, "ewc_loss": 0.05136309191584587, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021455864771269262, "grad_norm": 5.952438831329346, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8696235418319702, "num_tokens": 316814156.0, "step": 8306 }, { "epoch": 1.0567357842513676, "ewc_loss": 0.051396191120147705, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021488964557647705, "grad_norm": 6.0017900466918945, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.871246874332428, "num_tokens": 316852784.0, "step": 8307 }, { "epoch": 1.0568629945299581, "ewc_loss": 0.05141296237707138, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021505738550331444, "grad_norm": 6.004806995391846, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8565870523452759, "num_tokens": 316891992.0, "step": 8308 }, { "epoch": 1.0569902048085484, "ewc_loss": 0.05147060751914978, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021441312856040895, "grad_norm": 5.99019193649292, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8765383958816528, "num_tokens": 316932246.0, "step": 8309 }, { "epoch": 1.057117415087139, "ewc_loss": 0.05133232846856117, "ewc_loss_diag": 2.9802322387695312e-05, "ewc_loss_parallel": 0.00021547172218561172, "grad_norm": 6.041260719299316, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8547936677932739, "num_tokens": 316972017.0, "step": 8310 }, { "epoch": 1.0572446253657295, "ewc_loss": 0.05128707364201546, "ewc_loss_diag": 2.9802322387695312e-05, "ewc_loss_parallel": 0.00021501917217392474, "grad_norm": 6.010943412780762, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8580135703086853, "num_tokens": 317008631.0, "step": 8311 }, { "epoch": 1.05737183564432, "ewc_loss": 0.05145137012004852, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021544141054619104, "grad_norm": 6.04107666015625, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8674446940422058, "num_tokens": 317048616.0, "step": 8312 }, { "epoch": 1.0574990459229106, "ewc_loss": 0.051166750490665436, "ewc_loss_diag": 2.968311309814453e-05, "ewc_loss_parallel": 0.00021503666357602924, "grad_norm": 6.039156436920166, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8543844223022461, "num_tokens": 317088675.0, "step": 8313 }, { "epoch": 1.057626256201501, "ewc_loss": 0.05143919587135315, "ewc_loss_diag": 2.9921531677246094e-05, "ewc_loss_parallel": 0.00021531968377530575, "grad_norm": 6.0513458251953125, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.882900059223175, "num_tokens": 317120351.0, "step": 8314 }, { "epoch": 1.0577534664800916, "ewc_loss": 0.05126519873738289, "ewc_loss_diag": 2.9802322387695312e-05, "ewc_loss_parallel": 0.00021480042778421193, "grad_norm": 6.02453088760376, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8563297986984253, "num_tokens": 317159027.0, "step": 8315 }, { "epoch": 1.0578806767586821, "ewc_loss": 0.05130162090063095, "ewc_loss_diag": 2.9802322387695312e-05, "ewc_loss_parallel": 0.0002151646331185475, "grad_norm": 6.113399505615234, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.861451268196106, "num_tokens": 317188892.0, "step": 8316 }, { "epoch": 1.0580078870372727, "ewc_loss": 0.05154942348599434, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021398055832833052, "grad_norm": 6.006213665008545, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8672940135002136, "num_tokens": 317229026.0, "step": 8317 }, { "epoch": 1.0581350973158632, "ewc_loss": 0.051520124077796936, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.0002149082865798846, "grad_norm": 5.981664180755615, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8616182804107666, "num_tokens": 317272946.0, "step": 8318 }, { "epoch": 1.0582623075944537, "ewc_loss": 0.0515313558280468, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.0002150205837097019, "grad_norm": 6.000188827514648, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8620922565460205, "num_tokens": 317314625.0, "step": 8319 }, { "epoch": 1.058389517873044, "ewc_loss": 0.05159584432840347, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021444479352794588, "grad_norm": 6.005001068115234, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8716566562652588, "num_tokens": 317351450.0, "step": 8320 }, { "epoch": 1.0585167281516346, "ewc_loss": 0.051706455647945404, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021555088460445404, "grad_norm": 6.040759563446045, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8453474044799805, "num_tokens": 317391519.0, "step": 8321 }, { "epoch": 1.058643938430225, "ewc_loss": 0.051480501890182495, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021451202337630093, "grad_norm": 5.981409549713135, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8564381003379822, "num_tokens": 317430672.0, "step": 8322 }, { "epoch": 1.0587711487088156, "ewc_loss": 0.051642850041389465, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002149148058379069, "grad_norm": 6.001426696777344, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8636052012443542, "num_tokens": 317470203.0, "step": 8323 }, { "epoch": 1.0588983589874061, "ewc_loss": 0.05153515934944153, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021505860786419362, "grad_norm": 6.011560440063477, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8651130199432373, "num_tokens": 317509594.0, "step": 8324 }, { "epoch": 1.0590255692659967, "ewc_loss": 0.051544301211833954, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.0002151500666514039, "grad_norm": 5.999063968658447, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8597503900527954, "num_tokens": 317549735.0, "step": 8325 }, { "epoch": 1.0591527795445872, "ewc_loss": 0.05165230110287666, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021500933507923037, "grad_norm": 6.033574104309082, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8595262169837952, "num_tokens": 317585064.0, "step": 8326 }, { "epoch": 1.0592799898231777, "ewc_loss": 0.05166047811508179, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002150911168428138, "grad_norm": 5.972474575042725, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8777093887329102, "num_tokens": 317622289.0, "step": 8327 }, { "epoch": 1.0594072001017683, "ewc_loss": 0.05153327062726021, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021503973403014243, "grad_norm": 6.03865385055542, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8517721891403198, "num_tokens": 317662872.0, "step": 8328 }, { "epoch": 1.0595344103803588, "ewc_loss": 0.05155645310878754, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021527156059164554, "grad_norm": 5.959863662719727, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.863079845905304, "num_tokens": 317701418.0, "step": 8329 }, { "epoch": 1.0596616206589493, "ewc_loss": 0.05161113291978836, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021581837791018188, "grad_norm": 6.00370979309082, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8760858178138733, "num_tokens": 317741416.0, "step": 8330 }, { "epoch": 1.0597888309375398, "ewc_loss": 0.05173564329743385, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002158427523681894, "grad_norm": 6.036309242248535, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8668813705444336, "num_tokens": 317778623.0, "step": 8331 }, { "epoch": 1.0599160412161304, "ewc_loss": 0.0517457015812397, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021594333520624787, "grad_norm": 6.052577018737793, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8647369146347046, "num_tokens": 317816577.0, "step": 8332 }, { "epoch": 1.0600432514947207, "ewc_loss": 0.05173342674970627, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021582060435321182, "grad_norm": 6.075353622436523, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8616088628768921, "num_tokens": 317851532.0, "step": 8333 }, { "epoch": 1.0601704617733112, "ewc_loss": 0.05167565867304802, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002152429224224761, "grad_norm": 5.967961311340332, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8628894090652466, "num_tokens": 317886789.0, "step": 8334 }, { "epoch": 1.0602976720519017, "ewc_loss": 0.05163155496120453, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021602259948849678, "grad_norm": 6.070540904998779, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8641401529312134, "num_tokens": 317929511.0, "step": 8335 }, { "epoch": 1.0604248823304923, "ewc_loss": 0.051669903099536896, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021518536959774792, "grad_norm": 6.02964973449707, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8608093857765198, "num_tokens": 317963767.0, "step": 8336 }, { "epoch": 1.0605520926090828, "ewc_loss": 0.051692895591259, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021541528985835612, "grad_norm": 6.028223514556885, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.878840446472168, "num_tokens": 318000458.0, "step": 8337 }, { "epoch": 1.0606793028876733, "ewc_loss": 0.05169697850942612, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021545609342865646, "grad_norm": 6.017502784729004, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8793200254440308, "num_tokens": 318040394.0, "step": 8338 }, { "epoch": 1.0608065131662638, "ewc_loss": 0.05166717618703842, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021515809930860996, "grad_norm": 6.04378080368042, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8591684103012085, "num_tokens": 318080859.0, "step": 8339 }, { "epoch": 1.0609337234448544, "ewc_loss": 0.05167923867702484, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021527870558202267, "grad_norm": 5.99699592590332, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8783655166625977, "num_tokens": 318115378.0, "step": 8340 }, { "epoch": 1.061060933723445, "ewc_loss": 0.05171078443527222, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021559414744842798, "grad_norm": 6.085742950439453, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8551487922668457, "num_tokens": 318150351.0, "step": 8341 }, { "epoch": 1.0611881440020354, "ewc_loss": 0.051691923290491104, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021540556917898357, "grad_norm": 5.97465181350708, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8622650504112244, "num_tokens": 318192427.0, "step": 8342 }, { "epoch": 1.061315354280626, "ewc_loss": 0.0516369491815567, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021485579782165587, "grad_norm": 6.036064624786377, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8501603007316589, "num_tokens": 318232150.0, "step": 8343 }, { "epoch": 1.0614425645592165, "ewc_loss": 0.05168090760707855, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021529542573262006, "grad_norm": 6.081582069396973, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8578953742980957, "num_tokens": 318267742.0, "step": 8344 }, { "epoch": 1.0615697748378068, "ewc_loss": 0.051595598459243774, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002144422905985266, "grad_norm": 5.960768222808838, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8666199445724487, "num_tokens": 318307974.0, "step": 8345 }, { "epoch": 1.0616969851163973, "ewc_loss": 0.05164425075054169, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021492881933227181, "grad_norm": 6.046682834625244, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.845284640789032, "num_tokens": 318349880.0, "step": 8346 }, { "epoch": 1.0618241953949878, "ewc_loss": 0.05162348225712776, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002147211489500478, "grad_norm": 6.059815883636475, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8601361513137817, "num_tokens": 318383869.0, "step": 8347 }, { "epoch": 1.0619514056735784, "ewc_loss": 0.05152076482772827, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.0002149146603187546, "grad_norm": 5.987186908721924, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8784843683242798, "num_tokens": 318419659.0, "step": 8348 }, { "epoch": 1.062078615952169, "ewc_loss": 0.05168956518173218, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021538199507631361, "grad_norm": 6.019438743591309, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8708868026733398, "num_tokens": 318460795.0, "step": 8349 }, { "epoch": 1.0622058262307594, "ewc_loss": 0.05154040828347206, "ewc_loss_diag": 3.0040740966796875e-05, "ewc_loss_parallel": 0.00021511111117433757, "grad_norm": 6.072995185852051, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8532478213310242, "num_tokens": 318499451.0, "step": 8350 }, { "epoch": 1.06233303650935, "ewc_loss": 0.051639363169670105, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002148799685528502, "grad_norm": 6.035902500152588, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8670602440834045, "num_tokens": 318533413.0, "step": 8351 }, { "epoch": 1.0624602467879405, "ewc_loss": 0.05168573558330536, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021534369443543255, "grad_norm": 6.094878673553467, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.85224449634552, "num_tokens": 318572533.0, "step": 8352 }, { "epoch": 1.062587457066531, "ewc_loss": 0.05162782222032547, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021476452820934355, "grad_norm": 5.996631145477295, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8650918006896973, "num_tokens": 318609442.0, "step": 8353 }, { "epoch": 1.0627146673451215, "ewc_loss": 0.05163412168622017, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021482753800228238, "grad_norm": 6.095646381378174, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8776456117630005, "num_tokens": 318643475.0, "step": 8354 }, { "epoch": 1.062841877623712, "ewc_loss": 0.051670365035533905, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021518996800296009, "grad_norm": 5.99667501449585, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8651885986328125, "num_tokens": 318684223.0, "step": 8355 }, { "epoch": 1.0629690879023026, "ewc_loss": 0.051674872636795044, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021523507894016802, "grad_norm": 6.102762699127197, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8633801937103271, "num_tokens": 318717148.0, "step": 8356 }, { "epoch": 1.0630962981808931, "ewc_loss": 0.05171136185526848, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002155999536626041, "grad_norm": 5.976975440979004, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8742635250091553, "num_tokens": 318758929.0, "step": 8357 }, { "epoch": 1.0632235084594834, "ewc_loss": 0.05169897526502609, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.000215476073208265, "grad_norm": 6.078089714050293, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8606791496276855, "num_tokens": 318796438.0, "step": 8358 }, { "epoch": 1.063350718738074, "ewc_loss": 0.05171991512179375, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002156854752684012, "grad_norm": 6.041535377502441, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8496785163879395, "num_tokens": 318835732.0, "step": 8359 }, { "epoch": 1.0634779290166645, "ewc_loss": 0.051727019250392914, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021575653227046132, "grad_norm": 6.040212631225586, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8617913722991943, "num_tokens": 318872151.0, "step": 8360 }, { "epoch": 1.063605139295255, "ewc_loss": 0.05167606845498085, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021524701151065528, "grad_norm": 6.053714752197266, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8518901467323303, "num_tokens": 318911956.0, "step": 8361 }, { "epoch": 1.0637323495738455, "ewc_loss": 0.05166977643966675, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002151840744772926, "grad_norm": 6.036130428314209, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.864392876625061, "num_tokens": 318945593.0, "step": 8362 }, { "epoch": 1.063859559852436, "ewc_loss": 0.05167008936405182, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021518723224289715, "grad_norm": 6.111865997314453, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8719881176948547, "num_tokens": 318976670.0, "step": 8363 }, { "epoch": 1.0639867701310266, "ewc_loss": 0.051676519215106964, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021525149350054562, "grad_norm": 6.088186740875244, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8652728796005249, "num_tokens": 319007798.0, "step": 8364 }, { "epoch": 1.0641139804096171, "ewc_loss": 0.05160701274871826, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002145564358215779, "grad_norm": 6.009897232055664, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8524512052536011, "num_tokens": 319044808.0, "step": 8365 }, { "epoch": 1.0642411906882077, "ewc_loss": 0.05162250995635986, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021471141371876001, "grad_norm": 6.015013694763184, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8823926448822021, "num_tokens": 319083387.0, "step": 8366 }, { "epoch": 1.0643684009667982, "ewc_loss": 0.05163875222206116, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021487382764462382, "grad_norm": 6.053582191467285, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8780767917633057, "num_tokens": 319121897.0, "step": 8367 }, { "epoch": 1.0644956112453887, "ewc_loss": 0.05167115479707718, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021519785514101386, "grad_norm": 6.0988240242004395, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8509920239448547, "num_tokens": 319157190.0, "step": 8368 }, { "epoch": 1.064622821523979, "ewc_loss": 0.051606275141239166, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002145491016563028, "grad_norm": 6.04984188079834, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8730325102806091, "num_tokens": 319188353.0, "step": 8369 }, { "epoch": 1.0647500318025696, "ewc_loss": 0.05168378725647926, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021532419486902654, "grad_norm": 6.046687126159668, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8653616309165955, "num_tokens": 319225536.0, "step": 8370 }, { "epoch": 1.06487724208116, "ewc_loss": 0.05168714374303818, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021535775158554316, "grad_norm": 6.051205635070801, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8687623739242554, "num_tokens": 319265114.0, "step": 8371 }, { "epoch": 1.0650044523597506, "ewc_loss": 0.05164484679698944, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002149348147213459, "grad_norm": 6.0080108642578125, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8737900257110596, "num_tokens": 319309280.0, "step": 8372 }, { "epoch": 1.0651316626383411, "ewc_loss": 0.05163519084453583, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002148382627638057, "grad_norm": 6.005021572113037, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8742215633392334, "num_tokens": 319346114.0, "step": 8373 }, { "epoch": 1.0652588729169317, "ewc_loss": 0.05167342722415924, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002152205997845158, "grad_norm": 6.026648044586182, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.870139479637146, "num_tokens": 319389001.0, "step": 8374 }, { "epoch": 1.0653860831955222, "ewc_loss": 0.051654569804668427, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021503200696315616, "grad_norm": 5.970784664154053, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8721082210540771, "num_tokens": 319431464.0, "step": 8375 }, { "epoch": 1.0655132934741127, "ewc_loss": 0.05175965279340744, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.000216082829865627, "grad_norm": 6.020806312561035, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8398992419242859, "num_tokens": 319475638.0, "step": 8376 }, { "epoch": 1.0656405037527032, "ewc_loss": 0.05170261114835739, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021551242389250547, "grad_norm": 6.061503887176514, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8732903599739075, "num_tokens": 319515983.0, "step": 8377 }, { "epoch": 1.0657677140312938, "ewc_loss": 0.051720455288887024, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021569088858086616, "grad_norm": 6.110293388366699, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8596601486206055, "num_tokens": 319552152.0, "step": 8378 }, { "epoch": 1.0658949243098843, "ewc_loss": 0.051677972078323364, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021526605996768922, "grad_norm": 5.988702297210693, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8553832173347473, "num_tokens": 319592963.0, "step": 8379 }, { "epoch": 1.0660221345884748, "ewc_loss": 0.051759347319602966, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002160798030672595, "grad_norm": 6.093997478485107, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8495458364486694, "num_tokens": 319635136.0, "step": 8380 }, { "epoch": 1.0661493448670654, "ewc_loss": 0.051658205687999725, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021506838675122708, "grad_norm": 6.017782688140869, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8655434846878052, "num_tokens": 319670118.0, "step": 8381 }, { "epoch": 1.0662765551456557, "ewc_loss": 0.051763661205768585, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021612293494399637, "grad_norm": 6.056371212005615, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8697668313980103, "num_tokens": 319711057.0, "step": 8382 }, { "epoch": 1.0664037654242462, "ewc_loss": 0.05169704556465149, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002154567773686722, "grad_norm": 6.048020362854004, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8608801960945129, "num_tokens": 319746451.0, "step": 8383 }, { "epoch": 1.0665309757028367, "ewc_loss": 0.05175559222698212, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021604224457405508, "grad_norm": 6.041589260101318, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8568055629730225, "num_tokens": 319792149.0, "step": 8384 }, { "epoch": 1.0666581859814273, "ewc_loss": 0.051733098924160004, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021581730106845498, "grad_norm": 6.052861213684082, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8548246622085571, "num_tokens": 319826508.0, "step": 8385 }, { "epoch": 1.0667853962600178, "ewc_loss": 0.05173405259847641, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021582686167676002, "grad_norm": 6.082039833068848, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.866131067276001, "num_tokens": 319867000.0, "step": 8386 }, { "epoch": 1.0669126065386083, "ewc_loss": 0.05167614296078682, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021524775365833193, "grad_norm": 6.050893783569336, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8449230790138245, "num_tokens": 319904476.0, "step": 8387 }, { "epoch": 1.0670398168171988, "ewc_loss": 0.05170129984617233, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002154993126168847, "grad_norm": 6.0327839851379395, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8386139869689941, "num_tokens": 319943279.0, "step": 8388 }, { "epoch": 1.0671670270957894, "ewc_loss": 0.051713403314352036, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002156203699996695, "grad_norm": 6.032339096069336, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8583787679672241, "num_tokens": 319986610.0, "step": 8389 }, { "epoch": 1.06729423737438, "ewc_loss": 0.05172397941350937, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002157261042157188, "grad_norm": 6.0344367027282715, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8566070795059204, "num_tokens": 320022702.0, "step": 8390 }, { "epoch": 1.0674214476529704, "ewc_loss": 0.05180257186293602, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021651203860528767, "grad_norm": 6.034567356109619, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8652693629264832, "num_tokens": 320067918.0, "step": 8391 }, { "epoch": 1.067548657931561, "ewc_loss": 0.05172257125377655, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021571201796177775, "grad_norm": 6.02362060546875, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8550736308097839, "num_tokens": 320110455.0, "step": 8392 }, { "epoch": 1.0676758682101515, "ewc_loss": 0.05173622816801071, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021584863134194165, "grad_norm": 6.021742820739746, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8800718188285828, "num_tokens": 320147675.0, "step": 8393 }, { "epoch": 1.0678030784887418, "ewc_loss": 0.05175412446260452, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021602754713967443, "grad_norm": 6.067288398742676, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8495447039604187, "num_tokens": 320183882.0, "step": 8394 }, { "epoch": 1.0679302887673323, "ewc_loss": 0.051704034209251404, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002155266556655988, "grad_norm": 6.026226043701172, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8530686497688293, "num_tokens": 320222215.0, "step": 8395 }, { "epoch": 1.0680574990459228, "ewc_loss": 0.05179966986179352, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002164830220863223, "grad_norm": 6.065980911254883, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.855398952960968, "num_tokens": 320262418.0, "step": 8396 }, { "epoch": 1.0681847093245134, "ewc_loss": 0.051808279007673264, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021656912576872855, "grad_norm": 6.055502891540527, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8559675216674805, "num_tokens": 320297304.0, "step": 8397 }, { "epoch": 1.068311919603104, "ewc_loss": 0.05173307657241821, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021581706823781133, "grad_norm": 6.066014289855957, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8648583292961121, "num_tokens": 320330919.0, "step": 8398 }, { "epoch": 1.0684391298816944, "ewc_loss": 0.05177328735589981, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002162191813113168, "grad_norm": 6.110006809234619, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8627802729606628, "num_tokens": 320367602.0, "step": 8399 }, { "epoch": 1.068566340160285, "ewc_loss": 0.05172082036733627, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021569451200775802, "grad_norm": 6.018942356109619, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8631953001022339, "num_tokens": 320403875.0, "step": 8400 }, { "epoch": 1.0686935504388755, "ewc_loss": 0.051781944930553436, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021630579431075603, "grad_norm": 6.084086894989014, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8666564226150513, "num_tokens": 320439149.0, "step": 8401 }, { "epoch": 1.068820760717466, "ewc_loss": 0.05171356350183487, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021562195615842938, "grad_norm": 5.970733165740967, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8751975893974304, "num_tokens": 320476786.0, "step": 8402 }, { "epoch": 1.0689479709960565, "ewc_loss": 0.05176379531621933, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002161242882721126, "grad_norm": 6.094095706939697, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8473066091537476, "num_tokens": 320516945.0, "step": 8403 }, { "epoch": 1.069075181274647, "ewc_loss": 0.051745448261499405, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021594081772491336, "grad_norm": 5.986712455749512, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8777227401733398, "num_tokens": 320552801.0, "step": 8404 }, { "epoch": 1.0692023915532376, "ewc_loss": 0.05173551291227341, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021584145724773407, "grad_norm": 6.047788619995117, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8642668724060059, "num_tokens": 320593605.0, "step": 8405 }, { "epoch": 1.0693296018318281, "ewc_loss": 0.05177059769630432, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021619230392389, "grad_norm": 6.06183385848999, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8698630332946777, "num_tokens": 320624476.0, "step": 8406 }, { "epoch": 1.0694568121104184, "ewc_loss": 0.05173160135746002, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.000215802327147685, "grad_norm": 5.9896416664123535, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8695436716079712, "num_tokens": 320667000.0, "step": 8407 }, { "epoch": 1.069584022389009, "ewc_loss": 0.051734380424022675, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021583010675385594, "grad_norm": 6.102547645568848, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.867789626121521, "num_tokens": 320699286.0, "step": 8408 }, { "epoch": 1.0697112326675995, "ewc_loss": 0.05174396187067032, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002159259602194652, "grad_norm": 5.998167514801025, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8703603744506836, "num_tokens": 320732520.0, "step": 8409 }, { "epoch": 1.06983844294619, "ewc_loss": 0.05177511274814606, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002162374439649284, "grad_norm": 6.048269748687744, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8509808778762817, "num_tokens": 320773425.0, "step": 8410 }, { "epoch": 1.0699656532247805, "ewc_loss": 0.05174683406949043, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021595467114821076, "grad_norm": 6.017480850219727, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8406755924224854, "num_tokens": 320810859.0, "step": 8411 }, { "epoch": 1.070092863503371, "ewc_loss": 0.05180396884679794, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021652603754773736, "grad_norm": 6.036343574523926, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8664841055870056, "num_tokens": 320850266.0, "step": 8412 }, { "epoch": 1.0702200737819616, "ewc_loss": 0.05178165063261986, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021630284027196467, "grad_norm": 5.990900039672852, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8611867427825928, "num_tokens": 320892812.0, "step": 8413 }, { "epoch": 1.0703472840605521, "ewc_loss": 0.051790330559015274, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021638962789438665, "grad_norm": 6.033063888549805, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8733386993408203, "num_tokens": 320929563.0, "step": 8414 }, { "epoch": 1.0704744943391427, "ewc_loss": 0.05184946954250336, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021698104683309793, "grad_norm": 6.079565048217773, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8696516752243042, "num_tokens": 320964010.0, "step": 8415 }, { "epoch": 1.0706017046177332, "ewc_loss": 0.051762256771326065, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.000216108892345801, "grad_norm": 6.011856555938721, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8597832918167114, "num_tokens": 321006405.0, "step": 8416 }, { "epoch": 1.0707289148963237, "ewc_loss": 0.05181536078453064, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002166399353882298, "grad_norm": 6.011305809020996, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8580757975578308, "num_tokens": 321050146.0, "step": 8417 }, { "epoch": 1.070856125174914, "ewc_loss": 0.051838427782058716, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002168706269003451, "grad_norm": 6.083582401275635, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8624826669692993, "num_tokens": 321086857.0, "step": 8418 }, { "epoch": 1.0709833354535045, "ewc_loss": 0.051771461963653564, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021620093320962042, "grad_norm": 6.042621612548828, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.867908239364624, "num_tokens": 321127310.0, "step": 8419 }, { "epoch": 1.071110545732095, "ewc_loss": 0.05184762924909592, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021696262410841882, "grad_norm": 6.108704090118408, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8527367115020752, "num_tokens": 321165881.0, "step": 8420 }, { "epoch": 1.0712377560106856, "ewc_loss": 0.051823973655700684, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002167260681744665, "grad_norm": 6.058591842651367, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8565492033958435, "num_tokens": 321204494.0, "step": 8421 }, { "epoch": 1.0713649662892761, "ewc_loss": 0.051792118698358536, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002164075121982023, "grad_norm": 6.062374114990234, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8526624441146851, "num_tokens": 321243639.0, "step": 8422 }, { "epoch": 1.0714921765678667, "ewc_loss": 0.05182362720370293, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021672260481864214, "grad_norm": 6.1321258544921875, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8540576696395874, "num_tokens": 321281399.0, "step": 8423 }, { "epoch": 1.0716193868464572, "ewc_loss": 0.051732465624809265, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002158110000891611, "grad_norm": 6.099747657775879, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8498454093933105, "num_tokens": 321316539.0, "step": 8424 }, { "epoch": 1.0717465971250477, "ewc_loss": 0.051733359694480896, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021581993496511132, "grad_norm": 6.081660747528076, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8580049872398376, "num_tokens": 321348618.0, "step": 8425 }, { "epoch": 1.0718738074036382, "ewc_loss": 0.05175172537565231, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021600356558337808, "grad_norm": 6.063252925872803, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8682577610015869, "num_tokens": 321381674.0, "step": 8426 }, { "epoch": 1.0720010176822288, "ewc_loss": 0.051794249564409256, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021642881620209664, "grad_norm": 6.042839527130127, "learning_rate": 1e-06, "loss": 0.4002, "mean_token_accuracy": 0.8775128722190857, "num_tokens": 321420342.0, "step": 8427 }, { "epoch": 1.0721282279608193, "ewc_loss": 0.05190452188253403, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021631084382534027, "grad_norm": 6.0963969230651855, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8589110374450684, "num_tokens": 321450770.0, "step": 8428 }, { "epoch": 1.0722554382394098, "ewc_loss": 0.051829807460308075, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021678439225070179, "grad_norm": 6.0375261306762695, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8649306297302246, "num_tokens": 321491450.0, "step": 8429 }, { "epoch": 1.0723826485180004, "ewc_loss": 0.051965292543172836, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002169185463571921, "grad_norm": 6.098803520202637, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8558217287063599, "num_tokens": 321531774.0, "step": 8430 }, { "epoch": 1.0725098587965907, "ewc_loss": 0.05193819850683212, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021664760424755514, "grad_norm": 6.068349361419678, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8626947999000549, "num_tokens": 321567692.0, "step": 8431 }, { "epoch": 1.0726370690751812, "ewc_loss": 0.05179060250520706, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021639236365444958, "grad_norm": 6.180253982543945, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.857621431350708, "num_tokens": 321601168.0, "step": 8432 }, { "epoch": 1.0727642793537717, "ewc_loss": 0.0518670491874218, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002159361174562946, "grad_norm": 6.04246711730957, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8440234661102295, "num_tokens": 321636100.0, "step": 8433 }, { "epoch": 1.0728914896323622, "ewc_loss": 0.05186270922422409, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002158927236450836, "grad_norm": 6.053860664367676, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8633407354354858, "num_tokens": 321674365.0, "step": 8434 }, { "epoch": 1.0730186999109528, "ewc_loss": 0.05184739828109741, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021573962294496596, "grad_norm": 6.031279563903809, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8732181191444397, "num_tokens": 321712267.0, "step": 8435 }, { "epoch": 1.0731459101895433, "ewc_loss": 0.051775168627500534, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002162380114896223, "grad_norm": 6.068265914916992, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8596787452697754, "num_tokens": 321749955.0, "step": 8436 }, { "epoch": 1.0732731204681338, "ewc_loss": 0.05178426206111908, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021632897551171482, "grad_norm": 6.087452411651611, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8705711364746094, "num_tokens": 321788522.0, "step": 8437 }, { "epoch": 1.0734003307467244, "ewc_loss": 0.05189827084541321, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021624835790134966, "grad_norm": 6.0719828605651855, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8589633107185364, "num_tokens": 321826314.0, "step": 8438 }, { "epoch": 1.073527541025315, "ewc_loss": 0.05204763635993004, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002165212936233729, "grad_norm": 6.06663179397583, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8579224348068237, "num_tokens": 321864851.0, "step": 8439 }, { "epoch": 1.0736547513039054, "ewc_loss": 0.0521162673830986, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021720759104937315, "grad_norm": 6.134543418884277, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8721910715103149, "num_tokens": 321901001.0, "step": 8440 }, { "epoch": 1.073781961582496, "ewc_loss": 0.051805026829242706, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021653660223819315, "grad_norm": 6.020284175872803, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8598986268043518, "num_tokens": 321941368.0, "step": 8441 }, { "epoch": 1.0739091718610865, "ewc_loss": 0.05180724337697029, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021655876480508596, "grad_norm": 6.088881969451904, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.864758312702179, "num_tokens": 321978268.0, "step": 8442 }, { "epoch": 1.0740363821396768, "ewc_loss": 0.05179210752248764, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021640742488671094, "grad_norm": 5.979498386383057, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8700329065322876, "num_tokens": 322016969.0, "step": 8443 }, { "epoch": 1.0741635924182673, "ewc_loss": 0.05183464661240578, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021683279192075133, "grad_norm": 6.073604583740234, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8549767732620239, "num_tokens": 322059362.0, "step": 8444 }, { "epoch": 1.0742908026968578, "ewc_loss": 0.05185217782855034, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021700809884350747, "grad_norm": 6.058164596557617, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8432663679122925, "num_tokens": 322097334.0, "step": 8445 }, { "epoch": 1.0744180129754484, "ewc_loss": 0.05185164511203766, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021700277284253389, "grad_norm": 6.117473602294922, "learning_rate": 1e-06, "loss": 0.3811, "mean_token_accuracy": 0.8826563358306885, "num_tokens": 322121988.0, "step": 8446 }, { "epoch": 1.074545223254039, "ewc_loss": 0.051822640001773834, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021671270951628685, "grad_norm": 6.0533976554870605, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8575299978256226, "num_tokens": 322156602.0, "step": 8447 }, { "epoch": 1.0746724335326294, "ewc_loss": 0.051856257021427155, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021704887330997735, "grad_norm": 6.076155662536621, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8590410947799683, "num_tokens": 322194475.0, "step": 8448 }, { "epoch": 1.07479964381122, "ewc_loss": 0.05195586383342743, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021682427905034274, "grad_norm": 6.045651912689209, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8712313175201416, "num_tokens": 322232158.0, "step": 8449 }, { "epoch": 1.0749268540898105, "ewc_loss": 0.05191986635327339, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002164642937714234, "grad_norm": 6.026855945587158, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8708983659744263, "num_tokens": 322272170.0, "step": 8450 }, { "epoch": 1.075054064368401, "ewc_loss": 0.05196717381477356, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002169373765354976, "grad_norm": 6.050841331481934, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.867475152015686, "num_tokens": 322305496.0, "step": 8451 }, { "epoch": 1.0751812746469915, "ewc_loss": 0.05195485055446625, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002168141509173438, "grad_norm": 6.011646270751953, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.866477906703949, "num_tokens": 322347552.0, "step": 8452 }, { "epoch": 1.075308484925582, "ewc_loss": 0.05199313908815384, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021719704091083258, "grad_norm": 6.05488395690918, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8652408123016357, "num_tokens": 322386471.0, "step": 8453 }, { "epoch": 1.0754356952041726, "ewc_loss": 0.05201222002506256, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021738780196756124, "grad_norm": 6.068288803100586, "learning_rate": 1e-06, "loss": 0.5609, "mean_token_accuracy": 0.8281084299087524, "num_tokens": 322431019.0, "step": 8454 }, { "epoch": 1.0755629054827631, "ewc_loss": 0.05199592560529709, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021722487872466445, "grad_norm": 6.01593542098999, "learning_rate": 1e-06, "loss": 0.3925, "mean_token_accuracy": 0.8805825114250183, "num_tokens": 322471525.0, "step": 8455 }, { "epoch": 1.0756901157613534, "ewc_loss": 0.052058637142181396, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021785199351143092, "grad_norm": 6.0557146072387695, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8655263781547546, "num_tokens": 322516594.0, "step": 8456 }, { "epoch": 1.075817326039944, "ewc_loss": 0.05186804383993149, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021716674382332712, "grad_norm": 6.0185770988464355, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.868615984916687, "num_tokens": 322557837.0, "step": 8457 }, { "epoch": 1.0759445363185345, "ewc_loss": 0.05205082893371582, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021777393703814596, "grad_norm": 6.128353595733643, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8577344417572021, "num_tokens": 322594165.0, "step": 8458 }, { "epoch": 1.076071746597125, "ewc_loss": 0.05183742940425873, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021686060063075274, "grad_norm": 6.041412830352783, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.845521092414856, "num_tokens": 322637083.0, "step": 8459 }, { "epoch": 1.0761989568757155, "ewc_loss": 0.05188625305891037, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002173488464904949, "grad_norm": 6.0910186767578125, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8612352013587952, "num_tokens": 322673216.0, "step": 8460 }, { "epoch": 1.076326167154306, "ewc_loss": 0.05183188244700432, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021680514328181744, "grad_norm": 6.005856037139893, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8663191199302673, "num_tokens": 322709150.0, "step": 8461 }, { "epoch": 1.0764533774328966, "ewc_loss": 0.05186273157596588, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021711365843657404, "grad_norm": 6.045287132263184, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8685939311981201, "num_tokens": 322747893.0, "step": 8462 }, { "epoch": 1.0765805877114871, "ewc_loss": 0.05180767923593521, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021656313037965447, "grad_norm": 6.076136589050293, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8457233905792236, "num_tokens": 322781667.0, "step": 8463 }, { "epoch": 1.0767077979900777, "ewc_loss": 0.051883719861507416, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021732352615799755, "grad_norm": 6.078590393066406, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8618143796920776, "num_tokens": 322818008.0, "step": 8464 }, { "epoch": 1.0768350082686682, "ewc_loss": 0.05183364078402519, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021682273654732853, "grad_norm": 6.048191547393799, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8419216871261597, "num_tokens": 322856528.0, "step": 8465 }, { "epoch": 1.0769622185472587, "ewc_loss": 0.05189021676778793, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021738851501140743, "grad_norm": 6.099365711212158, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8655760288238525, "num_tokens": 322888798.0, "step": 8466 }, { "epoch": 1.077089428825849, "ewc_loss": 0.05189259350299835, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021741227828897536, "grad_norm": 6.078643798828125, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8637472987174988, "num_tokens": 322923301.0, "step": 8467 }, { "epoch": 1.0772166391044395, "ewc_loss": 0.05203019082546234, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021756751812063158, "grad_norm": 6.283557891845703, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.874579668045044, "num_tokens": 322964253.0, "step": 8468 }, { "epoch": 1.07734384938303, "ewc_loss": 0.05184265971183777, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002169129002140835, "grad_norm": 5.999697208404541, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8606857061386108, "num_tokens": 323006926.0, "step": 8469 }, { "epoch": 1.0774710596616206, "ewc_loss": 0.05186937004327774, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021718002972193062, "grad_norm": 6.098931789398193, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8751005530357361, "num_tokens": 323047541.0, "step": 8470 }, { "epoch": 1.0775982699402111, "ewc_loss": 0.051886022090911865, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002173465327359736, "grad_norm": 6.096598148345947, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8547093868255615, "num_tokens": 323084272.0, "step": 8471 }, { "epoch": 1.0777254802188017, "ewc_loss": 0.051870301365852356, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.000217189357499592, "grad_norm": 6.109490871429443, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.87351393699646, "num_tokens": 323116698.0, "step": 8472 }, { "epoch": 1.0778526904973922, "ewc_loss": 0.05186624079942703, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021714874310418963, "grad_norm": 6.020013332366943, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8699262142181396, "num_tokens": 323158836.0, "step": 8473 }, { "epoch": 1.0779799007759827, "ewc_loss": 0.051891110837459564, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021739744988735765, "grad_norm": 6.118597507476807, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8586547374725342, "num_tokens": 323193777.0, "step": 8474 }, { "epoch": 1.0781071110545732, "ewc_loss": 0.05185241624712944, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021701049990952015, "grad_norm": 6.054287910461426, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8631277680397034, "num_tokens": 323232546.0, "step": 8475 }, { "epoch": 1.0782343213331638, "ewc_loss": 0.051917243748903275, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021765877318102866, "grad_norm": 6.132261276245117, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8629347085952759, "num_tokens": 323269403.0, "step": 8476 }, { "epoch": 1.0783615316117543, "ewc_loss": 0.05182957276701927, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021678206394426525, "grad_norm": 6.09111213684082, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8754370808601379, "num_tokens": 323305982.0, "step": 8477 }, { "epoch": 1.0784887418903448, "ewc_loss": 0.05182081460952759, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002166944759665057, "grad_norm": 6.098718643188477, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.857637882232666, "num_tokens": 323341032.0, "step": 8478 }, { "epoch": 1.0786159521689354, "ewc_loss": 0.05184296891093254, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021691601432394236, "grad_norm": 6.066669940948486, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8572454452514648, "num_tokens": 323381647.0, "step": 8479 }, { "epoch": 1.0787431624475257, "ewc_loss": 0.0518881231546402, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021736754570156336, "grad_norm": 6.089073181152344, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8666214346885681, "num_tokens": 323417185.0, "step": 8480 }, { "epoch": 1.0788703727261162, "ewc_loss": 0.05192361772060394, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021772249601781368, "grad_norm": 6.095911026000977, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8628969192504883, "num_tokens": 323454624.0, "step": 8481 }, { "epoch": 1.0789975830047067, "ewc_loss": 0.051942914724349976, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021791546896565706, "grad_norm": 6.078675270080566, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8633750081062317, "num_tokens": 323492862.0, "step": 8482 }, { "epoch": 1.0791247932832972, "ewc_loss": 0.051890190690755844, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002173882385250181, "grad_norm": 6.0699052810668945, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8590165376663208, "num_tokens": 323536012.0, "step": 8483 }, { "epoch": 1.0792520035618878, "ewc_loss": 0.05190971493721008, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021758345246780664, "grad_norm": 6.0910139083862305, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8668302297592163, "num_tokens": 323573495.0, "step": 8484 }, { "epoch": 1.0793792138404783, "ewc_loss": 0.05194525420665741, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021793885389342904, "grad_norm": 6.080476760864258, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8720964789390564, "num_tokens": 323610900.0, "step": 8485 }, { "epoch": 1.0795064241190688, "ewc_loss": 0.05189885199069977, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021747482242062688, "grad_norm": 6.079319953918457, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8613952398300171, "num_tokens": 323650983.0, "step": 8486 }, { "epoch": 1.0796336343976594, "ewc_loss": 0.051963962614536285, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021812593331560493, "grad_norm": 6.117316722869873, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8547984957695007, "num_tokens": 323690155.0, "step": 8487 }, { "epoch": 1.0797608446762499, "ewc_loss": 0.05188419669866562, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021732831373810768, "grad_norm": 6.054364204406738, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8762363195419312, "num_tokens": 323732212.0, "step": 8488 }, { "epoch": 1.0798880549548404, "ewc_loss": 0.05191609263420105, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021764727716799825, "grad_norm": 6.107194423675537, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8715318441390991, "num_tokens": 323773400.0, "step": 8489 }, { "epoch": 1.080015265233431, "ewc_loss": 0.05185769125819206, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021706323605030775, "grad_norm": 6.086803913116455, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.869423508644104, "num_tokens": 323812753.0, "step": 8490 }, { "epoch": 1.0801424755120215, "ewc_loss": 0.051931899040937424, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021780531096737832, "grad_norm": 6.05051851272583, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8646602034568787, "num_tokens": 323855835.0, "step": 8491 }, { "epoch": 1.0802696857906118, "ewc_loss": 0.05189686268568039, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002174549299525097, "grad_norm": 6.059953689575195, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8760043382644653, "num_tokens": 323896065.0, "step": 8492 }, { "epoch": 1.0803968960692023, "ewc_loss": 0.05189462751150131, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002174326073145494, "grad_norm": 6.126089572906494, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8653805255889893, "num_tokens": 323934351.0, "step": 8493 }, { "epoch": 1.0805241063477928, "ewc_loss": 0.05190514028072357, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002175377303501591, "grad_norm": 6.1134209632873535, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.852389931678772, "num_tokens": 323968878.0, "step": 8494 }, { "epoch": 1.0806513166263834, "ewc_loss": 0.051858481019735336, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021707113774027675, "grad_norm": 6.079893112182617, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8622783422470093, "num_tokens": 324004838.0, "step": 8495 }, { "epoch": 1.080778526904974, "ewc_loss": 0.05188889056444168, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002173752145608887, "grad_norm": 6.104691982269287, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8788374662399292, "num_tokens": 324040851.0, "step": 8496 }, { "epoch": 1.0809057371835644, "ewc_loss": 0.051859065890312195, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021707700216211379, "grad_norm": 6.084632873535156, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8506470918655396, "num_tokens": 324084675.0, "step": 8497 }, { "epoch": 1.081032947462155, "ewc_loss": 0.05190465971827507, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021753292821813375, "grad_norm": 6.10150146484375, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.868104100227356, "num_tokens": 324122136.0, "step": 8498 }, { "epoch": 1.0811601577407455, "ewc_loss": 0.051859673112630844, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021708305575884879, "grad_norm": 6.104365348815918, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8616517782211304, "num_tokens": 324153142.0, "step": 8499 }, { "epoch": 1.081287368019336, "ewc_loss": 0.051865141838788986, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.000217137741856277, "grad_norm": 6.105108737945557, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8440753221511841, "num_tokens": 324187244.0, "step": 8500 }, { "epoch": 1.0814145782979265, "ewc_loss": 0.051855504512786865, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021704137907363474, "grad_norm": 6.0813212394714355, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8658169507980347, "num_tokens": 324225644.0, "step": 8501 }, { "epoch": 1.081541788576517, "ewc_loss": 0.05189035087823868, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.000217389824683778, "grad_norm": 6.0715765953063965, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8710463047027588, "num_tokens": 324270394.0, "step": 8502 }, { "epoch": 1.0816689988551076, "ewc_loss": 0.05191892012953758, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021767552243545651, "grad_norm": 6.061651229858398, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8740255236625671, "num_tokens": 324311244.0, "step": 8503 }, { "epoch": 1.0817962091336981, "ewc_loss": 0.05186391621828079, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002171254891436547, "grad_norm": 6.0720295906066895, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8447641134262085, "num_tokens": 324350821.0, "step": 8504 }, { "epoch": 1.0819234194122884, "ewc_loss": 0.05192077159881592, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.000217694032471627, "grad_norm": 6.0944719314575195, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8616496920585632, "num_tokens": 324388255.0, "step": 8505 }, { "epoch": 1.082050629690879, "ewc_loss": 0.051937032490968704, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.000217856650124304, "grad_norm": 6.09110689163208, "learning_rate": 1e-06, "loss": 0.5237, "mean_token_accuracy": 0.8380483984947205, "num_tokens": 324430068.0, "step": 8506 }, { "epoch": 1.0821778399694695, "ewc_loss": 0.051953282207250595, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021801915136165917, "grad_norm": 6.0969462394714355, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8686463832855225, "num_tokens": 324465341.0, "step": 8507 }, { "epoch": 1.08230505024806, "ewc_loss": 0.0519094318151474, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021758067305199802, "grad_norm": 6.066614151000977, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8711940050125122, "num_tokens": 324502199.0, "step": 8508 }, { "epoch": 1.0824322605266505, "ewc_loss": 0.05198260396718979, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.000218312387005426, "grad_norm": 6.116363048553467, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8645410537719727, "num_tokens": 324535898.0, "step": 8509 }, { "epoch": 1.082559470805241, "ewc_loss": 0.051921773701906204, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021770405874121934, "grad_norm": 6.083701133728027, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8520714044570923, "num_tokens": 324574404.0, "step": 8510 }, { "epoch": 1.0826866810838316, "ewc_loss": 0.051969945430755615, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021818579989485443, "grad_norm": 6.107985019683838, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8616116046905518, "num_tokens": 324611743.0, "step": 8511 }, { "epoch": 1.0828138913624221, "ewc_loss": 0.052134349942207336, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021860912966076285, "grad_norm": 6.032649993896484, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8531675934791565, "num_tokens": 324653126.0, "step": 8512 }, { "epoch": 1.0829411016410126, "ewc_loss": 0.05217535048723221, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00021901911532040685, "grad_norm": 6.080007553100586, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8694418668746948, "num_tokens": 324692957.0, "step": 8513 }, { "epoch": 1.0830683119196032, "ewc_loss": 0.052237074822187424, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021841567649971694, "grad_norm": 6.078350067138672, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8587682247161865, "num_tokens": 324737214.0, "step": 8514 }, { "epoch": 1.0831955221981937, "ewc_loss": 0.052318695932626724, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002192318788729608, "grad_norm": 6.10826301574707, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8745025396347046, "num_tokens": 324773098.0, "step": 8515 }, { "epoch": 1.083322732476784, "ewc_loss": 0.05201561003923416, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021864242444280535, "grad_norm": 6.108156681060791, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8515288829803467, "num_tokens": 324815846.0, "step": 8516 }, { "epoch": 1.0834499427553745, "ewc_loss": 0.05218592286109924, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021790413302369416, "grad_norm": 6.053069114685059, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.860032320022583, "num_tokens": 324855912.0, "step": 8517 }, { "epoch": 1.083577153033965, "ewc_loss": 0.05204300954937935, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021891642245464027, "grad_norm": 6.166825294494629, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8570177555084229, "num_tokens": 324893090.0, "step": 8518 }, { "epoch": 1.0837043633125556, "ewc_loss": 0.051956888288259506, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021805521100759506, "grad_norm": 6.070196628570557, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8629635572433472, "num_tokens": 324929779.0, "step": 8519 }, { "epoch": 1.0838315735911461, "ewc_loss": 0.05205491930246353, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.000219035500776954, "grad_norm": 6.141937255859375, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8685542345046997, "num_tokens": 324961349.0, "step": 8520 }, { "epoch": 1.0839587838697367, "ewc_loss": 0.0519944503903389, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021843085414730012, "grad_norm": 6.0736470222473145, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8627220392227173, "num_tokens": 324998688.0, "step": 8521 }, { "epoch": 1.0840859941483272, "ewc_loss": 0.052013084292411804, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021861719142179936, "grad_norm": 6.1078715324401855, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8553637266159058, "num_tokens": 325040601.0, "step": 8522 }, { "epoch": 1.0842132044269177, "ewc_loss": 0.05205434560775757, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002190298109780997, "grad_norm": 6.084104061126709, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8665935397148132, "num_tokens": 325081729.0, "step": 8523 }, { "epoch": 1.0843404147055082, "ewc_loss": 0.05202116072177887, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021869792544748634, "grad_norm": 6.113832473754883, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8518958687782288, "num_tokens": 325118886.0, "step": 8524 }, { "epoch": 1.0844676249840988, "ewc_loss": 0.05203753709793091, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021886167814955115, "grad_norm": 6.0448174476623535, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8812699913978577, "num_tokens": 325156417.0, "step": 8525 }, { "epoch": 1.0845948352626893, "ewc_loss": 0.05200906842947006, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021857701358385384, "grad_norm": 6.099598407745361, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8740701079368591, "num_tokens": 325200902.0, "step": 8526 }, { "epoch": 1.0847220455412798, "ewc_loss": 0.05206894874572754, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021917583944741637, "grad_norm": 6.115983486175537, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8650400042533875, "num_tokens": 325233362.0, "step": 8527 }, { "epoch": 1.0848492558198704, "ewc_loss": 0.05197322368621826, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021821854170411825, "grad_norm": 6.136691570281982, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8626163005828857, "num_tokens": 325274646.0, "step": 8528 }, { "epoch": 1.0849764660984607, "ewc_loss": 0.05226140469312668, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021865896997042, "grad_norm": 6.086031436920166, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.8787060379981995, "num_tokens": 325304735.0, "step": 8529 }, { "epoch": 1.0851036763770512, "ewc_loss": 0.052012763917446136, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021861399000044912, "grad_norm": 6.099017143249512, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.866294801235199, "num_tokens": 325339833.0, "step": 8530 }, { "epoch": 1.0852308866556417, "ewc_loss": 0.05203036218881607, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002187899372074753, "grad_norm": 6.140335559844971, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8508455753326416, "num_tokens": 325373838.0, "step": 8531 }, { "epoch": 1.0853580969342322, "ewc_loss": 0.052006110548973083, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021854742954019457, "grad_norm": 6.032598495483398, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8839016556739807, "num_tokens": 325412913.0, "step": 8532 }, { "epoch": 1.0854853072128228, "ewc_loss": 0.052101247012615204, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002194987901020795, "grad_norm": 6.15983247756958, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8685010671615601, "num_tokens": 325451830.0, "step": 8533 }, { "epoch": 1.0856125174914133, "ewc_loss": 0.05202899128198624, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021877625840716064, "grad_norm": 6.105002403259277, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8455963134765625, "num_tokens": 325487162.0, "step": 8534 }, { "epoch": 1.0857397277700038, "ewc_loss": 0.05207554250955582, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002192417741753161, "grad_norm": 6.106500625610352, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8735126256942749, "num_tokens": 325525906.0, "step": 8535 }, { "epoch": 1.0858669380485944, "ewc_loss": 0.052326928824186325, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002193142136093229, "grad_norm": 6.107565402984619, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8750075101852417, "num_tokens": 325559925.0, "step": 8536 }, { "epoch": 1.0859941483271849, "ewc_loss": 0.05229552090167999, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002190001105191186, "grad_norm": 6.048425674438477, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8621441125869751, "num_tokens": 325599980.0, "step": 8537 }, { "epoch": 1.0861213586057754, "ewc_loss": 0.05237782001495361, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021982312318868935, "grad_norm": 6.151440620422363, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8463361859321594, "num_tokens": 325631745.0, "step": 8538 }, { "epoch": 1.086248568884366, "ewc_loss": 0.05233369022607803, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021938185091130435, "grad_norm": 6.057959079742432, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8595585227012634, "num_tokens": 325673583.0, "step": 8539 }, { "epoch": 1.0863757791629565, "ewc_loss": 0.05241283401846886, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022017327137291431, "grad_norm": 6.117385387420654, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8548415899276733, "num_tokens": 325712995.0, "step": 8540 }, { "epoch": 1.0865029894415468, "ewc_loss": 0.052420660853385925, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022025153157301247, "grad_norm": 6.120882511138916, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8684996366500854, "num_tokens": 325748259.0, "step": 8541 }, { "epoch": 1.0866301997201373, "ewc_loss": 0.05231102555990219, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021915517572779208, "grad_norm": 6.025977611541748, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8681516647338867, "num_tokens": 325787676.0, "step": 8542 }, { "epoch": 1.0867574099987278, "ewc_loss": 0.052394695580005646, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002199918671976775, "grad_norm": 6.138982772827148, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8380465507507324, "num_tokens": 325827819.0, "step": 8543 }, { "epoch": 1.0868846202773184, "ewc_loss": 0.05232247710227966, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021926967019680887, "grad_norm": 6.067394256591797, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8718822002410889, "num_tokens": 325866328.0, "step": 8544 }, { "epoch": 1.0870118305559089, "ewc_loss": 0.05240652710199356, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022011017426848412, "grad_norm": 6.094040870666504, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8580713272094727, "num_tokens": 325906752.0, "step": 8545 }, { "epoch": 1.0871390408344994, "ewc_loss": 0.052335307002067566, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002193979889852926, "grad_norm": 6.079251289367676, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8566756248474121, "num_tokens": 325943765.0, "step": 8546 }, { "epoch": 1.08726625111309, "ewc_loss": 0.052421048283576965, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022025540238246322, "grad_norm": 6.113809585571289, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.86958909034729, "num_tokens": 325981471.0, "step": 8547 }, { "epoch": 1.0873934613916805, "ewc_loss": 0.05237988382577896, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021984378690831363, "grad_norm": 6.154775619506836, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8682328462600708, "num_tokens": 326017253.0, "step": 8548 }, { "epoch": 1.087520671670271, "ewc_loss": 0.052330538630485535, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021935030235908926, "grad_norm": 6.130898475646973, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.863549530506134, "num_tokens": 326053484.0, "step": 8549 }, { "epoch": 1.0876478819488615, "ewc_loss": 0.05203799158334732, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021886624745093286, "grad_norm": 6.165164947509766, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.869831919670105, "num_tokens": 326084779.0, "step": 8550 }, { "epoch": 1.087775092227452, "ewc_loss": 0.05205276608467102, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002190140075981617, "grad_norm": 6.112009525299072, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8608297109603882, "num_tokens": 326120795.0, "step": 8551 }, { "epoch": 1.0879023025060426, "ewc_loss": 0.05204314738512039, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002189177757827565, "grad_norm": 6.106500625610352, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8611584901809692, "num_tokens": 326155991.0, "step": 8552 }, { "epoch": 1.0880295127846331, "ewc_loss": 0.05201108008623123, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002185971534345299, "grad_norm": 6.093930721282959, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8588454723358154, "num_tokens": 326195738.0, "step": 8553 }, { "epoch": 1.0881567230632234, "ewc_loss": 0.052080027759075165, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021928659407421947, "grad_norm": 6.096792697906494, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8628622889518738, "num_tokens": 326237773.0, "step": 8554 }, { "epoch": 1.088283933341814, "ewc_loss": 0.0520506277680397, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021899260173086077, "grad_norm": 6.084860801696777, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8764533400535583, "num_tokens": 326278589.0, "step": 8555 }, { "epoch": 1.0884111436204045, "ewc_loss": 0.05207522213459015, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021923855820205063, "grad_norm": 6.098794937133789, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8562260866165161, "num_tokens": 326316862.0, "step": 8556 }, { "epoch": 1.088538353898995, "ewc_loss": 0.05207601934671402, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021924650354776531, "grad_norm": 6.161157608032227, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.860028862953186, "num_tokens": 326353892.0, "step": 8557 }, { "epoch": 1.0886655641775855, "ewc_loss": 0.052076853811740875, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021925488545093685, "grad_norm": 6.089035511016846, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8680411577224731, "num_tokens": 326389478.0, "step": 8558 }, { "epoch": 1.088792774456176, "ewc_loss": 0.05237283557653427, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021977329743094742, "grad_norm": 10.842890739440918, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8576551079750061, "num_tokens": 326428072.0, "step": 8559 }, { "epoch": 1.0889199847347666, "ewc_loss": 0.05848180502653122, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002808629651553929, "grad_norm": 6.956250190734863, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8664695620536804, "num_tokens": 326466374.0, "step": 8560 }, { "epoch": 1.0890471950133571, "ewc_loss": 0.05106111243367195, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00020665605552494526, "grad_norm": 5.81987190246582, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8586996793746948, "num_tokens": 326501081.0, "step": 8561 }, { "epoch": 1.0891744052919476, "ewc_loss": 0.05312534421682358, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00022973978775553405, "grad_norm": 6.582756042480469, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8711448311805725, "num_tokens": 326532539.0, "step": 8562 }, { "epoch": 1.0893016155705382, "ewc_loss": 0.052551448345184326, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002240007888758555, "grad_norm": 6.0477142333984375, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8684580326080322, "num_tokens": 326569481.0, "step": 8563 }, { "epoch": 1.0894288258491287, "ewc_loss": 0.05247200280427933, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00022320635616779327, "grad_norm": 6.328928470611572, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8605383634567261, "num_tokens": 326609746.0, "step": 8564 }, { "epoch": 1.089556036127719, "ewc_loss": 0.052416518330574036, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00022265149164013565, "grad_norm": 6.0996994972229, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8534458875656128, "num_tokens": 326652379.0, "step": 8565 }, { "epoch": 1.0896832464063095, "ewc_loss": 0.05250653252005577, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002211102400906384, "grad_norm": 6.203207969665527, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.858726978302002, "num_tokens": 326687617.0, "step": 8566 }, { "epoch": 1.0898104566849, "ewc_loss": 0.05265234410762787, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022256838565226644, "grad_norm": 6.2127509117126465, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8543219566345215, "num_tokens": 326724155.0, "step": 8567 }, { "epoch": 1.0899376669634906, "ewc_loss": 0.052444763481616974, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022049256949685514, "grad_norm": 6.1131978034973145, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8617087602615356, "num_tokens": 326768234.0, "step": 8568 }, { "epoch": 1.0900648772420811, "ewc_loss": 0.052459247410297394, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022063740470912308, "grad_norm": 6.111288070678711, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8680552244186401, "num_tokens": 326810423.0, "step": 8569 }, { "epoch": 1.0901920875206716, "ewc_loss": 0.05241210386157036, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022016596631146967, "grad_norm": 6.13496208190918, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8609710931777954, "num_tokens": 326851834.0, "step": 8570 }, { "epoch": 1.0903192977992622, "ewc_loss": 0.05244627222418785, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022050764528103173, "grad_norm": 6.11314582824707, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8495989441871643, "num_tokens": 326889334.0, "step": 8571 }, { "epoch": 1.0904465080778527, "ewc_loss": 0.0523725301027298, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.000219770212424919, "grad_norm": 6.107856750488281, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.8510429859161377, "num_tokens": 326931884.0, "step": 8572 }, { "epoch": 1.0905737183564432, "ewc_loss": 0.05246785655617714, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022072347928769886, "grad_norm": 6.143669605255127, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8704793453216553, "num_tokens": 326969043.0, "step": 8573 }, { "epoch": 1.0907009286350338, "ewc_loss": 0.05238477885723114, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021989272499922663, "grad_norm": 6.23097038269043, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8631225824356079, "num_tokens": 326998369.0, "step": 8574 }, { "epoch": 1.0908281389136243, "ewc_loss": 0.05233516916632652, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021939660655334592, "grad_norm": 6.044476509094238, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8619338274002075, "num_tokens": 327036909.0, "step": 8575 }, { "epoch": 1.0909553491922148, "ewc_loss": 0.05244363471865654, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022048127721063793, "grad_norm": 6.215852737426758, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8544157147407532, "num_tokens": 327076053.0, "step": 8576 }, { "epoch": 1.0910825594708053, "ewc_loss": 0.05230088159441948, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021905373432673514, "grad_norm": 6.021557807922363, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8698372840881348, "num_tokens": 327112285.0, "step": 8577 }, { "epoch": 1.0912097697493957, "ewc_loss": 0.052381470799446106, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021985960484016687, "grad_norm": 6.137906551361084, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8685810565948486, "num_tokens": 327151591.0, "step": 8578 }, { "epoch": 1.0913369800279862, "ewc_loss": 0.052366506308317184, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002197099820477888, "grad_norm": 6.078619956970215, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8483722805976868, "num_tokens": 327193600.0, "step": 8579 }, { "epoch": 1.0914641903065767, "ewc_loss": 0.0523730143904686, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021977505821269006, "grad_norm": 6.132792949676514, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8528239727020264, "num_tokens": 327230249.0, "step": 8580 }, { "epoch": 1.0915914005851672, "ewc_loss": 0.052342742681503296, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021947234927210957, "grad_norm": 6.083484172821045, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8764445185661316, "num_tokens": 327270043.0, "step": 8581 }, { "epoch": 1.0917186108637578, "ewc_loss": 0.05234800651669502, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021952498354949057, "grad_norm": 6.159136772155762, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8645685911178589, "num_tokens": 327309396.0, "step": 8582 }, { "epoch": 1.0918458211423483, "ewc_loss": 0.052350785583257675, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021955277770757675, "grad_norm": 6.098062992095947, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8699427843093872, "num_tokens": 327349497.0, "step": 8583 }, { "epoch": 1.0919730314209388, "ewc_loss": 0.05234609544277191, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002195058623328805, "grad_norm": 6.188375473022461, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.854731559753418, "num_tokens": 327385648.0, "step": 8584 }, { "epoch": 1.0921002416995294, "ewc_loss": 0.05228985473513603, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021894345991313457, "grad_norm": 6.064081192016602, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8636909127235413, "num_tokens": 327426379.0, "step": 8585 }, { "epoch": 1.0922274519781199, "ewc_loss": 0.05227603390812874, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021880526037421077, "grad_norm": 6.130795001983643, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8676459789276123, "num_tokens": 327467555.0, "step": 8586 }, { "epoch": 1.0923546622567104, "ewc_loss": 0.05231081694364548, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021915310935582966, "grad_norm": 6.124442100524902, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8590778112411499, "num_tokens": 327505303.0, "step": 8587 }, { "epoch": 1.092481872535301, "ewc_loss": 0.05236315727233887, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002196764835389331, "grad_norm": 6.14157772064209, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8690640926361084, "num_tokens": 327547854.0, "step": 8588 }, { "epoch": 1.0926090828138915, "ewc_loss": 0.0522441565990448, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021848647156730294, "grad_norm": 6.182678699493408, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8651120662689209, "num_tokens": 327584132.0, "step": 8589 }, { "epoch": 1.0927362930924818, "ewc_loss": 0.05225801840424538, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002186251076636836, "grad_norm": 6.077885627746582, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.850585401058197, "num_tokens": 327629250.0, "step": 8590 }, { "epoch": 1.0928635033710723, "ewc_loss": 0.0522642582654953, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021868752082809806, "grad_norm": 6.153239727020264, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8546562194824219, "num_tokens": 327666988.0, "step": 8591 }, { "epoch": 1.0929907136496628, "ewc_loss": 0.05225483328104019, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021859326807316393, "grad_norm": 6.073026657104492, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8645721673965454, "num_tokens": 327705955.0, "step": 8592 }, { "epoch": 1.0931179239282534, "ewc_loss": 0.052289895713329315, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021894386736676097, "grad_norm": 6.092440128326416, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8545805215835571, "num_tokens": 327746362.0, "step": 8593 }, { "epoch": 1.0932451342068439, "ewc_loss": 0.05232086032629013, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021925350301899016, "grad_norm": 6.096973419189453, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8625391721725464, "num_tokens": 327789034.0, "step": 8594 }, { "epoch": 1.0933723444854344, "ewc_loss": 0.05228717625141144, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021891668438911438, "grad_norm": 6.165968894958496, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8519666194915771, "num_tokens": 327824203.0, "step": 8595 }, { "epoch": 1.093499554764025, "ewc_loss": 0.05233391374349594, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021938404825050384, "grad_norm": 6.131064414978027, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8636163473129272, "num_tokens": 327860340.0, "step": 8596 }, { "epoch": 1.0936267650426155, "ewc_loss": 0.05204085633158684, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021889488562010229, "grad_norm": 6.1157379150390625, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8549913763999939, "num_tokens": 327901709.0, "step": 8597 }, { "epoch": 1.093753975321206, "ewc_loss": 0.052021220326423645, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021869853662792593, "grad_norm": 6.084414005279541, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8630814552307129, "num_tokens": 327940327.0, "step": 8598 }, { "epoch": 1.0938811855997965, "ewc_loss": 0.05206403136253357, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021912663942202926, "grad_norm": 6.077951908111572, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8678882122039795, "num_tokens": 327977966.0, "step": 8599 }, { "epoch": 1.094008395878387, "ewc_loss": 0.05204112082719803, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021889753406867385, "grad_norm": 6.083368301391602, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8586738109588623, "num_tokens": 328021143.0, "step": 8600 }, { "epoch": 1.0941356061569776, "ewc_loss": 0.05211927741765976, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021967911743558943, "grad_norm": 6.088532447814941, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8687642812728882, "num_tokens": 328060792.0, "step": 8601 }, { "epoch": 1.094262816435568, "ewc_loss": 0.05213721841573715, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021985852799843997, "grad_norm": 6.1378045082092285, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8633384704589844, "num_tokens": 328099698.0, "step": 8602 }, { "epoch": 1.0943900267141584, "ewc_loss": 0.052130602300167084, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00021979233133606613, "grad_norm": 6.066873550415039, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8655576705932617, "num_tokens": 328141075.0, "step": 8603 }, { "epoch": 1.094517236992749, "ewc_loss": 0.05217337608337402, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.00022022008488420397, "grad_norm": 6.14460563659668, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8554254174232483, "num_tokens": 328178945.0, "step": 8604 }, { "epoch": 1.0946444472713395, "ewc_loss": 0.052120599895715714, "ewc_loss_diag": 3.0159950256347656e-05, "ewc_loss_parallel": 0.0002196923305746168, "grad_norm": 6.13823938369751, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8361742496490479, "num_tokens": 328214423.0, "step": 8605 }, { "epoch": 1.09477165754993, "ewc_loss": 0.05240380018949509, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022008294763509184, "grad_norm": 6.168235778808594, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8627845048904419, "num_tokens": 328248640.0, "step": 8606 }, { "epoch": 1.0948988678285205, "ewc_loss": 0.052353665232658386, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00021958159049972892, "grad_norm": 6.1544365882873535, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8408569097518921, "num_tokens": 328283016.0, "step": 8607 }, { "epoch": 1.095026078107111, "ewc_loss": 0.05240471661090851, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022009207168594003, "grad_norm": 6.10152006149292, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8662757873535156, "num_tokens": 328322377.0, "step": 8608 }, { "epoch": 1.0951532883857016, "ewc_loss": 0.05244651809334755, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022051010455470532, "grad_norm": 6.172051906585693, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8643399477005005, "num_tokens": 328359409.0, "step": 8609 }, { "epoch": 1.0952804986642921, "ewc_loss": 0.05246568098664284, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022070173872634768, "grad_norm": 6.13580846786499, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8666483163833618, "num_tokens": 328401257.0, "step": 8610 }, { "epoch": 1.0954077089428826, "ewc_loss": 0.052417270839214325, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022021762561053038, "grad_norm": 6.1084699630737305, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8647023439407349, "num_tokens": 328435886.0, "step": 8611 }, { "epoch": 1.0955349192214732, "ewc_loss": 0.05243825912475586, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002204275078838691, "grad_norm": 6.162064075469971, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8532881736755371, "num_tokens": 328472091.0, "step": 8612 }, { "epoch": 1.0956621295000637, "ewc_loss": 0.0524250864982605, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022029579849913716, "grad_norm": 6.1292901039123535, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8543639183044434, "num_tokens": 328509453.0, "step": 8613 }, { "epoch": 1.095789339778654, "ewc_loss": 0.052498772740364075, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022103264927864075, "grad_norm": 6.164590358734131, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8630345463752747, "num_tokens": 328545590.0, "step": 8614 }, { "epoch": 1.0959165500572445, "ewc_loss": 0.052419714629650116, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022024204372428358, "grad_norm": 6.08807897567749, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8520519733428955, "num_tokens": 328583093.0, "step": 8615 }, { "epoch": 1.096043760335835, "ewc_loss": 0.05246991664171219, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022074409935157746, "grad_norm": 6.103757381439209, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8454453945159912, "num_tokens": 328625312.0, "step": 8616 }, { "epoch": 1.0961709706144256, "ewc_loss": 0.05247579514980316, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002208028599852696, "grad_norm": 6.140032768249512, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8556671142578125, "num_tokens": 328655392.0, "step": 8617 }, { "epoch": 1.0962981808930161, "ewc_loss": 0.052499786019325256, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022104280651547015, "grad_norm": 6.160867214202881, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8468588590621948, "num_tokens": 328693686.0, "step": 8618 }, { "epoch": 1.0964253911716066, "ewc_loss": 0.05245321989059448, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002205771452281624, "grad_norm": 6.07940149307251, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8731589317321777, "num_tokens": 328728051.0, "step": 8619 }, { "epoch": 1.0965526014501972, "ewc_loss": 0.05254398658871651, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022148479183670133, "grad_norm": 6.1648406982421875, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8516429662704468, "num_tokens": 328768091.0, "step": 8620 }, { "epoch": 1.0966798117287877, "ewc_loss": 0.052502021193504333, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022106515825726092, "grad_norm": 6.090349197387695, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8583292365074158, "num_tokens": 328803851.0, "step": 8621 }, { "epoch": 1.0968070220073782, "ewc_loss": 0.05259832367300987, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022202816035132855, "grad_norm": 6.164234638214111, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.872991144657135, "num_tokens": 328837429.0, "step": 8622 }, { "epoch": 1.0969342322859688, "ewc_loss": 0.05254330113530159, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022147793788462877, "grad_norm": 6.140127182006836, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8608723878860474, "num_tokens": 328874264.0, "step": 8623 }, { "epoch": 1.0970614425645593, "ewc_loss": 0.05260632187128067, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022210813767742366, "grad_norm": 6.143473148345947, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.864851713180542, "num_tokens": 328911627.0, "step": 8624 }, { "epoch": 1.0971886528431498, "ewc_loss": 0.05256793648004532, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022172427270561457, "grad_norm": 6.185451984405518, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8551470041275024, "num_tokens": 328944471.0, "step": 8625 }, { "epoch": 1.0973158631217403, "ewc_loss": 0.05251534283161163, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022119833738543093, "grad_norm": 6.106276512145996, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.870133638381958, "num_tokens": 328983826.0, "step": 8626 }, { "epoch": 1.0974430734003306, "ewc_loss": 0.05258888751268387, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002219337911810726, "grad_norm": 6.193419456481934, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8626933097839355, "num_tokens": 329019385.0, "step": 8627 }, { "epoch": 1.0975702836789212, "ewc_loss": 0.05260001868009567, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002208243968198076, "grad_norm": 6.123838901519775, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8497390747070312, "num_tokens": 329065179.0, "step": 8628 }, { "epoch": 1.0976974939575117, "ewc_loss": 0.05253676697611809, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002214125997852534, "grad_norm": 6.191691875457764, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.86991286277771, "num_tokens": 329101077.0, "step": 8629 }, { "epoch": 1.0978247042361022, "ewc_loss": 0.052391417324543, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002199590962845832, "grad_norm": 6.120617866516113, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8668861389160156, "num_tokens": 329137538.0, "step": 8630 }, { "epoch": 1.0979519145146928, "ewc_loss": 0.05276504531502724, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002200332673965022, "grad_norm": 6.145578384399414, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8757011294364929, "num_tokens": 329171952.0, "step": 8631 }, { "epoch": 1.0980791247932833, "ewc_loss": 0.05278225988149643, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022020540200173855, "grad_norm": 6.058938503265381, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8708409070968628, "num_tokens": 329214956.0, "step": 8632 }, { "epoch": 1.0982063350718738, "ewc_loss": 0.052454642951488495, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002205913478974253, "grad_norm": 6.152248859405518, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8526344299316406, "num_tokens": 329254228.0, "step": 8633 }, { "epoch": 1.0983335453504643, "ewc_loss": 0.052433811128139496, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022038305178284645, "grad_norm": 6.23744535446167, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8630208373069763, "num_tokens": 329288337.0, "step": 8634 }, { "epoch": 1.0984607556290549, "ewc_loss": 0.05244337022304535, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002204786433139816, "grad_norm": 6.1184000968933105, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8560867309570312, "num_tokens": 329329118.0, "step": 8635 }, { "epoch": 1.0985879659076454, "ewc_loss": 0.052465230226516724, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002206972276326269, "grad_norm": 6.110274791717529, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8478517532348633, "num_tokens": 329366301.0, "step": 8636 }, { "epoch": 1.098715176186236, "ewc_loss": 0.052435100078582764, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022039594477973878, "grad_norm": 6.218596458435059, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8498752117156982, "num_tokens": 329401612.0, "step": 8637 }, { "epoch": 1.0988423864648265, "ewc_loss": 0.05245476961135864, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022059259936213493, "grad_norm": 6.084814071655273, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8639538288116455, "num_tokens": 329440403.0, "step": 8638 }, { "epoch": 1.0989695967434168, "ewc_loss": 0.05250811576843262, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002211260871263221, "grad_norm": 6.127882957458496, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8717328310012817, "num_tokens": 329478699.0, "step": 8639 }, { "epoch": 1.0990968070220073, "ewc_loss": 0.05246800184249878, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.00022072494903113693, "grad_norm": 6.113495826721191, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8612293004989624, "num_tokens": 329517860.0, "step": 8640 }, { "epoch": 1.0992240173005978, "ewc_loss": 0.05265577882528305, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022138201165944338, "grad_norm": 6.222969055175781, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8680187463760376, "num_tokens": 329555385.0, "step": 8641 }, { "epoch": 1.0993512275791884, "ewc_loss": 0.05260385200381279, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022086274111643434, "grad_norm": 6.128222942352295, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8581562638282776, "num_tokens": 329594775.0, "step": 8642 }, { "epoch": 1.0994784378577789, "ewc_loss": 0.05260752886533737, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022089952835813165, "grad_norm": 6.127220630645752, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8661226034164429, "num_tokens": 329629872.0, "step": 8643 }, { "epoch": 1.0996056481363694, "ewc_loss": 0.052602700889110565, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022085121599957347, "grad_norm": 6.148478031158447, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8567113876342773, "num_tokens": 329665263.0, "step": 8644 }, { "epoch": 1.09973285841496, "ewc_loss": 0.05262362211942673, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022106041433289647, "grad_norm": 6.1166229248046875, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8523231744766235, "num_tokens": 329705853.0, "step": 8645 }, { "epoch": 1.0998600686935505, "ewc_loss": 0.052647583186626434, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.000221300040720962, "grad_norm": 6.186157703399658, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8697018027305603, "num_tokens": 329741690.0, "step": 8646 }, { "epoch": 1.099987278972141, "ewc_loss": 0.052615247666835785, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022097668261267245, "grad_norm": 6.1260786056518555, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8551492094993591, "num_tokens": 329781548.0, "step": 8647 }, { "epoch": 1.1001144892507315, "ewc_loss": 0.052615292370319366, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022097716282587498, "grad_norm": 6.172921180725098, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8685585856437683, "num_tokens": 329822304.0, "step": 8648 }, { "epoch": 1.100241699529322, "ewc_loss": 0.05259589105844498, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002207831566920504, "grad_norm": 6.155354976654053, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8635088205337524, "num_tokens": 329856065.0, "step": 8649 }, { "epoch": 1.1003689098079126, "ewc_loss": 0.05254426598548889, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022026689839549363, "grad_norm": 6.1253204345703125, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8470046520233154, "num_tokens": 329895002.0, "step": 8650 }, { "epoch": 1.100496120086503, "ewc_loss": 0.05254479497671127, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022027219529263675, "grad_norm": 6.0948309898376465, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8557672500610352, "num_tokens": 329938810.0, "step": 8651 }, { "epoch": 1.1006233303650934, "ewc_loss": 0.05281098932027817, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022049272956792265, "grad_norm": 6.1529059410095215, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8744940757751465, "num_tokens": 329976513.0, "step": 8652 }, { "epoch": 1.100750540643684, "ewc_loss": 0.05255930498242378, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022041727788746357, "grad_norm": 6.141631603240967, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8694871664047241, "num_tokens": 330014013.0, "step": 8653 }, { "epoch": 1.1008777509222745, "ewc_loss": 0.05283428356051445, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002207256475230679, "grad_norm": 6.154324531555176, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8577018976211548, "num_tokens": 330052577.0, "step": 8654 }, { "epoch": 1.101004961200865, "ewc_loss": 0.052588894963264465, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022071314742788672, "grad_norm": 6.122375011444092, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8607996702194214, "num_tokens": 330091385.0, "step": 8655 }, { "epoch": 1.1011321714794555, "ewc_loss": 0.05265354365110397, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022135967446956784, "grad_norm": 6.1470232009887695, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8702243566513062, "num_tokens": 330130285.0, "step": 8656 }, { "epoch": 1.101259381758046, "ewc_loss": 0.052801862359046936, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022040145995561033, "grad_norm": 6.138518810272217, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8487118482589722, "num_tokens": 330166823.0, "step": 8657 }, { "epoch": 1.1013865920366366, "ewc_loss": 0.052613209933042526, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022095630993135273, "grad_norm": 6.2103776931762695, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8642836809158325, "num_tokens": 330200655.0, "step": 8658 }, { "epoch": 1.101513802315227, "ewc_loss": 0.05250868946313858, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00021991111862007529, "grad_norm": 6.11075496673584, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8617028594017029, "num_tokens": 330243656.0, "step": 8659 }, { "epoch": 1.1016410125938176, "ewc_loss": 0.05283041298389435, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002206869248766452, "grad_norm": 6.156307220458984, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8544836640357971, "num_tokens": 330287200.0, "step": 8660 }, { "epoch": 1.1017682228724082, "ewc_loss": 0.05252326279878616, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022005682694725692, "grad_norm": 6.10725212097168, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8643558621406555, "num_tokens": 330324794.0, "step": 8661 }, { "epoch": 1.1018954331509987, "ewc_loss": 0.05312367156147957, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022117812477517873, "grad_norm": 6.17785120010376, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8635842800140381, "num_tokens": 330365297.0, "step": 8662 }, { "epoch": 1.102022643429589, "ewc_loss": 0.05256742984056473, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022049852123018354, "grad_norm": 6.119341850280762, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8796210289001465, "num_tokens": 330408046.0, "step": 8663 }, { "epoch": 1.1021498537081795, "ewc_loss": 0.05253071337938309, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022013137640897185, "grad_norm": 6.112265110015869, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8585768938064575, "num_tokens": 330447815.0, "step": 8664 }, { "epoch": 1.10227706398677, "ewc_loss": 0.05255883187055588, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022041256306692958, "grad_norm": 6.16591739654541, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8505067825317383, "num_tokens": 330487796.0, "step": 8665 }, { "epoch": 1.1024042742653606, "ewc_loss": 0.05303805321455002, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022032191918697208, "grad_norm": 6.137612342834473, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.87361079454422, "num_tokens": 330531155.0, "step": 8666 }, { "epoch": 1.1025314845439511, "ewc_loss": 0.05254904925823212, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022031471598893404, "grad_norm": 6.213557243347168, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8706725835800171, "num_tokens": 330568611.0, "step": 8667 }, { "epoch": 1.1026586948225416, "ewc_loss": 0.05248138681054115, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00021963808103464544, "grad_norm": 6.140986442565918, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8569047451019287, "num_tokens": 330608085.0, "step": 8668 }, { "epoch": 1.1027859051011322, "ewc_loss": 0.0525357723236084, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022018194431439042, "grad_norm": 6.205533981323242, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8443449139595032, "num_tokens": 330644776.0, "step": 8669 }, { "epoch": 1.1029131153797227, "ewc_loss": 0.052513930946588516, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00021996352006681263, "grad_norm": 6.148496627807617, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8518120050430298, "num_tokens": 330686401.0, "step": 8670 }, { "epoch": 1.1030403256583132, "ewc_loss": 0.0524524487555027, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00021934870164841413, "grad_norm": 6.195189476013184, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.867179274559021, "num_tokens": 330723115.0, "step": 8671 }, { "epoch": 1.1031675359369038, "ewc_loss": 0.05251225456595421, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00021994677081238478, "grad_norm": 6.179275989532471, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8771328926086426, "num_tokens": 330761170.0, "step": 8672 }, { "epoch": 1.1032947462154943, "ewc_loss": 0.052430711686611176, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00021913135424256325, "grad_norm": 6.20334529876709, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8626728653907776, "num_tokens": 330790230.0, "step": 8673 }, { "epoch": 1.1034219564940848, "ewc_loss": 0.05246199667453766, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00021944417676422745, "grad_norm": 6.093344211578369, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8734573125839233, "num_tokens": 330826178.0, "step": 8674 }, { "epoch": 1.1035491667726753, "ewc_loss": 0.05252782255411148, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022010244720149785, "grad_norm": 6.145768165588379, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8688899874687195, "num_tokens": 330866341.0, "step": 8675 }, { "epoch": 1.1036763770512656, "ewc_loss": 0.05250705033540726, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00021989473316352814, "grad_norm": 6.173888206481934, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8480287790298462, "num_tokens": 330903955.0, "step": 8676 }, { "epoch": 1.1038035873298562, "ewc_loss": 0.05261324346065521, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022095663007348776, "grad_norm": 6.137955188751221, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8593186736106873, "num_tokens": 330943024.0, "step": 8677 }, { "epoch": 1.1039307976084467, "ewc_loss": 0.052817873656749725, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022056157467886806, "grad_norm": 6.172614574432373, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8556856513023376, "num_tokens": 330981004.0, "step": 8678 }, { "epoch": 1.1040580078870372, "ewc_loss": 0.05254337191581726, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002202579635195434, "grad_norm": 6.138803482055664, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.861423134803772, "num_tokens": 331021532.0, "step": 8679 }, { "epoch": 1.1041852181656278, "ewc_loss": 0.05258945748209953, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022071879357099533, "grad_norm": 6.144542217254639, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8692660331726074, "num_tokens": 331058010.0, "step": 8680 }, { "epoch": 1.1043124284442183, "ewc_loss": 0.05259394645690918, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022076368622947484, "grad_norm": 6.1571574211120605, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8708591461181641, "num_tokens": 331090713.0, "step": 8681 }, { "epoch": 1.1044396387228088, "ewc_loss": 0.052629269659519196, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022111689031589776, "grad_norm": 6.197961807250977, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8681769371032715, "num_tokens": 331127532.0, "step": 8682 }, { "epoch": 1.1045668490013993, "ewc_loss": 0.05267936736345291, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022161786910146475, "grad_norm": 6.408632278442383, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.8406231999397278, "num_tokens": 331172088.0, "step": 8683 }, { "epoch": 1.1046940592799899, "ewc_loss": 0.05285729095339775, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022095571330282837, "grad_norm": 6.3737897872924805, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.856574535369873, "num_tokens": 331213419.0, "step": 8684 }, { "epoch": 1.1048212695585804, "ewc_loss": 0.052368778735399246, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00021851201017852873, "grad_norm": 6.084990501403809, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8610140085220337, "num_tokens": 331250102.0, "step": 8685 }, { "epoch": 1.104948479837171, "ewc_loss": 0.05250846967101097, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002199089212808758, "grad_norm": 6.314676761627197, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8639196753501892, "num_tokens": 331284597.0, "step": 8686 }, { "epoch": 1.1050756901157615, "ewc_loss": 0.05247772857546806, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00021960149751976132, "grad_norm": 6.125548839569092, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8547697067260742, "num_tokens": 331330688.0, "step": 8687 }, { "epoch": 1.1052029003943518, "ewc_loss": 0.05257198214530945, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002205440541729331, "grad_norm": 6.281867980957031, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8654428124427795, "num_tokens": 331368777.0, "step": 8688 }, { "epoch": 1.1053301106729423, "ewc_loss": 0.05251127481460571, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00021993696282152086, "grad_norm": 6.087141036987305, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8665594458580017, "num_tokens": 331414369.0, "step": 8689 }, { "epoch": 1.1054573209515328, "ewc_loss": 0.052604448050260544, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022086869284976274, "grad_norm": 6.25326681137085, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8686242699623108, "num_tokens": 331452788.0, "step": 8690 }, { "epoch": 1.1055845312301233, "ewc_loss": 0.05256769433617592, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022050115512683988, "grad_norm": 6.195135593414307, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8511638641357422, "num_tokens": 331489575.0, "step": 8691 }, { "epoch": 1.1057117415087139, "ewc_loss": 0.05261734873056412, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022099772468209267, "grad_norm": 6.339365482330322, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8794376850128174, "num_tokens": 331529129.0, "step": 8692 }, { "epoch": 1.1058389517873044, "ewc_loss": 0.05252041667699814, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022002836340107024, "grad_norm": 6.132996082305908, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8577778339385986, "num_tokens": 331564989.0, "step": 8693 }, { "epoch": 1.105966162065895, "ewc_loss": 0.05263540893793106, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002211783139500767, "grad_norm": 6.485821723937988, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8647630214691162, "num_tokens": 331602561.0, "step": 8694 }, { "epoch": 1.1060933723444855, "ewc_loss": 0.05256836116313934, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022050781990401447, "grad_norm": 6.080421447753906, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8633561134338379, "num_tokens": 331646902.0, "step": 8695 }, { "epoch": 1.106220582623076, "ewc_loss": 0.052674856036901474, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022157278726808727, "grad_norm": 6.517445087432861, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8631514310836792, "num_tokens": 331682347.0, "step": 8696 }, { "epoch": 1.1063477929016665, "ewc_loss": 0.052577655762434006, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022060077753849328, "grad_norm": 6.044040679931641, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8543350696563721, "num_tokens": 331725813.0, "step": 8697 }, { "epoch": 1.106475003180257, "ewc_loss": 0.052691951394081116, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022174371406435966, "grad_norm": 6.272422790527344, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8414453268051147, "num_tokens": 331767856.0, "step": 8698 }, { "epoch": 1.1066022134588476, "ewc_loss": 0.05266717076301575, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002214959094999358, "grad_norm": 6.264895439147949, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8709867596626282, "num_tokens": 331804238.0, "step": 8699 }, { "epoch": 1.106729423737438, "ewc_loss": 0.05262136086821556, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022103782976046205, "grad_norm": 6.195897102355957, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8629804849624634, "num_tokens": 331839963.0, "step": 8700 }, { "epoch": 1.1068566340160284, "ewc_loss": 0.0526248961687088, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022107316181063652, "grad_norm": 6.218101978302002, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8654499053955078, "num_tokens": 331881563.0, "step": 8701 }, { "epoch": 1.106983844294619, "ewc_loss": 0.052499182522296906, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002198160655098036, "grad_norm": 6.172973155975342, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8582441806793213, "num_tokens": 331925540.0, "step": 8702 }, { "epoch": 1.1071110545732095, "ewc_loss": 0.052649836987257004, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022132259618956596, "grad_norm": 6.186415672302246, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8706798553466797, "num_tokens": 331963171.0, "step": 8703 }, { "epoch": 1.1072382648518, "ewc_loss": 0.05257366597652435, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022056090529076755, "grad_norm": 6.183826446533203, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.882087767124176, "num_tokens": 331998310.0, "step": 8704 }, { "epoch": 1.1073654751303905, "ewc_loss": 0.0525817833840847, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022064204677008092, "grad_norm": 6.231086730957031, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8679672479629517, "num_tokens": 332034181.0, "step": 8705 }, { "epoch": 1.107492685408981, "ewc_loss": 0.05259653553366661, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002207895740866661, "grad_norm": 6.172530174255371, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8721637725830078, "num_tokens": 332075368.0, "step": 8706 }, { "epoch": 1.1076198956875716, "ewc_loss": 0.05265358090400696, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022136005281936377, "grad_norm": 6.2238850593566895, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.873976469039917, "num_tokens": 332113281.0, "step": 8707 }, { "epoch": 1.107747105966162, "ewc_loss": 0.05265939608216286, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002214181877207011, "grad_norm": 6.21624231338501, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8545767664909363, "num_tokens": 332154206.0, "step": 8708 }, { "epoch": 1.1078743162447526, "ewc_loss": 0.052355170249938965, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022081731003709137, "grad_norm": 6.215601444244385, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8474574089050293, "num_tokens": 332195409.0, "step": 8709 }, { "epoch": 1.1080015265233432, "ewc_loss": 0.05241304636001587, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002213960833614692, "grad_norm": 6.149753093719482, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8707644939422607, "num_tokens": 332233836.0, "step": 8710 }, { "epoch": 1.1081287368019337, "ewc_loss": 0.05241294950246811, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022139509383123368, "grad_norm": 6.1795783042907715, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.867411732673645, "num_tokens": 332274167.0, "step": 8711 }, { "epoch": 1.108255947080524, "ewc_loss": 0.05270878225564957, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002219120506197214, "grad_norm": 6.224730491638184, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8579921722412109, "num_tokens": 332315856.0, "step": 8712 }, { "epoch": 1.1083831573591145, "ewc_loss": 0.052486713975667953, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022213277406990528, "grad_norm": 6.162669658660889, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8620654940605164, "num_tokens": 332366365.0, "step": 8713 }, { "epoch": 1.108510367637705, "ewc_loss": 0.05245644971728325, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002218301233369857, "grad_norm": 6.285375118255615, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8593800067901611, "num_tokens": 332400411.0, "step": 8714 }, { "epoch": 1.1086375779162956, "ewc_loss": 0.052486732602119446, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022213297779671848, "grad_norm": 6.176344394683838, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8654143810272217, "num_tokens": 332441959.0, "step": 8715 }, { "epoch": 1.108764788194886, "ewc_loss": 0.05247984081506729, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002220640453742817, "grad_norm": 6.294257640838623, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8495990037918091, "num_tokens": 332479545.0, "step": 8716 }, { "epoch": 1.1088919984734766, "ewc_loss": 0.05244304984807968, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022169610019773245, "grad_norm": 6.213688850402832, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8731892108917236, "num_tokens": 332516240.0, "step": 8717 }, { "epoch": 1.1090192087520672, "ewc_loss": 0.05249224603176117, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022218807134777308, "grad_norm": 6.259458065032959, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8663442730903625, "num_tokens": 332553488.0, "step": 8718 }, { "epoch": 1.1091464190306577, "ewc_loss": 0.05244164168834686, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022168202849570662, "grad_norm": 6.280323505401611, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8523359298706055, "num_tokens": 332588646.0, "step": 8719 }, { "epoch": 1.1092736293092482, "ewc_loss": 0.05254896730184555, "ewc_loss_diag": 3.039836883544922e-05, "ewc_loss_parallel": 0.0002215345884906128, "grad_norm": 6.252381324768066, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8526289463043213, "num_tokens": 332629103.0, "step": 8720 }, { "epoch": 1.1094008395878387, "ewc_loss": 0.0524103045463562, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002213686821050942, "grad_norm": 6.201322555541992, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8672316074371338, "num_tokens": 332664820.0, "step": 8721 }, { "epoch": 1.1095280498664293, "ewc_loss": 0.05244860425591469, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022175165941007435, "grad_norm": 6.187788486480713, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8637683391571045, "num_tokens": 332699772.0, "step": 8722 }, { "epoch": 1.1096552601450198, "ewc_loss": 0.05248104780912399, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022207607980817556, "grad_norm": 6.289938926696777, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8704507350921631, "num_tokens": 332733478.0, "step": 8723 }, { "epoch": 1.1097824704236103, "ewc_loss": 0.05241618677973747, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022142748639453202, "grad_norm": 6.177582263946533, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8585110902786255, "num_tokens": 332774733.0, "step": 8724 }, { "epoch": 1.1099096807022006, "ewc_loss": 0.052496567368507385, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022223130508791655, "grad_norm": 6.226595401763916, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8389667272567749, "num_tokens": 332812970.0, "step": 8725 }, { "epoch": 1.1100368909807912, "ewc_loss": 0.052508529275655746, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022235092183109373, "grad_norm": 6.176396369934082, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8588600158691406, "num_tokens": 332854974.0, "step": 8726 }, { "epoch": 1.1101641012593817, "ewc_loss": 0.05248074233531952, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002220730675617233, "grad_norm": 6.281952857971191, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8478657007217407, "num_tokens": 332886462.0, "step": 8727 }, { "epoch": 1.1102913115379722, "ewc_loss": 0.05251571536064148, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002224227791884914, "grad_norm": 6.1646599769592285, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8581546545028687, "num_tokens": 332928614.0, "step": 8728 }, { "epoch": 1.1104185218165628, "ewc_loss": 0.05279199779033661, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022274417278822511, "grad_norm": 6.25792932510376, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8631181716918945, "num_tokens": 332962447.0, "step": 8729 }, { "epoch": 1.1105457320951533, "ewc_loss": 0.052789345383644104, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022271765919867903, "grad_norm": 6.269025802612305, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8537085056304932, "num_tokens": 332991395.0, "step": 8730 }, { "epoch": 1.1106729423737438, "ewc_loss": 0.052735403180122375, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002221782342530787, "grad_norm": 6.205343246459961, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8548794984817505, "num_tokens": 333027852.0, "step": 8731 }, { "epoch": 1.1108001526523343, "ewc_loss": 0.05280306190252304, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022285485465545207, "grad_norm": 6.215208530426025, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8642209768295288, "num_tokens": 333063337.0, "step": 8732 }, { "epoch": 1.1109273629309249, "ewc_loss": 0.05274885892868042, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022231281036511064, "grad_norm": 6.2286481857299805, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8522956371307373, "num_tokens": 333094632.0, "step": 8733 }, { "epoch": 1.1110545732095154, "ewc_loss": 0.0527648851275444, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022247308515943587, "grad_norm": 6.152248382568359, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8749417066574097, "num_tokens": 333134071.0, "step": 8734 }, { "epoch": 1.111181783488106, "ewc_loss": 0.05278697982430458, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022269401233643293, "grad_norm": 6.191038608551025, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8672102689743042, "num_tokens": 333171288.0, "step": 8735 }, { "epoch": 1.1113089937666965, "ewc_loss": 0.05278956517577171, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022271987108979374, "grad_norm": 6.1913018226623535, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8680776953697205, "num_tokens": 333212111.0, "step": 8736 }, { "epoch": 1.1114362040452868, "ewc_loss": 0.052799828350543976, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022282252029981464, "grad_norm": 6.200386047363281, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8673372268676758, "num_tokens": 333247557.0, "step": 8737 }, { "epoch": 1.1115634143238773, "ewc_loss": 0.05274919793009758, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022231620096135885, "grad_norm": 6.165287017822266, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8622709512710571, "num_tokens": 333278705.0, "step": 8738 }, { "epoch": 1.1116906246024678, "ewc_loss": 0.052855461835861206, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002233788400189951, "grad_norm": 6.197922706604004, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8647400140762329, "num_tokens": 333316493.0, "step": 8739 }, { "epoch": 1.1118178348810583, "ewc_loss": 0.052837591618299484, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022320014249999076, "grad_norm": 6.142497539520264, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8648551106452942, "num_tokens": 333360384.0, "step": 8740 }, { "epoch": 1.1119450451596489, "ewc_loss": 0.05285673588514328, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022339160204865038, "grad_norm": 6.219127655029297, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8625555038452148, "num_tokens": 333395568.0, "step": 8741 }, { "epoch": 1.1120722554382394, "ewc_loss": 0.05279082804918289, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002227325167041272, "grad_norm": 6.134095668792725, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8666890263557434, "num_tokens": 333429267.0, "step": 8742 }, { "epoch": 1.11219946571683, "ewc_loss": 0.05287031829357147, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022352741507347673, "grad_norm": 6.182562828063965, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.867874801158905, "num_tokens": 333469908.0, "step": 8743 }, { "epoch": 1.1123266759954205, "ewc_loss": 0.05285734683275223, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022339768474921584, "grad_norm": 6.164173126220703, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8656006455421448, "num_tokens": 333507865.0, "step": 8744 }, { "epoch": 1.112453886274011, "ewc_loss": 0.052865710109472275, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002234813291579485, "grad_norm": 6.230897426605225, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8735663890838623, "num_tokens": 333544011.0, "step": 8745 }, { "epoch": 1.1125810965526015, "ewc_loss": 0.05289324373006821, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022375663684215397, "grad_norm": 6.1670732498168945, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8652973175048828, "num_tokens": 333581949.0, "step": 8746 }, { "epoch": 1.112708306831192, "ewc_loss": 0.05281946435570717, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.000223018869291991, "grad_norm": 6.184979438781738, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8704742193222046, "num_tokens": 333615994.0, "step": 8747 }, { "epoch": 1.1128355171097826, "ewc_loss": 0.05279898643493652, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022281409474089742, "grad_norm": 6.177928447723389, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8784583210945129, "num_tokens": 333654454.0, "step": 8748 }, { "epoch": 1.112962727388373, "ewc_loss": 0.05283208191394806, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002231450198451057, "grad_norm": 6.137680530548096, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8550142645835876, "num_tokens": 333700206.0, "step": 8749 }, { "epoch": 1.1130899376669634, "ewc_loss": 0.05287279188632965, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022355213877744973, "grad_norm": 6.1630754470825195, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8667703866958618, "num_tokens": 333737538.0, "step": 8750 }, { "epoch": 1.113217147945554, "ewc_loss": 0.05282977968454361, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022312201326712966, "grad_norm": 6.195162296295166, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8641806840896606, "num_tokens": 333771237.0, "step": 8751 }, { "epoch": 1.1133443582241445, "ewc_loss": 0.052925191819667816, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022407612414099276, "grad_norm": 6.245869159698486, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8603053689002991, "num_tokens": 333806522.0, "step": 8752 }, { "epoch": 1.113471568502735, "ewc_loss": 0.05281909555196762, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022301515855360776, "grad_norm": 6.103377342224121, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8614370822906494, "num_tokens": 333846656.0, "step": 8753 }, { "epoch": 1.1135987787813255, "ewc_loss": 0.05338267236948013, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002237681474070996, "grad_norm": 6.3255934715271, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8545892834663391, "num_tokens": 333886294.0, "step": 8754 }, { "epoch": 1.113725989059916, "ewc_loss": 0.05283493921160698, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022317361435852945, "grad_norm": 6.147852420806885, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8583644032478333, "num_tokens": 333923868.0, "step": 8755 }, { "epoch": 1.1138531993385066, "ewc_loss": 0.05287241190671921, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022354834072757512, "grad_norm": 6.2508769035339355, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.867743730545044, "num_tokens": 333957187.0, "step": 8756 }, { "epoch": 1.113980409617097, "ewc_loss": 0.0528419092297554, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022324331803247333, "grad_norm": 6.152199745178223, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8768987059593201, "num_tokens": 333999061.0, "step": 8757 }, { "epoch": 1.1141076198956876, "ewc_loss": 0.053123027086257935, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022361308219842613, "grad_norm": 6.172417640686035, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8577784299850464, "num_tokens": 334043595.0, "step": 8758 }, { "epoch": 1.1142348301742782, "ewc_loss": 0.052877649664878845, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022360074217431247, "grad_norm": 6.216825485229492, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8674351572990417, "num_tokens": 334078487.0, "step": 8759 }, { "epoch": 1.1143620404528687, "ewc_loss": 0.05284901708364487, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002233144041383639, "grad_norm": 6.113602161407471, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8707016706466675, "num_tokens": 334119610.0, "step": 8760 }, { "epoch": 1.114489250731459, "ewc_loss": 0.052935756742954254, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022418180014938116, "grad_norm": 6.241720676422119, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8580416440963745, "num_tokens": 334160149.0, "step": 8761 }, { "epoch": 1.1146164610100495, "ewc_loss": 0.052849411964416504, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022331833315547556, "grad_norm": 6.206688404083252, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8635356426239014, "num_tokens": 334198667.0, "step": 8762 }, { "epoch": 1.11474367128864, "ewc_loss": 0.052861765027046204, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002234418789157644, "grad_norm": 6.24217414855957, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8641749620437622, "num_tokens": 334233838.0, "step": 8763 }, { "epoch": 1.1148708815672306, "ewc_loss": 0.05284375697374344, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022326181351672858, "grad_norm": 6.1721343994140625, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8540776371955872, "num_tokens": 334275703.0, "step": 8764 }, { "epoch": 1.114998091845821, "ewc_loss": 0.05285829305648804, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022340717259794474, "grad_norm": 6.249492645263672, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8750951886177063, "num_tokens": 334311119.0, "step": 8765 }, { "epoch": 1.1151253021244116, "ewc_loss": 0.05279763042926788, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022280050325207412, "grad_norm": 6.212474822998047, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8556836843490601, "num_tokens": 334348298.0, "step": 8766 }, { "epoch": 1.1152525124030022, "ewc_loss": 0.05274803191423416, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022230454487726092, "grad_norm": 6.179650783538818, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8499487042427063, "num_tokens": 334386986.0, "step": 8767 }, { "epoch": 1.1153797226815927, "ewc_loss": 0.0528678223490715, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022350242943502963, "grad_norm": 6.1936469078063965, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8671658039093018, "num_tokens": 334423263.0, "step": 8768 }, { "epoch": 1.1155069329601832, "ewc_loss": 0.05280372500419617, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022286146122496575, "grad_norm": 6.187536239624023, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.87847900390625, "num_tokens": 334460363.0, "step": 8769 }, { "epoch": 1.1156341432387737, "ewc_loss": 0.05287649855017662, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022358920250553638, "grad_norm": 6.229499816894531, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8671258687973022, "num_tokens": 334501883.0, "step": 8770 }, { "epoch": 1.1157613535173643, "ewc_loss": 0.05284474790096283, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022327170881908387, "grad_norm": 6.245929718017578, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.862079381942749, "num_tokens": 334536242.0, "step": 8771 }, { "epoch": 1.1158885637959548, "ewc_loss": 0.05284735560417175, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022329775674734265, "grad_norm": 6.2357025146484375, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8743611574172974, "num_tokens": 334569685.0, "step": 8772 }, { "epoch": 1.1160157740745453, "ewc_loss": 0.05285567417740822, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022338096459861845, "grad_norm": 6.218003273010254, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.866958737373352, "num_tokens": 334604946.0, "step": 8773 }, { "epoch": 1.1161429843531356, "ewc_loss": 0.052828289568424225, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022310714120976627, "grad_norm": 6.15431022644043, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8688490390777588, "num_tokens": 334645495.0, "step": 8774 }, { "epoch": 1.1162701946317262, "ewc_loss": 0.05281231924891472, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022294741938821971, "grad_norm": 6.186409950256348, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8573652505874634, "num_tokens": 334684239.0, "step": 8775 }, { "epoch": 1.1163974049103167, "ewc_loss": 0.052763931453228, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002224635099992156, "grad_norm": 6.230796813964844, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8685939908027649, "num_tokens": 334713799.0, "step": 8776 }, { "epoch": 1.1165246151889072, "ewc_loss": 0.05281362682580948, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002229604870080948, "grad_norm": 6.1003336906433105, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8839839100837708, "num_tokens": 334753374.0, "step": 8777 }, { "epoch": 1.1166518254674977, "ewc_loss": 0.05288407951593399, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002236650325357914, "grad_norm": 6.190032005310059, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8635595440864563, "num_tokens": 334792867.0, "step": 8778 }, { "epoch": 1.1167790357460883, "ewc_loss": 0.05284510552883148, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022327528859023005, "grad_norm": 6.147427082061768, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8598791360855103, "num_tokens": 334832664.0, "step": 8779 }, { "epoch": 1.1169062460246788, "ewc_loss": 0.05294084548950195, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022423268819693476, "grad_norm": 6.205508708953857, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8560820817947388, "num_tokens": 334876763.0, "step": 8780 }, { "epoch": 1.1170334563032693, "ewc_loss": 0.05288870632648468, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022371129307430238, "grad_norm": 6.228287696838379, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8632438778877258, "num_tokens": 334909431.0, "step": 8781 }, { "epoch": 1.1171606665818599, "ewc_loss": 0.05282434821128845, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022306769096758217, "grad_norm": 6.251732349395752, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8611972332000732, "num_tokens": 334941939.0, "step": 8782 }, { "epoch": 1.1172878768604504, "ewc_loss": 0.052821800112724304, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022304223966784775, "grad_norm": 6.119957447052002, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8726772665977478, "num_tokens": 334981338.0, "step": 8783 }, { "epoch": 1.117415087139041, "ewc_loss": 0.05286756157875061, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002234998537460342, "grad_norm": 6.195776462554932, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8637243509292603, "num_tokens": 335020660.0, "step": 8784 }, { "epoch": 1.1175422974176314, "ewc_loss": 0.05285605043172836, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022338471899274737, "grad_norm": 6.206747531890869, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8661040663719177, "num_tokens": 335057377.0, "step": 8785 }, { "epoch": 1.1176695076962218, "ewc_loss": 0.05288848280906677, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022370903752744198, "grad_norm": 6.19655704498291, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.853600025177002, "num_tokens": 335092912.0, "step": 8786 }, { "epoch": 1.1177967179748123, "ewc_loss": 0.0528184212744236, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022300843556877226, "grad_norm": 6.193668365478516, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8790512084960938, "num_tokens": 335129157.0, "step": 8787 }, { "epoch": 1.1179239282534028, "ewc_loss": 0.052907880395650864, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002239030145574361, "grad_norm": 6.1830573081970215, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8642776608467102, "num_tokens": 335168065.0, "step": 8788 }, { "epoch": 1.1180511385319933, "ewc_loss": 0.052887458354234695, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022369880753103644, "grad_norm": 6.119049549102783, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8621172904968262, "num_tokens": 335218876.0, "step": 8789 }, { "epoch": 1.1181783488105839, "ewc_loss": 0.05293405055999756, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022416473075281829, "grad_norm": 6.1455841064453125, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8737772703170776, "num_tokens": 335256300.0, "step": 8790 }, { "epoch": 1.1183055590891744, "ewc_loss": 0.052917055785655975, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022399476438295096, "grad_norm": 6.200471878051758, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8730491399765015, "num_tokens": 335291035.0, "step": 8791 }, { "epoch": 1.118432769367765, "ewc_loss": 0.052999045699834824, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022481467749457806, "grad_norm": 6.164674758911133, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8751447200775146, "num_tokens": 335330092.0, "step": 8792 }, { "epoch": 1.1185599796463555, "ewc_loss": 0.05295249819755554, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002243492053821683, "grad_norm": 6.2695465087890625, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8486270904541016, "num_tokens": 335367146.0, "step": 8793 }, { "epoch": 1.118687189924946, "ewc_loss": 0.05285966768860817, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022342089505400509, "grad_norm": 6.10988187789917, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8634294271469116, "num_tokens": 335405069.0, "step": 8794 }, { "epoch": 1.1188144002035365, "ewc_loss": 0.052995942533016205, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022478362370748073, "grad_norm": 6.240636825561523, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8532573580741882, "num_tokens": 335443779.0, "step": 8795 }, { "epoch": 1.118941610482127, "ewc_loss": 0.05287609621882439, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022358518617693335, "grad_norm": 6.132622241973877, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.854918360710144, "num_tokens": 335485978.0, "step": 8796 }, { "epoch": 1.1190688207607176, "ewc_loss": 0.05298667401075363, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002246909571113065, "grad_norm": 6.196697235107422, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8673492670059204, "num_tokens": 335530375.0, "step": 8797 }, { "epoch": 1.119196031039308, "ewc_loss": 0.05291908234357834, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022401506430469453, "grad_norm": 6.203742027282715, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8670568466186523, "num_tokens": 335569109.0, "step": 8798 }, { "epoch": 1.1193232413178984, "ewc_loss": 0.05297001451253891, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022452436678577214, "grad_norm": 6.269566059112549, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8490961790084839, "num_tokens": 335599283.0, "step": 8799 }, { "epoch": 1.119450451596489, "ewc_loss": 0.052885204553604126, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002236762666143477, "grad_norm": 6.162473201751709, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8705662488937378, "num_tokens": 335634792.0, "step": 8800 }, { "epoch": 1.1195776618750795, "ewc_loss": 0.05300761014223099, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.000224900315515697, "grad_norm": 6.241908073425293, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8757821917533875, "num_tokens": 335672869.0, "step": 8801 }, { "epoch": 1.11970487215367, "ewc_loss": 0.052884794771671295, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022367219207808375, "grad_norm": 6.152851104736328, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.868374228477478, "num_tokens": 335714996.0, "step": 8802 }, { "epoch": 1.1198320824322605, "ewc_loss": 0.05302061140537262, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022503032232634723, "grad_norm": 6.241135597229004, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8546398878097534, "num_tokens": 335756884.0, "step": 8803 }, { "epoch": 1.119959292710851, "ewc_loss": 0.0529417060315609, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002242412738269195, "grad_norm": 6.154179573059082, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8731778860092163, "num_tokens": 335793935.0, "step": 8804 }, { "epoch": 1.1200865029894416, "ewc_loss": 0.05305226147174835, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022534685558639467, "grad_norm": 6.22749137878418, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8758134841918945, "num_tokens": 335834169.0, "step": 8805 }, { "epoch": 1.120213713268032, "ewc_loss": 0.0534486398100853, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022442778572440147, "grad_norm": 35.786231994628906, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8814364671707153, "num_tokens": 335872302.0, "step": 8806 }, { "epoch": 1.1203409235466226, "ewc_loss": 0.07596787810325623, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0004496201581787318, "grad_norm": 9.419015884399414, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.864203691482544, "num_tokens": 335908074.0, "step": 8807 }, { "epoch": 1.1204681338252132, "ewc_loss": 0.05432716757059097, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023321309708990157, "grad_norm": 5.3899054527282715, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8747468590736389, "num_tokens": 335946268.0, "step": 8808 }, { "epoch": 1.1205953441038037, "ewc_loss": 0.06334331631660461, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0003282574180047959, "grad_norm": 8.509559631347656, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.865276575088501, "num_tokens": 335985364.0, "step": 8809 }, { "epoch": 1.120722554382394, "ewc_loss": 0.06779302656650543, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00037275446811690927, "grad_norm": 8.312508583068848, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8669477701187134, "num_tokens": 336025300.0, "step": 8810 }, { "epoch": 1.1208497646609845, "ewc_loss": 0.05762547254562378, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00027107895584777, "grad_norm": 6.435321807861328, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8669285774230957, "num_tokens": 336062105.0, "step": 8811 }, { "epoch": 1.120976974939575, "ewc_loss": 0.05835200846195221, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002783442905638367, "grad_norm": 7.394045829772949, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8536126017570496, "num_tokens": 336102944.0, "step": 8812 }, { "epoch": 1.1211041852181656, "ewc_loss": 0.05974636599421501, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002922878775279969, "grad_norm": 6.973729133605957, "learning_rate": 1e-06, "loss": 0.3746, "mean_token_accuracy": 0.8905047178268433, "num_tokens": 336142964.0, "step": 8813 }, { "epoch": 1.121231395496756, "ewc_loss": 0.05619959533214569, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002568201452959329, "grad_norm": 6.627469539642334, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8666588664054871, "num_tokens": 336186962.0, "step": 8814 }, { "epoch": 1.1213586057753466, "ewc_loss": 0.05657574534416199, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002605816989671439, "grad_norm": 6.8361687660217285, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8732610940933228, "num_tokens": 336220775.0, "step": 8815 }, { "epoch": 1.1214858160539372, "ewc_loss": 0.05590645223855972, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002538887201808393, "grad_norm": 6.553128719329834, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8601511716842651, "num_tokens": 336263067.0, "step": 8816 }, { "epoch": 1.1216130263325277, "ewc_loss": 0.05514683201909065, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00024629253312014043, "grad_norm": 6.615650653839111, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8674966096878052, "num_tokens": 336308463.0, "step": 8817 }, { "epoch": 1.1217402366111182, "ewc_loss": 0.0550033263862133, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002448574814479798, "grad_norm": 6.475307941436768, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8770376443862915, "num_tokens": 336346965.0, "step": 8818 }, { "epoch": 1.1218674468897087, "ewc_loss": 0.05458588898181915, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00024068313359748572, "grad_norm": 6.5637383460998535, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8661554455757141, "num_tokens": 336382114.0, "step": 8819 }, { "epoch": 1.1219946571682993, "ewc_loss": 0.05422976613044739, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002371218870393932, "grad_norm": 6.450404644012451, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8685341477394104, "num_tokens": 336414020.0, "step": 8820 }, { "epoch": 1.1221218674468898, "ewc_loss": 0.05403536558151245, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023517789668403566, "grad_norm": 6.461553573608398, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8614595532417297, "num_tokens": 336452522.0, "step": 8821 }, { "epoch": 1.1222490777254803, "ewc_loss": 0.053797319531440735, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002327973925275728, "grad_norm": 6.4085798263549805, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8747622966766357, "num_tokens": 336484593.0, "step": 8822 }, { "epoch": 1.1223762880040706, "ewc_loss": 0.05363916605710983, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023121586127672344, "grad_norm": 6.475377082824707, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8462918996810913, "num_tokens": 336518964.0, "step": 8823 }, { "epoch": 1.1225034982826612, "ewc_loss": 0.05318956449627876, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.0002291612618137151, "grad_norm": 6.3173418045043945, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8509984016418457, "num_tokens": 336556320.0, "step": 8824 }, { "epoch": 1.1226307085612517, "ewc_loss": 0.053418610244989395, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022901032934896648, "grad_norm": 6.410261154174805, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8536975383758545, "num_tokens": 336595473.0, "step": 8825 }, { "epoch": 1.1227579188398422, "ewc_loss": 0.052966900169849396, "ewc_loss_diag": 3.0279159545898438e-05, "ewc_loss_parallel": 0.00022693461505696177, "grad_norm": 6.3144683837890625, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8639550805091858, "num_tokens": 336631793.0, "step": 8826 }, { "epoch": 1.1228851291184327, "ewc_loss": 0.05313867703080177, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022621099196840078, "grad_norm": 6.305041790008545, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8488188982009888, "num_tokens": 336671230.0, "step": 8827 }, { "epoch": 1.1230123393970233, "ewc_loss": 0.05304840952157974, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022530830756295472, "grad_norm": 6.300246715545654, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8641858100891113, "num_tokens": 336712015.0, "step": 8828 }, { "epoch": 1.1231395496756138, "ewc_loss": 0.053056586533784866, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022539007477462292, "grad_norm": 6.3656744956970215, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8486039638519287, "num_tokens": 336741206.0, "step": 8829 }, { "epoch": 1.1232667599542043, "ewc_loss": 0.05297628417611122, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022458705643657595, "grad_norm": 6.245576858520508, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8566755652427673, "num_tokens": 336778838.0, "step": 8830 }, { "epoch": 1.1233939702327949, "ewc_loss": 0.05298038572072983, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022462804918177426, "grad_norm": 6.274251461029053, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8698458671569824, "num_tokens": 336815471.0, "step": 8831 }, { "epoch": 1.1235211805113854, "ewc_loss": 0.052962660789489746, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022445080685429275, "grad_norm": 6.218692302703857, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8653772473335266, "num_tokens": 336850870.0, "step": 8832 }, { "epoch": 1.123648390789976, "ewc_loss": 0.05301386117935181, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022496281599160284, "grad_norm": 6.256453990936279, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8690172433853149, "num_tokens": 336891779.0, "step": 8833 }, { "epoch": 1.1237756010685664, "ewc_loss": 0.05298136919736862, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022463788627646863, "grad_norm": 6.250120639801025, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.87413489818573, "num_tokens": 336931171.0, "step": 8834 }, { "epoch": 1.1239028113471567, "ewc_loss": 0.05303860455751419, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022521024220623076, "grad_norm": 6.276971817016602, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8744194507598877, "num_tokens": 336963080.0, "step": 8835 }, { "epoch": 1.1240300216257473, "ewc_loss": 0.05298115313053131, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002246357616968453, "grad_norm": 6.267637252807617, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8491498231887817, "num_tokens": 337000817.0, "step": 8836 }, { "epoch": 1.1241572319043378, "ewc_loss": 0.05300690606236458, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022489327238872647, "grad_norm": 6.292348384857178, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8568419814109802, "num_tokens": 337037249.0, "step": 8837 }, { "epoch": 1.1242844421829283, "ewc_loss": 0.05302226543426514, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002250468824058771, "grad_norm": 6.223833084106445, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8717106580734253, "num_tokens": 337077752.0, "step": 8838 }, { "epoch": 1.1244116524615189, "ewc_loss": 0.052999064326286316, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022481488122139126, "grad_norm": 6.201918125152588, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8599547147750854, "num_tokens": 337119649.0, "step": 8839 }, { "epoch": 1.1245388627401094, "ewc_loss": 0.05305136740207672, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.000225337891606614, "grad_norm": 6.27778434753418, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8481755256652832, "num_tokens": 337154376.0, "step": 8840 }, { "epoch": 1.1246660730187, "ewc_loss": 0.05355061590671539, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022544758394360542, "grad_norm": 6.258613586425781, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8726016283035278, "num_tokens": 337193377.0, "step": 8841 }, { "epoch": 1.1247932832972904, "ewc_loss": 0.05301448702812195, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002249691024189815, "grad_norm": 6.1901164054870605, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8536300659179688, "num_tokens": 337233176.0, "step": 8842 }, { "epoch": 1.124920493575881, "ewc_loss": 0.05305769294500351, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022540116333402693, "grad_norm": 6.305773735046387, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8500946164131165, "num_tokens": 337262995.0, "step": 8843 }, { "epoch": 1.1250477038544715, "ewc_loss": 0.05298725515604019, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002246967633254826, "grad_norm": 6.201717376708984, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.863135576248169, "num_tokens": 337302466.0, "step": 8844 }, { "epoch": 1.125174914133062, "ewc_loss": 0.05303303152322769, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002251545520266518, "grad_norm": 6.227329730987549, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8750032186508179, "num_tokens": 337341197.0, "step": 8845 }, { "epoch": 1.1253021244116526, "ewc_loss": 0.05304292216897011, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022525344684254378, "grad_norm": 6.218319892883301, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8665554523468018, "num_tokens": 337375602.0, "step": 8846 }, { "epoch": 1.125429334690243, "ewc_loss": 0.05302209407091141, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022504516527988017, "grad_norm": 6.257268905639648, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8541709184646606, "num_tokens": 337406685.0, "step": 8847 }, { "epoch": 1.1255565449688334, "ewc_loss": 0.05303528532385826, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022517706383951008, "grad_norm": 6.159422397613525, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.858942985534668, "num_tokens": 337447364.0, "step": 8848 }, { "epoch": 1.125683755247424, "ewc_loss": 0.05307828262448311, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022560704383067787, "grad_norm": 6.228196620941162, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8628643155097961, "num_tokens": 337487840.0, "step": 8849 }, { "epoch": 1.1258109655260145, "ewc_loss": 0.053069304674863815, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022551727306563407, "grad_norm": 6.226533889770508, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8681938052177429, "num_tokens": 337525117.0, "step": 8850 }, { "epoch": 1.125938175804605, "ewc_loss": 0.05304940789937973, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022531827562488616, "grad_norm": 6.167526721954346, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8562023639678955, "num_tokens": 337567024.0, "step": 8851 }, { "epoch": 1.1260653860831955, "ewc_loss": 0.053096842020750046, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022579263895750046, "grad_norm": 6.2496538162231445, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8629435300827026, "num_tokens": 337600117.0, "step": 8852 }, { "epoch": 1.126192596361786, "ewc_loss": 0.05302325636148453, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022505680681206286, "grad_norm": 6.158494472503662, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8483080863952637, "num_tokens": 337637188.0, "step": 8853 }, { "epoch": 1.1263198066403766, "ewc_loss": 0.05316845327615738, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022650875325780362, "grad_norm": 6.228524208068848, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8594664335250854, "num_tokens": 337676492.0, "step": 8854 }, { "epoch": 1.126447016918967, "ewc_loss": 0.05304130166769028, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022523722145706415, "grad_norm": 6.209891319274902, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8779231309890747, "num_tokens": 337713468.0, "step": 8855 }, { "epoch": 1.1265742271975576, "ewc_loss": 0.05311810225248337, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022600522788707167, "grad_norm": 6.249143123626709, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8499577045440674, "num_tokens": 337750944.0, "step": 8856 }, { "epoch": 1.1267014374761481, "ewc_loss": 0.05308224633336067, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022564668324775994, "grad_norm": 6.182262420654297, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8778272271156311, "num_tokens": 337793802.0, "step": 8857 }, { "epoch": 1.1268286477547387, "ewc_loss": 0.05307808518409729, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022560505021829158, "grad_norm": 6.311548709869385, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.87562096118927, "num_tokens": 337828550.0, "step": 8858 }, { "epoch": 1.126955858033329, "ewc_loss": 0.053108032792806625, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002259045431856066, "grad_norm": 6.231573104858398, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8731605410575867, "num_tokens": 337862168.0, "step": 8859 }, { "epoch": 1.1270830683119195, "ewc_loss": 0.053028956055641174, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022511377756018192, "grad_norm": 6.244175434112549, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8601923584938049, "num_tokens": 337898837.0, "step": 8860 }, { "epoch": 1.12721027859051, "ewc_loss": 0.05298542603850365, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022467847156804055, "grad_norm": 6.2357869148254395, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8503137230873108, "num_tokens": 337939608.0, "step": 8861 }, { "epoch": 1.1273374888691006, "ewc_loss": 0.05306382477283478, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022546248510479927, "grad_norm": 6.208009719848633, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8629216551780701, "num_tokens": 337983412.0, "step": 8862 }, { "epoch": 1.127464699147691, "ewc_loss": 0.05302715301513672, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022509574773721397, "grad_norm": 6.270103454589844, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8727495670318604, "num_tokens": 338022171.0, "step": 8863 }, { "epoch": 1.1275919094262816, "ewc_loss": 0.05299399048089981, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022476413869298995, "grad_norm": 6.208774566650391, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8596978187561035, "num_tokens": 338058838.0, "step": 8864 }, { "epoch": 1.1277191197048722, "ewc_loss": 0.053070396184921265, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022552820155397058, "grad_norm": 6.259033679962158, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8586949110031128, "num_tokens": 338095080.0, "step": 8865 }, { "epoch": 1.1278463299834627, "ewc_loss": 0.053028348833322525, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002251077094115317, "grad_norm": 6.25583553314209, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8466441631317139, "num_tokens": 338134206.0, "step": 8866 }, { "epoch": 1.1279735402620532, "ewc_loss": 0.05298204720020294, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002246446965727955, "grad_norm": 6.177136421203613, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8692564368247986, "num_tokens": 338173794.0, "step": 8867 }, { "epoch": 1.1281007505406437, "ewc_loss": 0.053035199642181396, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022517623438034207, "grad_norm": 6.279837608337402, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8599966764450073, "num_tokens": 338206591.0, "step": 8868 }, { "epoch": 1.1282279608192343, "ewc_loss": 0.05301760882139206, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002250003017252311, "grad_norm": 6.213892936706543, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.876008152961731, "num_tokens": 338238728.0, "step": 8869 }, { "epoch": 1.1283551710978248, "ewc_loss": 0.053090475499629974, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022572895977646112, "grad_norm": 6.223247051239014, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.873275876045227, "num_tokens": 338282372.0, "step": 8870 }, { "epoch": 1.1284823813764153, "ewc_loss": 0.05304649472236633, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022528917179442942, "grad_norm": 6.237715721130371, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8577494621276855, "num_tokens": 338319175.0, "step": 8871 }, { "epoch": 1.1286095916550056, "ewc_loss": 0.05302467197179794, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002250709367217496, "grad_norm": 6.187327861785889, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8604108095169067, "num_tokens": 338357967.0, "step": 8872 }, { "epoch": 1.1287368019335962, "ewc_loss": 0.05309351161122322, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022575934417545795, "grad_norm": 6.305447578430176, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8696414828300476, "num_tokens": 338391431.0, "step": 8873 }, { "epoch": 1.1288640122121867, "ewc_loss": 0.05303444713354111, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022516868193633854, "grad_norm": 6.178247928619385, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.871853232383728, "num_tokens": 338426298.0, "step": 8874 }, { "epoch": 1.1289912224907772, "ewc_loss": 0.05309388414025307, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002257630549138412, "grad_norm": 6.277141571044922, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8453971147537231, "num_tokens": 338468147.0, "step": 8875 }, { "epoch": 1.1291184327693677, "ewc_loss": 0.05303201451897621, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022514436568599194, "grad_norm": 6.211087226867676, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8465209007263184, "num_tokens": 338517095.0, "step": 8876 }, { "epoch": 1.1292456430479583, "ewc_loss": 0.05308697372674942, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022569394786842167, "grad_norm": 6.239711761474609, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8693836331367493, "num_tokens": 338549925.0, "step": 8877 }, { "epoch": 1.1293728533265488, "ewc_loss": 0.05309499055147171, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022577412892132998, "grad_norm": 6.25300407409668, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.866470456123352, "num_tokens": 338583966.0, "step": 8878 }, { "epoch": 1.1295000636051393, "ewc_loss": 0.05306267738342285, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022545101819559932, "grad_norm": 6.253029823303223, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.867233395576477, "num_tokens": 338622715.0, "step": 8879 }, { "epoch": 1.1296272738837299, "ewc_loss": 0.05301766097545624, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022500082559417933, "grad_norm": 6.2086334228515625, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8495475649833679, "num_tokens": 338659920.0, "step": 8880 }, { "epoch": 1.1297544841623204, "ewc_loss": 0.05329360440373421, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002253188577014953, "grad_norm": 6.309441089630127, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8572450876235962, "num_tokens": 338698183.0, "step": 8881 }, { "epoch": 1.129881694440911, "ewc_loss": 0.053232137113809586, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022470418480224907, "grad_norm": 6.271639823913574, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8519717454910278, "num_tokens": 338735853.0, "step": 8882 }, { "epoch": 1.1300089047195012, "ewc_loss": 0.05331911891698837, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022557398187927902, "grad_norm": 6.259319305419922, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8533563613891602, "num_tokens": 338771714.0, "step": 8883 }, { "epoch": 1.1301361149980917, "ewc_loss": 0.05323874577879906, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002247702650493011, "grad_norm": 6.292876720428467, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.881416916847229, "num_tokens": 338813823.0, "step": 8884 }, { "epoch": 1.1302633252766823, "ewc_loss": 0.053066134452819824, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002254855789942667, "grad_norm": 6.160821914672852, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8668347597122192, "num_tokens": 338858137.0, "step": 8885 }, { "epoch": 1.1303905355552728, "ewc_loss": 0.05305018275976181, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022532603179570287, "grad_norm": 6.2915120124816895, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8523589968681335, "num_tokens": 338893653.0, "step": 8886 }, { "epoch": 1.1305177458338633, "ewc_loss": 0.05300682783126831, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022489248658530414, "grad_norm": 6.313039779663086, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8437714576721191, "num_tokens": 338931230.0, "step": 8887 }, { "epoch": 1.1306449561124539, "ewc_loss": 0.05302811414003372, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022510538110509515, "grad_norm": 6.215280055999756, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8642228841781616, "num_tokens": 338966921.0, "step": 8888 }, { "epoch": 1.1307721663910444, "ewc_loss": 0.053023241460323334, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022505664674099535, "grad_norm": 6.3298234939575195, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.84526526927948, "num_tokens": 339000957.0, "step": 8889 }, { "epoch": 1.130899376669635, "ewc_loss": 0.0529898926615715, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002247231313958764, "grad_norm": 6.187469959259033, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8547696471214294, "num_tokens": 339035876.0, "step": 8890 }, { "epoch": 1.1310265869482254, "ewc_loss": 0.05305038020014763, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022532802540808916, "grad_norm": 6.233328819274902, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8639142513275146, "num_tokens": 339073700.0, "step": 8891 }, { "epoch": 1.131153797226816, "ewc_loss": 0.053258299827575684, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022496578458230942, "grad_norm": 6.238419055938721, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.866952657699585, "num_tokens": 339108222.0, "step": 8892 }, { "epoch": 1.1312810075054065, "ewc_loss": 0.05308106541633606, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022563489619642496, "grad_norm": 6.197398662567139, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.874732494354248, "num_tokens": 339145749.0, "step": 8893 }, { "epoch": 1.131408217783997, "ewc_loss": 0.05309094488620758, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022573368914891034, "grad_norm": 6.292672157287598, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8488044142723083, "num_tokens": 339183852.0, "step": 8894 }, { "epoch": 1.1315354280625876, "ewc_loss": 0.05304955691099167, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002253197890240699, "grad_norm": 6.190682411193848, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8773791193962097, "num_tokens": 339218766.0, "step": 8895 }, { "epoch": 1.131662638341178, "ewc_loss": 0.05306033045053482, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002254275168525055, "grad_norm": 6.207425117492676, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8640536665916443, "num_tokens": 339259932.0, "step": 8896 }, { "epoch": 1.1317898486197684, "ewc_loss": 0.05308675393462181, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002256917505292222, "grad_norm": 6.228228569030762, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8680514693260193, "num_tokens": 339302201.0, "step": 8897 }, { "epoch": 1.131917058898359, "ewc_loss": 0.05307838320732117, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022560804791282862, "grad_norm": 6.228756427764893, "learning_rate": 1e-06, "loss": 0.5646, "mean_token_accuracy": 0.8345412611961365, "num_tokens": 339342091.0, "step": 8898 }, { "epoch": 1.1320442691769494, "ewc_loss": 0.0530930832028389, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022575506591238081, "grad_norm": 6.188096523284912, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8577377200126648, "num_tokens": 339379951.0, "step": 8899 }, { "epoch": 1.13217147945554, "ewc_loss": 0.053157608956098557, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022640031238552183, "grad_norm": 6.2354326248168945, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.857454776763916, "num_tokens": 339422243.0, "step": 8900 }, { "epoch": 1.1322986897341305, "ewc_loss": 0.053095556795597076, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002257797896163538, "grad_norm": 6.134789943695068, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8775726556777954, "num_tokens": 339464969.0, "step": 8901 }, { "epoch": 1.132425900012721, "ewc_loss": 0.05342502146959305, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022663301206193864, "grad_norm": 6.254946708679199, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8660749793052673, "num_tokens": 339502382.0, "step": 8902 }, { "epoch": 1.1325531102913116, "ewc_loss": 0.053100354969501495, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022582776728086174, "grad_norm": 6.203829288482666, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8487122058868408, "num_tokens": 339542875.0, "step": 8903 }, { "epoch": 1.132680320569902, "ewc_loss": 0.053162217140197754, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002264464128529653, "grad_norm": 6.231481552124023, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8489202260971069, "num_tokens": 339580776.0, "step": 8904 }, { "epoch": 1.1328075308484926, "ewc_loss": 0.05317254364490509, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022654964413959533, "grad_norm": 6.202417373657227, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8759481310844421, "num_tokens": 339620195.0, "step": 8905 }, { "epoch": 1.1329347411270831, "ewc_loss": 0.05318835377693176, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022670775069855154, "grad_norm": 6.242104530334473, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8707019686698914, "num_tokens": 339659234.0, "step": 8906 }, { "epoch": 1.1330619514056737, "ewc_loss": 0.05313989147543907, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022622314281761646, "grad_norm": 6.171363353729248, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8536832332611084, "num_tokens": 339703259.0, "step": 8907 }, { "epoch": 1.133189161684264, "ewc_loss": 0.053155671805143356, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022638094378635287, "grad_norm": 6.232374668121338, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8709525465965271, "num_tokens": 339745901.0, "step": 8908 }, { "epoch": 1.1333163719628545, "ewc_loss": 0.05308688431978226, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002256930893054232, "grad_norm": 6.210200309753418, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.853569507598877, "num_tokens": 339790417.0, "step": 8909 }, { "epoch": 1.133443582241445, "ewc_loss": 0.05311165750026703, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022594082111027092, "grad_norm": 6.285037040710449, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8553105592727661, "num_tokens": 339827224.0, "step": 8910 }, { "epoch": 1.1335707925200356, "ewc_loss": 0.05306015908718109, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022542578517459333, "grad_norm": 6.244053363800049, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8581466674804688, "num_tokens": 339859932.0, "step": 8911 }, { "epoch": 1.133698002798626, "ewc_loss": 0.05308088660240173, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022563309175893664, "grad_norm": 6.3308610916137695, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8512378931045532, "num_tokens": 339902416.0, "step": 8912 }, { "epoch": 1.1338252130772166, "ewc_loss": 0.05303692817687988, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002251934929518029, "grad_norm": 6.175198554992676, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8708301782608032, "num_tokens": 339940495.0, "step": 8913 }, { "epoch": 1.1339524233558071, "ewc_loss": 0.05300632864236832, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022488748072646558, "grad_norm": 6.194080829620361, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8455374836921692, "num_tokens": 339983010.0, "step": 8914 }, { "epoch": 1.1340796336343977, "ewc_loss": 0.053115181624889374, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022597603674512357, "grad_norm": 6.2638349533081055, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.863533079624176, "num_tokens": 340020683.0, "step": 8915 }, { "epoch": 1.1342068439129882, "ewc_loss": 0.05303434655070305, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022516769240610301, "grad_norm": 6.191486835479736, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8548687696456909, "num_tokens": 340063179.0, "step": 8916 }, { "epoch": 1.1343340541915787, "ewc_loss": 0.053086455911397934, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002256887819385156, "grad_norm": 6.237528324127197, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8729026317596436, "num_tokens": 340100166.0, "step": 8917 }, { "epoch": 1.1344612644701693, "ewc_loss": 0.05303690582513809, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022519327467307448, "grad_norm": 6.169184684753418, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8585917949676514, "num_tokens": 340144325.0, "step": 8918 }, { "epoch": 1.1345884747487598, "ewc_loss": 0.05310923233628273, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022591653396375477, "grad_norm": 6.298069477081299, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8620348572731018, "num_tokens": 340180563.0, "step": 8919 }, { "epoch": 1.1347156850273503, "ewc_loss": 0.053093910217285156, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002257633168483153, "grad_norm": 6.193615436553955, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8506463170051575, "num_tokens": 340224621.0, "step": 8920 }, { "epoch": 1.1348428953059406, "ewc_loss": 0.053087614476680756, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022570036526303738, "grad_norm": 6.2177348136901855, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8690294027328491, "num_tokens": 340264810.0, "step": 8921 }, { "epoch": 1.1349701055845312, "ewc_loss": 0.05300723761320114, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002248965756734833, "grad_norm": 6.207254409790039, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.874840259552002, "num_tokens": 340301158.0, "step": 8922 }, { "epoch": 1.1350973158631217, "ewc_loss": 0.05320003256201744, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002268245443701744, "grad_norm": 6.307535648345947, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8631259799003601, "num_tokens": 340339806.0, "step": 8923 }, { "epoch": 1.1352245261417122, "ewc_loss": 0.053075067698955536, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022557488409802318, "grad_norm": 6.165982723236084, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8682767152786255, "num_tokens": 340386412.0, "step": 8924 }, { "epoch": 1.1353517364203027, "ewc_loss": 0.053172044456005096, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.000226544652832672, "grad_norm": 6.242110729217529, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8722100853919983, "num_tokens": 340431391.0, "step": 8925 }, { "epoch": 1.1354789466988933, "ewc_loss": 0.05311556160449982, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002259798493469134, "grad_norm": 6.233489990234375, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8666595220565796, "num_tokens": 340465904.0, "step": 8926 }, { "epoch": 1.1356061569774838, "ewc_loss": 0.05321807414293289, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022700497356709093, "grad_norm": 6.263796329498291, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8681640028953552, "num_tokens": 340504290.0, "step": 8927 }, { "epoch": 1.1357333672560743, "ewc_loss": 0.053166162222623825, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022648584854323417, "grad_norm": 6.270008563995361, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8566718697547913, "num_tokens": 340539100.0, "step": 8928 }, { "epoch": 1.1358605775346649, "ewc_loss": 0.05315052717924118, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022632951731793582, "grad_norm": 6.301268577575684, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8786380290985107, "num_tokens": 340569005.0, "step": 8929 }, { "epoch": 1.1359877878132554, "ewc_loss": 0.0531587153673172, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022641135728918016, "grad_norm": 6.213343620300293, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.861839234828949, "num_tokens": 340613476.0, "step": 8930 }, { "epoch": 1.136114998091846, "ewc_loss": 0.05313761532306671, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022620036907028407, "grad_norm": 6.286770820617676, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8558217287063599, "num_tokens": 340647982.0, "step": 8931 }, { "epoch": 1.1362422083704362, "ewc_loss": 0.05309365689754486, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022576081391889602, "grad_norm": 6.299668312072754, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.84052973985672, "num_tokens": 340683270.0, "step": 8932 }, { "epoch": 1.1363694186490267, "ewc_loss": 0.05309773236513138, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022580151562578976, "grad_norm": 6.231970310211182, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8662101030349731, "num_tokens": 340719955.0, "step": 8933 }, { "epoch": 1.1364966289276173, "ewc_loss": 0.053090568631887436, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022572990565095097, "grad_norm": 6.4603447914123535, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8637524843215942, "num_tokens": 340753934.0, "step": 8934 }, { "epoch": 1.1366238392062078, "ewc_loss": 0.05309673026204109, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022579151846002787, "grad_norm": 6.231036186218262, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8690544366836548, "num_tokens": 340787545.0, "step": 8935 }, { "epoch": 1.1367510494847983, "ewc_loss": 0.053139813244342804, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022622237156610936, "grad_norm": 6.279176235198975, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8722816705703735, "num_tokens": 340826362.0, "step": 8936 }, { "epoch": 1.1368782597633889, "ewc_loss": 0.053097423166036606, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022579844517167658, "grad_norm": 6.268017292022705, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8671413064002991, "num_tokens": 340857871.0, "step": 8937 }, { "epoch": 1.1370054700419794, "ewc_loss": 0.05360648036003113, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022600620286539197, "grad_norm": 6.223940849304199, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8695825338363647, "num_tokens": 340897736.0, "step": 8938 }, { "epoch": 1.13713268032057, "ewc_loss": 0.053094957023859024, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002257737796753645, "grad_norm": 6.241953372955322, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8582621812820435, "num_tokens": 340935774.0, "step": 8939 }, { "epoch": 1.1372598905991604, "ewc_loss": 0.053102850914001465, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022585275291930884, "grad_norm": 6.2455153465271, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8653439283370972, "num_tokens": 340971497.0, "step": 8940 }, { "epoch": 1.137387100877751, "ewc_loss": 0.05363178253173828, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022625924611929804, "grad_norm": 6.29899787902832, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8624188303947449, "num_tokens": 341016209.0, "step": 8941 }, { "epoch": 1.1375143111563415, "ewc_loss": 0.05310511589050293, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002258753520436585, "grad_norm": 6.241498947143555, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8576256036758423, "num_tokens": 341052137.0, "step": 8942 }, { "epoch": 1.137641521434932, "ewc_loss": 0.05310690402984619, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022589328000321984, "grad_norm": 6.339614391326904, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8668310642242432, "num_tokens": 341083417.0, "step": 8943 }, { "epoch": 1.1377687317135226, "ewc_loss": 0.05355337634682655, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002254751743748784, "grad_norm": 6.207723140716553, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.865689754486084, "num_tokens": 341127798.0, "step": 8944 }, { "epoch": 1.137895941992113, "ewc_loss": 0.053070783615112305, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022553204325959086, "grad_norm": 6.288815975189209, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8493925929069519, "num_tokens": 341173485.0, "step": 8945 }, { "epoch": 1.1380231522707034, "ewc_loss": 0.05300513654947281, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022487560636363924, "grad_norm": 6.258886337280273, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8522006273269653, "num_tokens": 341212834.0, "step": 8946 }, { "epoch": 1.138150362549294, "ewc_loss": 0.053555071353912354, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022549211280420423, "grad_norm": 6.306343078613281, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8560550212860107, "num_tokens": 341246497.0, "step": 8947 }, { "epoch": 1.1382775728278844, "ewc_loss": 0.05295651778578758, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022438939777202904, "grad_norm": 6.169130802154541, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8473882675170898, "num_tokens": 341291498.0, "step": 8948 }, { "epoch": 1.138404783106475, "ewc_loss": 0.05310411751270294, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022586536942981184, "grad_norm": 6.3271331787109375, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8564580678939819, "num_tokens": 341328104.0, "step": 8949 }, { "epoch": 1.1385319933850655, "ewc_loss": 0.05351332575082779, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002250746765639633, "grad_norm": 6.217136859893799, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.87556391954422, "num_tokens": 341359042.0, "step": 8950 }, { "epoch": 1.138659203663656, "ewc_loss": 0.053071439266204834, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022553862072527409, "grad_norm": 6.247352123260498, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8630158305168152, "num_tokens": 341403529.0, "step": 8951 }, { "epoch": 1.1387864139422466, "ewc_loss": 0.053540848195552826, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022534986783284694, "grad_norm": 6.2639641761779785, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8686866760253906, "num_tokens": 341442509.0, "step": 8952 }, { "epoch": 1.138913624220837, "ewc_loss": 0.05303312838077545, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002251555270049721, "grad_norm": 6.182212829589844, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8603257536888123, "num_tokens": 341483015.0, "step": 8953 }, { "epoch": 1.1390408344994276, "ewc_loss": 0.05307517200708389, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002255759172840044, "grad_norm": 6.256675720214844, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8619593977928162, "num_tokens": 341519676.0, "step": 8954 }, { "epoch": 1.1391680447780181, "ewc_loss": 0.05305548757314682, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002253790880786255, "grad_norm": 6.19429349899292, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8628132939338684, "num_tokens": 341563483.0, "step": 8955 }, { "epoch": 1.1392952550566087, "ewc_loss": 0.053125638514757156, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002260806068079546, "grad_norm": 6.310670852661133, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8547459840774536, "num_tokens": 341598387.0, "step": 8956 }, { "epoch": 1.139422465335199, "ewc_loss": 0.05305398255586624, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022536404139827937, "grad_norm": 6.185384273529053, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8667263984680176, "num_tokens": 341640253.0, "step": 8957 }, { "epoch": 1.1395496756137895, "ewc_loss": 0.053082674741744995, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022565099061466753, "grad_norm": 6.257961750030518, "learning_rate": 1e-06, "loss": 0.3745, "mean_token_accuracy": 0.8895504474639893, "num_tokens": 341675788.0, "step": 8958 }, { "epoch": 1.13967688589238, "ewc_loss": 0.053070664405822754, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002255308791063726, "grad_norm": 6.198949337005615, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8627254962921143, "num_tokens": 341716917.0, "step": 8959 }, { "epoch": 1.1398040961709706, "ewc_loss": 0.05306420475244522, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022546628315467387, "grad_norm": 6.249670505523682, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8620246052742004, "num_tokens": 341758936.0, "step": 8960 }, { "epoch": 1.139931306449561, "ewc_loss": 0.053082425147295, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022564847313333303, "grad_norm": 6.358318328857422, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8390754461288452, "num_tokens": 341797612.0, "step": 8961 }, { "epoch": 1.1400585167281516, "ewc_loss": 0.05330035090446472, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022538630582857877, "grad_norm": 6.22810173034668, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8587148189544678, "num_tokens": 341836061.0, "step": 8962 }, { "epoch": 1.1401857270067421, "ewc_loss": 0.0531076155602932, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022590036678593606, "grad_norm": 6.315943717956543, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8622472882270813, "num_tokens": 341875465.0, "step": 8963 }, { "epoch": 1.1403129372853327, "ewc_loss": 0.053028352558612823, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022510775306727737, "grad_norm": 6.207709312438965, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8564125299453735, "num_tokens": 341919795.0, "step": 8964 }, { "epoch": 1.1404401475639232, "ewc_loss": 0.05316225066781044, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002264467184431851, "grad_norm": 6.337961196899414, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8616567850112915, "num_tokens": 341953291.0, "step": 8965 }, { "epoch": 1.1405673578425137, "ewc_loss": 0.0530007928609848, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022483215434476733, "grad_norm": 6.263103008270264, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8808863162994385, "num_tokens": 341988364.0, "step": 8966 }, { "epoch": 1.1406945681211043, "ewc_loss": 0.05311126261949539, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002259368629893288, "grad_norm": 6.2586212158203125, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8660120368003845, "num_tokens": 342028478.0, "step": 8967 }, { "epoch": 1.1408217783996948, "ewc_loss": 0.05300132557749748, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022483748034574091, "grad_norm": 6.254641532897949, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8703891038894653, "num_tokens": 342067348.0, "step": 8968 }, { "epoch": 1.1409489886782853, "ewc_loss": 0.0532851442694664, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022523428197018802, "grad_norm": 6.292899131774902, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8595187664031982, "num_tokens": 342105845.0, "step": 8969 }, { "epoch": 1.1410761989568756, "ewc_loss": 0.05353764444589615, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022531782451551408, "grad_norm": 6.2636542320251465, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8629573583602905, "num_tokens": 342142072.0, "step": 8970 }, { "epoch": 1.1412034092354661, "ewc_loss": 0.05298798531293869, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022470405383501202, "grad_norm": 6.243171215057373, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8653274774551392, "num_tokens": 342176592.0, "step": 8971 }, { "epoch": 1.1413306195140567, "ewc_loss": 0.0533808209002018, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022619102674070746, "grad_norm": 6.311068058013916, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8672478199005127, "num_tokens": 342211896.0, "step": 8972 }, { "epoch": 1.1414578297926472, "ewc_loss": 0.05311490595340729, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022597330098506063, "grad_norm": 6.239580154418945, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.866532564163208, "num_tokens": 342256078.0, "step": 8973 }, { "epoch": 1.1415850400712377, "ewc_loss": 0.05311130732297897, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022593731409870088, "grad_norm": 6.29947566986084, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8654963970184326, "num_tokens": 342296766.0, "step": 8974 }, { "epoch": 1.1417122503498283, "ewc_loss": 0.053074322640895844, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022556741896551102, "grad_norm": 6.200006008148193, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.861280083656311, "num_tokens": 342333689.0, "step": 8975 }, { "epoch": 1.1418394606284188, "ewc_loss": 0.053208157420158386, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022690581681672484, "grad_norm": 6.351011276245117, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8736903071403503, "num_tokens": 342369605.0, "step": 8976 }, { "epoch": 1.1419666709070093, "ewc_loss": 0.053160376846790314, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022642797557637095, "grad_norm": 6.253293037414551, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8719140291213989, "num_tokens": 342410806.0, "step": 8977 }, { "epoch": 1.1420938811855998, "ewc_loss": 0.05313512310385704, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002261754561914131, "grad_norm": 6.251682281494141, "learning_rate": 1e-06, "loss": 0.3747, "mean_token_accuracy": 0.8888567090034485, "num_tokens": 342446957.0, "step": 8978 }, { "epoch": 1.1422210914641904, "ewc_loss": 0.05318589508533478, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022668318706564605, "grad_norm": 6.335289478302002, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8506374359130859, "num_tokens": 342480717.0, "step": 8979 }, { "epoch": 1.142348301742781, "ewc_loss": 0.053162187337875366, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022644607815891504, "grad_norm": 6.250832557678223, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8709464073181152, "num_tokens": 342520431.0, "step": 8980 }, { "epoch": 1.1424755120213712, "ewc_loss": 0.053187474608421326, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022669894678983837, "grad_norm": 6.309600830078125, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8595278263092041, "num_tokens": 342561735.0, "step": 8981 }, { "epoch": 1.1426027222999617, "ewc_loss": 0.05317116528749466, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022653584892395884, "grad_norm": 6.308705806732178, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8751806020736694, "num_tokens": 342593899.0, "step": 8982 }, { "epoch": 1.1427299325785523, "ewc_loss": 0.053343720734119415, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002258199965581298, "grad_norm": 6.276787281036377, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8627129793167114, "num_tokens": 342628044.0, "step": 8983 }, { "epoch": 1.1428571428571428, "ewc_loss": 0.05314664542675018, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022629066370427608, "grad_norm": 6.2792649269104, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.859997570514679, "num_tokens": 342665018.0, "step": 8984 }, { "epoch": 1.1429843531357333, "ewc_loss": 0.05315227806568146, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002263469941681251, "grad_norm": 6.264481544494629, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8518896698951721, "num_tokens": 342704832.0, "step": 8985 }, { "epoch": 1.1431115634143239, "ewc_loss": 0.05315390229225159, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002263632632093504, "grad_norm": 6.2582478523254395, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8719043135643005, "num_tokens": 342741598.0, "step": 8986 }, { "epoch": 1.1432387736929144, "ewc_loss": 0.05344206094741821, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022680342954117805, "grad_norm": 6.294154644012451, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8587285876274109, "num_tokens": 342777302.0, "step": 8987 }, { "epoch": 1.143365983971505, "ewc_loss": 0.05322468280792236, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022707105381414294, "grad_norm": 6.2998270988464355, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8433670401573181, "num_tokens": 342809578.0, "step": 8988 }, { "epoch": 1.1434931942500954, "ewc_loss": 0.05343412607908249, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022672406339552253, "grad_norm": 6.22136116027832, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8638485670089722, "num_tokens": 342845809.0, "step": 8989 }, { "epoch": 1.143620404528686, "ewc_loss": 0.05379072576761246, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022784866450820118, "grad_norm": 6.319490432739258, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.849997878074646, "num_tokens": 342886118.0, "step": 8990 }, { "epoch": 1.1437476148072765, "ewc_loss": 0.053423263132572174, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022661541879642755, "grad_norm": 6.23473596572876, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8572814464569092, "num_tokens": 342922517.0, "step": 8991 }, { "epoch": 1.143874825085867, "ewc_loss": 0.05351819843053818, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022756477119401097, "grad_norm": 6.279850959777832, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8568217754364014, "num_tokens": 342959741.0, "step": 8992 }, { "epoch": 1.1440020353644575, "ewc_loss": 0.053470611572265625, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022708892356604338, "grad_norm": 6.304694652557373, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8762820959091187, "num_tokens": 342991418.0, "step": 8993 }, { "epoch": 1.144129245643048, "ewc_loss": 0.05325201153755188, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022734432423021644, "grad_norm": 6.3290019035339355, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8803701400756836, "num_tokens": 343024263.0, "step": 8994 }, { "epoch": 1.1442564559216384, "ewc_loss": 0.05349935591220856, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022737635299563408, "grad_norm": 6.25787878036499, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8630849123001099, "num_tokens": 343066388.0, "step": 8995 }, { "epoch": 1.144383666200229, "ewc_loss": 0.0535135380923748, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022751819051336497, "grad_norm": 6.2552714347839355, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8651546239852905, "num_tokens": 343107631.0, "step": 8996 }, { "epoch": 1.1445108764788194, "ewc_loss": 0.05353260785341263, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022770886425860226, "grad_norm": 6.310699939727783, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8696095943450928, "num_tokens": 343143645.0, "step": 8997 }, { "epoch": 1.14463808675741, "ewc_loss": 0.05349180847406387, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.000227300901315175, "grad_norm": 6.259992599487305, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8494526147842407, "num_tokens": 343182076.0, "step": 8998 }, { "epoch": 1.1447652970360005, "ewc_loss": 0.05355791375041008, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022796195116825402, "grad_norm": 6.229430675506592, "learning_rate": 1e-06, "loss": 0.3937, "mean_token_accuracy": 0.8815417885780334, "num_tokens": 343222808.0, "step": 8999 }, { "epoch": 1.144892507314591, "ewc_loss": 0.05353229492902756, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002277057501487434, "grad_norm": 6.260498523712158, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8591668605804443, "num_tokens": 343268229.0, "step": 9000 }, { "epoch": 1.1450197175931816, "ewc_loss": 0.0535091757774353, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002274745493195951, "grad_norm": 6.324460983276367, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8554757833480835, "num_tokens": 343304459.0, "step": 9001 }, { "epoch": 1.145146927871772, "ewc_loss": 0.05349639803171158, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002273467689519748, "grad_norm": 6.278068542480469, "learning_rate": 1e-06, "loss": 0.5837, "mean_token_accuracy": 0.8250130414962769, "num_tokens": 343343211.0, "step": 9002 }, { "epoch": 1.1452741381503626, "ewc_loss": 0.053498879075050354, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002273715945193544, "grad_norm": 6.25139045715332, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8599374294281006, "num_tokens": 343388498.0, "step": 9003 }, { "epoch": 1.1454013484289531, "ewc_loss": 0.05344254523515701, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022680826077703387, "grad_norm": 6.218076229095459, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8600994944572449, "num_tokens": 343430101.0, "step": 9004 }, { "epoch": 1.1455285587075437, "ewc_loss": 0.0534697026014328, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022707984317094088, "grad_norm": 6.320908069610596, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8510845303535461, "num_tokens": 343466454.0, "step": 9005 }, { "epoch": 1.145655768986134, "ewc_loss": 0.053187593817710876, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022670015459880233, "grad_norm": 6.239107131958008, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.877927303314209, "num_tokens": 343506960.0, "step": 9006 }, { "epoch": 1.1457829792647245, "ewc_loss": 0.05339992791414261, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022638210793957114, "grad_norm": 6.263699531555176, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8705431818962097, "num_tokens": 343541133.0, "step": 9007 }, { "epoch": 1.145910189543315, "ewc_loss": 0.053169477730989456, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002265189978061244, "grad_norm": 6.247162818908691, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8712743520736694, "num_tokens": 343580266.0, "step": 9008 }, { "epoch": 1.1460373998219056, "ewc_loss": 0.05344850942492485, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022686790907755494, "grad_norm": 6.312021255493164, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8613964319229126, "num_tokens": 343611691.0, "step": 9009 }, { "epoch": 1.146164610100496, "ewc_loss": 0.05372147262096405, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022715612431056798, "grad_norm": 6.2656402587890625, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8685659766197205, "num_tokens": 343649355.0, "step": 9010 }, { "epoch": 1.1462918203790866, "ewc_loss": 0.05363300442695618, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022627146972808987, "grad_norm": 6.223207473754883, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.876092791557312, "num_tokens": 343681606.0, "step": 9011 }, { "epoch": 1.1464190306576771, "ewc_loss": 0.05368723347783089, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022681374684907496, "grad_norm": 6.216078281402588, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8737977743148804, "num_tokens": 343721592.0, "step": 9012 }, { "epoch": 1.1465462409362677, "ewc_loss": 0.053278692066669464, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022761111904401332, "grad_norm": 6.227672100067139, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8602489829063416, "num_tokens": 343764084.0, "step": 9013 }, { "epoch": 1.1466734512148582, "ewc_loss": 0.053678080439567566, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022672222985420376, "grad_norm": 6.216371536254883, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8640424013137817, "num_tokens": 343801213.0, "step": 9014 }, { "epoch": 1.1468006614934487, "ewc_loss": 0.05377809703350067, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022772235388401896, "grad_norm": 6.306591510772705, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.86333829164505, "num_tokens": 343839761.0, "step": 9015 }, { "epoch": 1.1469278717720393, "ewc_loss": 0.05372941493988037, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022723556321579963, "grad_norm": 6.208903789520264, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8628451824188232, "num_tokens": 343877792.0, "step": 9016 }, { "epoch": 1.1470550820506298, "ewc_loss": 0.053364939987659454, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022847364016342908, "grad_norm": 6.266348838806152, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8686860799789429, "num_tokens": 343915107.0, "step": 9017 }, { "epoch": 1.1471822923292203, "ewc_loss": 0.05326889455318451, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022751315555069596, "grad_norm": 6.2987284660339355, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8671735525131226, "num_tokens": 343949783.0, "step": 9018 }, { "epoch": 1.1473095026078106, "ewc_loss": 0.053236618638038635, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022719037951901555, "grad_norm": 6.2482686042785645, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8446000814437866, "num_tokens": 343991058.0, "step": 9019 }, { "epoch": 1.1474367128864011, "ewc_loss": 0.053258851170539856, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022741271823178977, "grad_norm": 6.25662899017334, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8643059134483337, "num_tokens": 344030424.0, "step": 9020 }, { "epoch": 1.1475639231649917, "ewc_loss": 0.053240980952978134, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022723402071278542, "grad_norm": 6.276299953460693, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8644107580184937, "num_tokens": 344071385.0, "step": 9021 }, { "epoch": 1.1476911334435822, "ewc_loss": 0.053746700286865234, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022740841086488217, "grad_norm": 6.2330708503723145, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8722115755081177, "num_tokens": 344109725.0, "step": 9022 }, { "epoch": 1.1478183437221727, "ewc_loss": 0.05374595895409584, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002274009893881157, "grad_norm": 6.327205181121826, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8704374432563782, "num_tokens": 344143947.0, "step": 9023 }, { "epoch": 1.1479455540007633, "ewc_loss": 0.05328473448753357, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022767158225178719, "grad_norm": 6.342792510986328, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8530805110931396, "num_tokens": 344179969.0, "step": 9024 }, { "epoch": 1.1480727642793538, "ewc_loss": 0.053777121007442474, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022771261865273118, "grad_norm": 6.274357318878174, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8738232851028442, "num_tokens": 344212279.0, "step": 9025 }, { "epoch": 1.1481999745579443, "ewc_loss": 0.05378948152065277, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002278362080687657, "grad_norm": 6.349776268005371, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.869455099105835, "num_tokens": 344248402.0, "step": 9026 }, { "epoch": 1.1483271848365348, "ewc_loss": 0.053797610104084015, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022791748051531613, "grad_norm": 6.250135898590088, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8602570295333862, "num_tokens": 344286557.0, "step": 9027 }, { "epoch": 1.1484543951151254, "ewc_loss": 0.05374635010957718, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00022740490385331213, "grad_norm": 6.245929718017578, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8642305135726929, "num_tokens": 344331014.0, "step": 9028 }, { "epoch": 1.148581605393716, "ewc_loss": 0.05356607958674431, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022804360196460038, "grad_norm": 6.2781476974487305, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8596254587173462, "num_tokens": 344373239.0, "step": 9029 }, { "epoch": 1.1487088156723062, "ewc_loss": 0.05360252037644386, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022840801102574915, "grad_norm": 6.267834186553955, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8783875107765198, "num_tokens": 344409871.0, "step": 9030 }, { "epoch": 1.1488360259508967, "ewc_loss": 0.05360717698931694, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022845459170639515, "grad_norm": 6.244195461273193, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8542347550392151, "num_tokens": 344452108.0, "step": 9031 }, { "epoch": 1.1489632362294873, "ewc_loss": 0.05357944220304489, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022817723220214248, "grad_norm": 6.300928592681885, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8574445843696594, "num_tokens": 344489798.0, "step": 9032 }, { "epoch": 1.1490904465080778, "ewc_loss": 0.05355698615312576, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002279526524944231, "grad_norm": 6.273171901702881, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8719884157180786, "num_tokens": 344528194.0, "step": 9033 }, { "epoch": 1.1492176567866683, "ewc_loss": 0.05329533666372299, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002277775784023106, "grad_norm": 6.326435565948486, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8479254245758057, "num_tokens": 344559699.0, "step": 9034 }, { "epoch": 1.1493448670652588, "ewc_loss": 0.05328623577952385, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022768658527638763, "grad_norm": 6.237617015838623, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8564496636390686, "num_tokens": 344598234.0, "step": 9035 }, { "epoch": 1.1494720773438494, "ewc_loss": 0.05333787202835083, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022820295998826623, "grad_norm": 6.299179553985596, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8564584851264954, "num_tokens": 344634780.0, "step": 9036 }, { "epoch": 1.14959928762244, "ewc_loss": 0.0535285547375679, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002276683517266065, "grad_norm": 6.226394176483154, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8674896955490112, "num_tokens": 344677167.0, "step": 9037 }, { "epoch": 1.1497264979010304, "ewc_loss": 0.05335909128189087, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002284151123603806, "grad_norm": 6.356156826019287, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8500604629516602, "num_tokens": 344713824.0, "step": 9038 }, { "epoch": 1.149853708179621, "ewc_loss": 0.053334563970565796, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022816983982920647, "grad_norm": 6.273409366607666, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8502382040023804, "num_tokens": 344755814.0, "step": 9039 }, { "epoch": 1.1499809184582115, "ewc_loss": 0.0533599928021431, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022842411999590695, "grad_norm": 6.279087066650391, "learning_rate": 1e-06, "loss": 0.4018, "mean_token_accuracy": 0.8776900768280029, "num_tokens": 344795022.0, "step": 9040 }, { "epoch": 1.150108128736802, "ewc_loss": 0.05339566990733147, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022878091840539128, "grad_norm": 6.3336687088012695, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8722964525222778, "num_tokens": 344833906.0, "step": 9041 }, { "epoch": 1.1502353390153925, "ewc_loss": 0.05328647419810295, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022768895723856986, "grad_norm": 6.31975793838501, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8682698011398315, "num_tokens": 344875975.0, "step": 9042 }, { "epoch": 1.150362549293983, "ewc_loss": 0.05330806225538254, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002279048494528979, "grad_norm": 6.281798362731934, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8666571974754333, "num_tokens": 344915433.0, "step": 9043 }, { "epoch": 1.1504897595725734, "ewc_loss": 0.05327332392334938, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002275574515806511, "grad_norm": 6.3255462646484375, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8673219680786133, "num_tokens": 344949038.0, "step": 9044 }, { "epoch": 1.150616969851164, "ewc_loss": 0.05324169248342514, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022724116570316255, "grad_norm": 6.33357048034668, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8648563027381897, "num_tokens": 344988260.0, "step": 9045 }, { "epoch": 1.1507441801297544, "ewc_loss": 0.053260765969753265, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022743189765606076, "grad_norm": 6.304308891296387, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8757727146148682, "num_tokens": 345024355.0, "step": 9046 }, { "epoch": 1.150871390408345, "ewc_loss": 0.053270332515239716, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022752753284294158, "grad_norm": 6.331360816955566, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8721745014190674, "num_tokens": 345063041.0, "step": 9047 }, { "epoch": 1.1509986006869355, "ewc_loss": 0.053161825984716415, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022644248383585364, "grad_norm": 6.277644634246826, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8821312189102173, "num_tokens": 345097813.0, "step": 9048 }, { "epoch": 1.151125810965526, "ewc_loss": 0.05325260013341904, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002273502032039687, "grad_norm": 6.338598251342773, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.859413743019104, "num_tokens": 345134534.0, "step": 9049 }, { "epoch": 1.1512530212441165, "ewc_loss": 0.05319418013095856, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022676603111904114, "grad_norm": 6.2989702224731445, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8674193620681763, "num_tokens": 345171018.0, "step": 9050 }, { "epoch": 1.151380231522707, "ewc_loss": 0.0532451868057251, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022727610485162586, "grad_norm": 6.34226655960083, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8745501041412354, "num_tokens": 345208006.0, "step": 9051 }, { "epoch": 1.1515074418012976, "ewc_loss": 0.05317212641239166, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022654548229184002, "grad_norm": 6.243747234344482, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8810045123100281, "num_tokens": 345246616.0, "step": 9052 }, { "epoch": 1.1516346520798881, "ewc_loss": 0.0531991571187973, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002268158132210374, "grad_norm": 6.306727409362793, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8793817162513733, "num_tokens": 345288095.0, "step": 9053 }, { "epoch": 1.1517618623584787, "ewc_loss": 0.053236640989780426, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022719064145348966, "grad_norm": 6.2486138343811035, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.850317120552063, "num_tokens": 345333291.0, "step": 9054 }, { "epoch": 1.151889072637069, "ewc_loss": 0.05321367830038071, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022696102678310126, "grad_norm": 6.282948970794678, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8623775243759155, "num_tokens": 345375093.0, "step": 9055 }, { "epoch": 1.1520162829156595, "ewc_loss": 0.053248003125190735, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022730424825567752, "grad_norm": 6.28224515914917, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8581662178039551, "num_tokens": 345415547.0, "step": 9056 }, { "epoch": 1.15214349319425, "ewc_loss": 0.0532117560505867, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022694177459925413, "grad_norm": 6.330048561096191, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.868071436882019, "num_tokens": 345457340.0, "step": 9057 }, { "epoch": 1.1522707034728406, "ewc_loss": 0.0532059520483017, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002268837415613234, "grad_norm": 6.285475254058838, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8594350218772888, "num_tokens": 345494711.0, "step": 9058 }, { "epoch": 1.152397913751431, "ewc_loss": 0.05324897542595863, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022731396893505007, "grad_norm": 6.332981109619141, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8557758331298828, "num_tokens": 345534249.0, "step": 9059 }, { "epoch": 1.1525251240300216, "ewc_loss": 0.05324847251176834, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022730894852429628, "grad_norm": 6.311092853546143, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8631075620651245, "num_tokens": 345570832.0, "step": 9060 }, { "epoch": 1.1526523343086121, "ewc_loss": 0.053233616054058075, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022716037346981466, "grad_norm": 6.275750160217285, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8593095541000366, "num_tokens": 345609007.0, "step": 9061 }, { "epoch": 1.1527795445872027, "ewc_loss": 0.05325227230787277, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022734694357495755, "grad_norm": 6.289664268493652, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8740398287773132, "num_tokens": 345647979.0, "step": 9062 }, { "epoch": 1.1529067548657932, "ewc_loss": 0.05328097194433212, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022763392189517617, "grad_norm": 6.304839134216309, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8566490411758423, "num_tokens": 345681821.0, "step": 9063 }, { "epoch": 1.1530339651443837, "ewc_loss": 0.05324611812829971, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022728541807737201, "grad_norm": 6.2745585441589355, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.853699803352356, "num_tokens": 345724274.0, "step": 9064 }, { "epoch": 1.1531611754229742, "ewc_loss": 0.053298160433769226, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022780580911785364, "grad_norm": 6.269928932189941, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8802538514137268, "num_tokens": 345763558.0, "step": 9065 }, { "epoch": 1.1532883857015648, "ewc_loss": 0.053223371505737305, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022705792798660696, "grad_norm": 6.323944568634033, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8588924407958984, "num_tokens": 345798486.0, "step": 9066 }, { "epoch": 1.1534155959801553, "ewc_loss": 0.05352190136909485, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002276018203701824, "grad_norm": 6.2803120613098145, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.860744833946228, "num_tokens": 345839379.0, "step": 9067 }, { "epoch": 1.1535428062587456, "ewc_loss": 0.05350440740585327, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022742686269339174, "grad_norm": 6.323598861694336, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8587821125984192, "num_tokens": 345876675.0, "step": 9068 }, { "epoch": 1.1536700165373361, "ewc_loss": 0.05358650162816048, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002282478380948305, "grad_norm": 6.286752223968506, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.8821806311607361, "num_tokens": 345913482.0, "step": 9069 }, { "epoch": 1.1537972268159267, "ewc_loss": 0.05348694324493408, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002272522688144818, "grad_norm": 6.307476043701172, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.859559178352356, "num_tokens": 345947671.0, "step": 9070 }, { "epoch": 1.1539244370945172, "ewc_loss": 0.0535387359559536, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022777017147745937, "grad_norm": 6.30685567855835, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8668306469917297, "num_tokens": 345981520.0, "step": 9071 }, { "epoch": 1.1540516473731077, "ewc_loss": 0.053546056151390076, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002278433967148885, "grad_norm": 6.271895885467529, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8462589979171753, "num_tokens": 346020050.0, "step": 9072 }, { "epoch": 1.1541788576516983, "ewc_loss": 0.05352408438920975, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022762366279494017, "grad_norm": 6.310424327850342, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8550664186477661, "num_tokens": 346050000.0, "step": 9073 }, { "epoch": 1.1543060679302888, "ewc_loss": 0.05356341972947121, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022801700106356293, "grad_norm": 6.3374857902526855, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8577159643173218, "num_tokens": 346085038.0, "step": 9074 }, { "epoch": 1.1544332782088793, "ewc_loss": 0.05359168350696564, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002282996429130435, "grad_norm": 6.244381904602051, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8737766742706299, "num_tokens": 346125871.0, "step": 9075 }, { "epoch": 1.1545604884874698, "ewc_loss": 0.05365734547376633, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022895623988006264, "grad_norm": 6.269877910614014, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8720120191574097, "num_tokens": 346160460.0, "step": 9076 }, { "epoch": 1.1546876987660604, "ewc_loss": 0.053579024970531464, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002281730412505567, "grad_norm": 6.265262126922607, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8529047966003418, "num_tokens": 346201621.0, "step": 9077 }, { "epoch": 1.154814909044651, "ewc_loss": 0.05366985872387886, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022908140090294182, "grad_norm": 6.33250093460083, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8658404350280762, "num_tokens": 346242680.0, "step": 9078 }, { "epoch": 1.1549421193232412, "ewc_loss": 0.053631141781806946, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022869420354254544, "grad_norm": 9.552406311035156, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8545109033584595, "num_tokens": 346275774.0, "step": 9079 }, { "epoch": 1.1550693296018317, "ewc_loss": 0.05702321603894234, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002626149798743427, "grad_norm": 6.693183898925781, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8653600215911865, "num_tokens": 346317648.0, "step": 9080 }, { "epoch": 1.1551965398804223, "ewc_loss": 0.052943289279937744, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002218157023889944, "grad_norm": 6.1992387771606445, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8666507601737976, "num_tokens": 346357736.0, "step": 9081 }, { "epoch": 1.1553237501590128, "ewc_loss": 0.053947582840919495, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002318586193723604, "grad_norm": 6.389883995056152, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8619444370269775, "num_tokens": 346397961.0, "step": 9082 }, { "epoch": 1.1554509604376033, "ewc_loss": 0.05358792841434479, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023070350289344788, "grad_norm": 6.23879861831665, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8753919005393982, "num_tokens": 346438268.0, "step": 9083 }, { "epoch": 1.1555781707161938, "ewc_loss": 0.053453296422958374, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022935718880034983, "grad_norm": 6.324069976806641, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8544304966926575, "num_tokens": 346479024.0, "step": 9084 }, { "epoch": 1.1557053809947844, "ewc_loss": 0.05339503288269043, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022877454466652125, "grad_norm": 6.218036651611328, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8695521354675293, "num_tokens": 346523526.0, "step": 9085 }, { "epoch": 1.155832591273375, "ewc_loss": 0.053535014390945435, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023017438070382923, "grad_norm": 6.344192028045654, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8723497986793518, "num_tokens": 346561985.0, "step": 9086 }, { "epoch": 1.1559598015519654, "ewc_loss": 0.05340752750635147, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.000228899487410672, "grad_norm": 6.23301362991333, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8507411479949951, "num_tokens": 346605600.0, "step": 9087 }, { "epoch": 1.156087011830556, "ewc_loss": 0.05377614498138428, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023014428734313697, "grad_norm": 6.334661483764648, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.876555323600769, "num_tokens": 346640798.0, "step": 9088 }, { "epoch": 1.1562142221091465, "ewc_loss": 0.053378671407699585, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022861090837977827, "grad_norm": 6.240237236022949, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8591431379318237, "num_tokens": 346678483.0, "step": 9089 }, { "epoch": 1.156341432387737, "ewc_loss": 0.0535244420170784, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023006864648777992, "grad_norm": 6.291479110717773, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8803632259368896, "num_tokens": 346714604.0, "step": 9090 }, { "epoch": 1.1564686426663275, "ewc_loss": 0.05342629551887512, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022908716346137226, "grad_norm": 6.3128790855407715, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8749874830245972, "num_tokens": 346747203.0, "step": 9091 }, { "epoch": 1.156595852944918, "ewc_loss": 0.053473927080631256, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002295634913025424, "grad_norm": 6.262172222137451, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8619112968444824, "num_tokens": 346790650.0, "step": 9092 }, { "epoch": 1.1567230632235084, "ewc_loss": 0.05370746925473213, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002294575097039342, "grad_norm": 6.3089599609375, "learning_rate": 1e-06, "loss": 0.3856, "mean_token_accuracy": 0.8849086165428162, "num_tokens": 346822872.0, "step": 9093 }, { "epoch": 1.156850273502099, "ewc_loss": 0.05371759086847305, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022955870372243226, "grad_norm": 6.304720878601074, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8705040812492371, "num_tokens": 346857999.0, "step": 9094 }, { "epoch": 1.1569774837806894, "ewc_loss": 0.053701627999544144, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022939909831620753, "grad_norm": 6.297463417053223, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8708036541938782, "num_tokens": 346891804.0, "step": 9095 }, { "epoch": 1.15710469405928, "ewc_loss": 0.05341630429029465, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002289872500114143, "grad_norm": 9.591699600219727, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.862180233001709, "num_tokens": 346931554.0, "step": 9096 }, { "epoch": 1.1572319043378705, "ewc_loss": 0.057268135249614716, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00026506418362259865, "grad_norm": 6.710165023803711, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8514916896820068, "num_tokens": 346965214.0, "step": 9097 }, { "epoch": 1.157359114616461, "ewc_loss": 0.053065765649080276, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002230404643341899, "grad_norm": 6.218974590301514, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8753084540367126, "num_tokens": 347001219.0, "step": 9098 }, { "epoch": 1.1574863248950515, "ewc_loss": 0.05391526222229004, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023397682525683194, "grad_norm": 6.445924758911133, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8468193411827087, "num_tokens": 347043212.0, "step": 9099 }, { "epoch": 1.157613535173642, "ewc_loss": 0.053657226264476776, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023139647964853793, "grad_norm": 6.270057678222656, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8761438131332397, "num_tokens": 347082249.0, "step": 9100 }, { "epoch": 1.1577407454522326, "ewc_loss": 0.0536908358335495, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023173259978648275, "grad_norm": 6.426084518432617, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8605262041091919, "num_tokens": 347119286.0, "step": 9101 }, { "epoch": 1.1578679557308231, "ewc_loss": 0.0535617396235466, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023044159752316773, "grad_norm": 6.259454727172852, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8769437074661255, "num_tokens": 347155549.0, "step": 9102 }, { "epoch": 1.1579951660094137, "ewc_loss": 0.05363047868013382, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023112900089472532, "grad_norm": 6.356220722198486, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8667793273925781, "num_tokens": 347196846.0, "step": 9103 }, { "epoch": 1.158122376288004, "ewc_loss": 0.053541429340839386, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023023852554615587, "grad_norm": 6.2564921379089355, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8776510953903198, "num_tokens": 347234028.0, "step": 9104 }, { "epoch": 1.1582495865665945, "ewc_loss": 0.05383822321891785, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002307650720467791, "grad_norm": 6.31207275390625, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8752870559692383, "num_tokens": 347276931.0, "step": 9105 }, { "epoch": 1.158376796845185, "ewc_loss": 0.05352242290973663, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023004844842944294, "grad_norm": 6.300915718078613, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.860910177230835, "num_tokens": 347317139.0, "step": 9106 }, { "epoch": 1.1585040071237755, "ewc_loss": 0.053509682416915894, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022992101730778813, "grad_norm": 6.327758312225342, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8651081323623657, "num_tokens": 347355718.0, "step": 9107 }, { "epoch": 1.158631217402366, "ewc_loss": 0.0534333661198616, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022915785666555166, "grad_norm": 6.3137102127075195, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8536189794540405, "num_tokens": 347393403.0, "step": 9108 }, { "epoch": 1.1587584276809566, "ewc_loss": 0.05342914164066315, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002291156124556437, "grad_norm": 6.278478622436523, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8706032037734985, "num_tokens": 347433736.0, "step": 9109 }, { "epoch": 1.1588856379595471, "ewc_loss": 0.05348573997616768, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022968162375036627, "grad_norm": 6.313892841339111, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8658174276351929, "num_tokens": 347474574.0, "step": 9110 }, { "epoch": 1.1590128482381377, "ewc_loss": 0.05345086380839348, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.000229332857998088, "grad_norm": 6.274484634399414, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8707135915756226, "num_tokens": 347518785.0, "step": 9111 }, { "epoch": 1.1591400585167282, "ewc_loss": 0.053493794053792953, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022976215404924005, "grad_norm": 6.3379034996032715, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8528998494148254, "num_tokens": 347557518.0, "step": 9112 }, { "epoch": 1.1592672687953187, "ewc_loss": 0.05348033830523491, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022962760704103857, "grad_norm": 6.351004600524902, "learning_rate": 1e-06, "loss": 0.5414, "mean_token_accuracy": 0.8353116512298584, "num_tokens": 347600094.0, "step": 9113 }, { "epoch": 1.1593944790739092, "ewc_loss": 0.05350475758314133, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002298718027304858, "grad_norm": 6.357322692871094, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8641118407249451, "num_tokens": 347633709.0, "step": 9114 }, { "epoch": 1.1595216893524998, "ewc_loss": 0.05347577854990959, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00022958200133871287, "grad_norm": 6.286631107330322, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.871894359588623, "num_tokens": 347670027.0, "step": 9115 }, { "epoch": 1.1596488996310903, "ewc_loss": 0.053519830107688904, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023002250236459076, "grad_norm": 6.3102545738220215, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8685098886489868, "num_tokens": 347707488.0, "step": 9116 }, { "epoch": 1.1597761099096806, "ewc_loss": 0.05379915237426758, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023037430946715176, "grad_norm": 6.287563323974609, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8592584133148193, "num_tokens": 347746030.0, "step": 9117 }, { "epoch": 1.1599033201882711, "ewc_loss": 0.053592801094055176, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023075223725754768, "grad_norm": 6.354428291320801, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8570467829704285, "num_tokens": 347783409.0, "step": 9118 }, { "epoch": 1.1600305304668617, "ewc_loss": 0.05353043973445892, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023012861493043602, "grad_norm": 6.29442834854126, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8715711832046509, "num_tokens": 347821335.0, "step": 9119 }, { "epoch": 1.1601577407454522, "ewc_loss": 0.053528472781181335, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023010894074104726, "grad_norm": 6.301596641540527, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8752585649490356, "num_tokens": 347857590.0, "step": 9120 }, { "epoch": 1.1602849510240427, "ewc_loss": 0.05359065532684326, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023073078773450106, "grad_norm": 6.317224025726318, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8767086267471313, "num_tokens": 347897000.0, "step": 9121 }, { "epoch": 1.1604121613026332, "ewc_loss": 0.05359247326850891, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023074894852470607, "grad_norm": 6.368765830993652, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.864800214767456, "num_tokens": 347928947.0, "step": 9122 }, { "epoch": 1.1605393715812238, "ewc_loss": 0.05353193357586861, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023014354519546032, "grad_norm": 6.3658881187438965, "learning_rate": 1e-06, "loss": 0.5669, "mean_token_accuracy": 0.8340156078338623, "num_tokens": 347959294.0, "step": 9123 }, { "epoch": 1.1606665818598143, "ewc_loss": 0.053826622664928436, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023064904962666333, "grad_norm": 6.3091583251953125, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8703721165657043, "num_tokens": 347993103.0, "step": 9124 }, { "epoch": 1.1607937921384048, "ewc_loss": 0.05374795198440552, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002298623585375026, "grad_norm": 6.286242485046387, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8766601085662842, "num_tokens": 348028412.0, "step": 9125 }, { "epoch": 1.1609210024169954, "ewc_loss": 0.05381212383508682, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002305040688952431, "grad_norm": 6.274945259094238, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8473771810531616, "num_tokens": 348071429.0, "step": 9126 }, { "epoch": 1.161048212695586, "ewc_loss": 0.053569331765174866, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023051754396874458, "grad_norm": 6.331505298614502, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8685734272003174, "num_tokens": 348113222.0, "step": 9127 }, { "epoch": 1.1611754229741762, "ewc_loss": 0.05409606546163559, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023090203467290848, "grad_norm": 6.33610200881958, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8632161617279053, "num_tokens": 348150188.0, "step": 9128 }, { "epoch": 1.1613026332527667, "ewc_loss": 0.0540485680103302, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023042710381560028, "grad_norm": 6.261775970458984, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8680189847946167, "num_tokens": 348188547.0, "step": 9129 }, { "epoch": 1.1614298435313573, "ewc_loss": 0.05412319302558899, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023117332602851093, "grad_norm": 6.377699851989746, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8587802648544312, "num_tokens": 348228624.0, "step": 9130 }, { "epoch": 1.1615570538099478, "ewc_loss": 0.05404401570558548, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023038157087285072, "grad_norm": 6.3052191734313965, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8609147071838379, "num_tokens": 348264932.0, "step": 9131 }, { "epoch": 1.1616842640885383, "ewc_loss": 0.05405852943658829, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023052671167533845, "grad_norm": 6.262684345245361, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8662705421447754, "num_tokens": 348307680.0, "step": 9132 }, { "epoch": 1.1618114743671288, "ewc_loss": 0.05408968776464462, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023083826818037778, "grad_norm": 6.363125801086426, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8788461685180664, "num_tokens": 348346449.0, "step": 9133 }, { "epoch": 1.1619386846457194, "ewc_loss": 0.05406155064702034, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002305569068994373, "grad_norm": 6.286717891693115, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8748385906219482, "num_tokens": 348386393.0, "step": 9134 }, { "epoch": 1.16206589492431, "ewc_loss": 0.05379820242524147, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002303648361703381, "grad_norm": 6.310897350311279, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8582838773727417, "num_tokens": 348429458.0, "step": 9135 }, { "epoch": 1.1621931052029004, "ewc_loss": 0.05370398238301277, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002294226287631318, "grad_norm": 6.295810699462891, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8521555066108704, "num_tokens": 348471365.0, "step": 9136 }, { "epoch": 1.162320315481491, "ewc_loss": 0.05383818596601486, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023076465004123747, "grad_norm": 6.327474117279053, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8487573862075806, "num_tokens": 348513425.0, "step": 9137 }, { "epoch": 1.1624475257600815, "ewc_loss": 0.05372358858585358, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022961868671700358, "grad_norm": 6.253298759460449, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8614457249641418, "num_tokens": 348551181.0, "step": 9138 }, { "epoch": 1.162574736038672, "ewc_loss": 0.05383648723363876, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023074769705999643, "grad_norm": 6.356230735778809, "learning_rate": 1e-06, "loss": 0.3757, "mean_token_accuracy": 0.8852952718734741, "num_tokens": 348583611.0, "step": 9139 }, { "epoch": 1.1627019463172625, "ewc_loss": 0.05373886227607727, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022977142361924052, "grad_norm": 6.259373664855957, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.863158106803894, "num_tokens": 348624281.0, "step": 9140 }, { "epoch": 1.162829156595853, "ewc_loss": 0.05358583480119705, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.0002306825917912647, "grad_norm": 6.285739421844482, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8627916574478149, "num_tokens": 348661416.0, "step": 9141 }, { "epoch": 1.1629563668744434, "ewc_loss": 0.05354071408510208, "ewc_loss_diag": 3.0517578125e-05, "ewc_loss_parallel": 0.00023023133690003306, "grad_norm": 6.360677719116211, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.842612087726593, "num_tokens": 348703907.0, "step": 9142 }, { "epoch": 1.163083577153034, "ewc_loss": 0.053776830434799194, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023015114129520953, "grad_norm": 6.328357219696045, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8754192590713501, "num_tokens": 348735617.0, "step": 9143 }, { "epoch": 1.1632107874316244, "ewc_loss": 0.05381246656179428, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023050745949149132, "grad_norm": 6.31593132019043, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8632000684738159, "num_tokens": 348773552.0, "step": 9144 }, { "epoch": 1.163337997710215, "ewc_loss": 0.05381385236978531, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023052134201861918, "grad_norm": 6.323697566986084, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8652439117431641, "num_tokens": 348811274.0, "step": 9145 }, { "epoch": 1.1634652079888055, "ewc_loss": 0.05383121222257614, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002306949463672936, "grad_norm": 6.36191987991333, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8523674607276917, "num_tokens": 348847773.0, "step": 9146 }, { "epoch": 1.163592418267396, "ewc_loss": 0.05380285531282425, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023041135864332318, "grad_norm": 6.295962810516357, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8557500839233398, "num_tokens": 348886356.0, "step": 9147 }, { "epoch": 1.1637196285459865, "ewc_loss": 0.05387238413095474, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023110666370484978, "grad_norm": 6.370361328125, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8754438161849976, "num_tokens": 348922892.0, "step": 9148 }, { "epoch": 1.163846838824577, "ewc_loss": 0.053825803101062775, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023064084234647453, "grad_norm": 6.35586404800415, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8832736015319824, "num_tokens": 348954924.0, "step": 9149 }, { "epoch": 1.1639740491031676, "ewc_loss": 0.05382295697927475, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023061237880028784, "grad_norm": 6.342322826385498, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8700400590896606, "num_tokens": 348997335.0, "step": 9150 }, { "epoch": 1.1641012593817581, "ewc_loss": 0.053768664598464966, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002300694613950327, "grad_norm": 6.351848602294922, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8528707027435303, "num_tokens": 349038221.0, "step": 9151 }, { "epoch": 1.1642284696603487, "ewc_loss": 0.053696125745773315, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022934407752472907, "grad_norm": 6.296852111816406, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8593224287033081, "num_tokens": 349076559.0, "step": 9152 }, { "epoch": 1.164355679938939, "ewc_loss": 0.05382885783910751, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023067141592036933, "grad_norm": 6.351645469665527, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8426692485809326, "num_tokens": 349112264.0, "step": 9153 }, { "epoch": 1.1644828902175295, "ewc_loss": 0.05373826622962952, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022976548643782735, "grad_norm": 6.290519714355469, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8763703107833862, "num_tokens": 349146571.0, "step": 9154 }, { "epoch": 1.16461010049612, "ewc_loss": 0.053834348917007446, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002307262911926955, "grad_norm": 6.3034820556640625, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8713175654411316, "num_tokens": 349183674.0, "step": 9155 }, { "epoch": 1.1647373107747105, "ewc_loss": 0.05386585742235184, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023104138381313533, "grad_norm": 6.274951457977295, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8690458536148071, "num_tokens": 349227042.0, "step": 9156 }, { "epoch": 1.164864521053301, "ewc_loss": 0.05388969928026199, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002312798023922369, "grad_norm": 6.322402477264404, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8597978353500366, "num_tokens": 349264072.0, "step": 9157 }, { "epoch": 1.1649917313318916, "ewc_loss": 0.05390745773911476, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023145739396568388, "grad_norm": 6.285022735595703, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8655095100402832, "num_tokens": 349304427.0, "step": 9158 }, { "epoch": 1.1651189416104821, "ewc_loss": 0.05385925620794296, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023097537632565945, "grad_norm": 6.3549065589904785, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8638526797294617, "num_tokens": 349346708.0, "step": 9159 }, { "epoch": 1.1652461518890727, "ewc_loss": 0.05396391451358795, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023202193551696837, "grad_norm": 6.310131072998047, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8625236749649048, "num_tokens": 349387716.0, "step": 9160 }, { "epoch": 1.1653733621676632, "ewc_loss": 0.05387365072965622, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023111932387109846, "grad_norm": 6.289876937866211, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8608126044273376, "num_tokens": 349425757.0, "step": 9161 }, { "epoch": 1.1655005724462537, "ewc_loss": 0.05389714613556862, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023135427909437567, "grad_norm": 6.281780242919922, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8719451427459717, "num_tokens": 349472097.0, "step": 9162 }, { "epoch": 1.1656277827248442, "ewc_loss": 0.0538976714015007, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023135950323194265, "grad_norm": 6.266932964324951, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8608092069625854, "num_tokens": 349513809.0, "step": 9163 }, { "epoch": 1.1657549930034348, "ewc_loss": 0.05396074801683426, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023199028510134667, "grad_norm": 6.349193096160889, "learning_rate": 1e-06, "loss": 0.3993, "mean_token_accuracy": 0.8788473606109619, "num_tokens": 349544906.0, "step": 9164 }, { "epoch": 1.1658822032820253, "ewc_loss": 0.0539248026907444, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023163083824329078, "grad_norm": 6.285809516906738, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8592036962509155, "num_tokens": 349582529.0, "step": 9165 }, { "epoch": 1.1660094135606156, "ewc_loss": 0.05395262688398361, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023190907086245716, "grad_norm": 6.341208457946777, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8600286841392517, "num_tokens": 349624871.0, "step": 9166 }, { "epoch": 1.1661366238392061, "ewc_loss": 0.053915783762931824, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023154063092079014, "grad_norm": 6.363794803619385, "learning_rate": 1e-06, "loss": 0.5553, "mean_token_accuracy": 0.8290349245071411, "num_tokens": 349660783.0, "step": 9167 }, { "epoch": 1.1662638341177967, "ewc_loss": 0.053875237703323364, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002311352000106126, "grad_norm": 6.294524192810059, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.87410968542099, "num_tokens": 349698430.0, "step": 9168 }, { "epoch": 1.1663910443963872, "ewc_loss": 0.05387865751981735, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023116938245948404, "grad_norm": 6.316146373748779, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8558920621871948, "num_tokens": 349734981.0, "step": 9169 }, { "epoch": 1.1665182546749777, "ewc_loss": 0.0541156530380249, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023109796165954322, "grad_norm": 6.361861705780029, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8583743572235107, "num_tokens": 349774267.0, "step": 9170 }, { "epoch": 1.1666454649535682, "ewc_loss": 0.05392995849251747, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023168239567894489, "grad_norm": 6.284441947937012, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8628726601600647, "num_tokens": 349811400.0, "step": 9171 }, { "epoch": 1.1667726752321588, "ewc_loss": 0.05387677997350693, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023115061048883945, "grad_norm": 6.333269119262695, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8556610941886902, "num_tokens": 349850031.0, "step": 9172 }, { "epoch": 1.1668998855107493, "ewc_loss": 0.05417587235569954, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023170013446360826, "grad_norm": 6.273558139801025, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8664562702178955, "num_tokens": 349889967.0, "step": 9173 }, { "epoch": 1.1670270957893398, "ewc_loss": 0.05421007424592972, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002320421626791358, "grad_norm": 6.314897537231445, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8581832051277161, "num_tokens": 349930276.0, "step": 9174 }, { "epoch": 1.1671543060679304, "ewc_loss": 0.054114826023578644, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023108966706786305, "grad_norm": 6.3974761962890625, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8595672845840454, "num_tokens": 349969889.0, "step": 9175 }, { "epoch": 1.1672815163465209, "ewc_loss": 0.053908318281173706, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023146599414758384, "grad_norm": 6.278649806976318, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8621492385864258, "num_tokens": 350005729.0, "step": 9176 }, { "epoch": 1.1674087266251112, "ewc_loss": 0.05388949438929558, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002312777505721897, "grad_norm": 6.320255756378174, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.870627224445343, "num_tokens": 350043130.0, "step": 9177 }, { "epoch": 1.1675359369037017, "ewc_loss": 0.05390387773513794, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023142161080613732, "grad_norm": 6.309625625610352, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8825187683105469, "num_tokens": 350081233.0, "step": 9178 }, { "epoch": 1.1676631471822922, "ewc_loss": 0.054119132459163666, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023113271163310856, "grad_norm": 6.283319473266602, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8502697348594666, "num_tokens": 350119403.0, "step": 9179 }, { "epoch": 1.1677903574608828, "ewc_loss": 0.05390183627605438, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023140115081332624, "grad_norm": 6.350574970245361, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8522727489471436, "num_tokens": 350159099.0, "step": 9180 }, { "epoch": 1.1679175677394733, "ewc_loss": 0.05415760353207588, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023151744971983135, "grad_norm": 6.301598072052002, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8636462688446045, "num_tokens": 350195642.0, "step": 9181 }, { "epoch": 1.1680447780180638, "ewc_loss": 0.05418813228607178, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023182271979749203, "grad_norm": 6.304582118988037, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8712866902351379, "num_tokens": 350236721.0, "step": 9182 }, { "epoch": 1.1681719882966544, "ewc_loss": 0.053937289863824844, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002317557082278654, "grad_norm": 6.34613037109375, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8583279848098755, "num_tokens": 350272691.0, "step": 9183 }, { "epoch": 1.168299198575245, "ewc_loss": 0.05422888323664665, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023223024618346244, "grad_norm": 6.277534484863281, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8617464303970337, "num_tokens": 350318299.0, "step": 9184 }, { "epoch": 1.1684264088538354, "ewc_loss": 0.054231684654951096, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023225825862027705, "grad_norm": 6.292957782745361, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8627234697341919, "num_tokens": 350360973.0, "step": 9185 }, { "epoch": 1.168553619132426, "ewc_loss": 0.05417871102690697, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023172851069830358, "grad_norm": 6.346634864807129, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8709940314292908, "num_tokens": 350400516.0, "step": 9186 }, { "epoch": 1.1686808294110165, "ewc_loss": 0.05394970253109932, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023187983606476337, "grad_norm": 6.340395927429199, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8593401908874512, "num_tokens": 350438147.0, "step": 9187 }, { "epoch": 1.168808039689607, "ewc_loss": 0.053905241191387177, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002314352459507063, "grad_norm": 6.283990383148193, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8686909675598145, "num_tokens": 350481604.0, "step": 9188 }, { "epoch": 1.1689352499681975, "ewc_loss": 0.05400996655225754, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002324824599782005, "grad_norm": 6.396714210510254, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8838581442832947, "num_tokens": 350518225.0, "step": 9189 }, { "epoch": 1.169062460246788, "ewc_loss": 0.05406802147626877, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023062163381837308, "grad_norm": 6.28000020980835, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8630653619766235, "num_tokens": 350553752.0, "step": 9190 }, { "epoch": 1.1691896705253784, "ewc_loss": 0.05400465428829193, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023242936003953218, "grad_norm": 6.31837797164917, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8694049715995789, "num_tokens": 350593676.0, "step": 9191 }, { "epoch": 1.169316880803969, "ewc_loss": 0.05390234291553497, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002314062148798257, "grad_norm": 6.297244071960449, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8625080585479736, "num_tokens": 350633618.0, "step": 9192 }, { "epoch": 1.1694440910825594, "ewc_loss": 0.053974367678165436, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023212650557979941, "grad_norm": 6.258612632751465, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8627029061317444, "num_tokens": 350678221.0, "step": 9193 }, { "epoch": 1.16957130136115, "ewc_loss": 0.05421849340200424, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023212636006064713, "grad_norm": 6.341463088989258, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8742074370384216, "num_tokens": 350718258.0, "step": 9194 }, { "epoch": 1.1696985116397405, "ewc_loss": 0.053973328322172165, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002321160864084959, "grad_norm": 6.285200595855713, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8736492991447449, "num_tokens": 350757370.0, "step": 9195 }, { "epoch": 1.169825721918331, "ewc_loss": 0.053949594497680664, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023187875922303647, "grad_norm": 6.296574592590332, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8640905022621155, "num_tokens": 350801394.0, "step": 9196 }, { "epoch": 1.1699529321969215, "ewc_loss": 0.05400921404361725, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023247496574185789, "grad_norm": 6.348483085632324, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8559819459915161, "num_tokens": 350840635.0, "step": 9197 }, { "epoch": 1.170080142475512, "ewc_loss": 0.05393945053219795, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023177731782197952, "grad_norm": 6.27277135848999, "learning_rate": 1e-06, "loss": 0.4005, "mean_token_accuracy": 0.8789184093475342, "num_tokens": 350879616.0, "step": 9198 }, { "epoch": 1.1702073527541026, "ewc_loss": 0.0540173277258873, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023255609266925603, "grad_norm": 6.3758544921875, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.86412113904953, "num_tokens": 350921710.0, "step": 9199 }, { "epoch": 1.1703345630326931, "ewc_loss": 0.05396953597664833, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023207816411741078, "grad_norm": 6.309381484985352, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8515810370445251, "num_tokens": 350964218.0, "step": 9200 }, { "epoch": 1.1704617733112836, "ewc_loss": 0.05396084487438202, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002319912746315822, "grad_norm": 6.324073791503906, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8733807802200317, "num_tokens": 351004511.0, "step": 9201 }, { "epoch": 1.170588983589874, "ewc_loss": 0.05403169244527817, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002326997637283057, "grad_norm": 6.3999342918396, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8590497970581055, "num_tokens": 351044583.0, "step": 9202 }, { "epoch": 1.1707161938684645, "ewc_loss": 0.05386122688651085, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023099507961887866, "grad_norm": 6.295316219329834, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8480701446533203, "num_tokens": 351083613.0, "step": 9203 }, { "epoch": 1.170843404147055, "ewc_loss": 0.05400786176323891, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002324614324606955, "grad_norm": 6.3803815841674805, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8574866056442261, "num_tokens": 351117412.0, "step": 9204 }, { "epoch": 1.1709706144256455, "ewc_loss": 0.05389752984046936, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002313580916961655, "grad_norm": 9.61001968383789, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8482013940811157, "num_tokens": 351154779.0, "step": 9205 }, { "epoch": 1.171097824704236, "ewc_loss": 0.057262152433395386, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002650043461471796, "grad_norm": 6.603700637817383, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8754147887229919, "num_tokens": 351191412.0, "step": 9206 }, { "epoch": 1.1712250349828266, "ewc_loss": 0.05362381786108017, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00022862099285703152, "grad_norm": 6.330208778381348, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8671877384185791, "num_tokens": 351235465.0, "step": 9207 }, { "epoch": 1.1713522452614171, "ewc_loss": 0.05437855422496796, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023616835824213922, "grad_norm": 6.434047222137451, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8696900606155396, "num_tokens": 351279326.0, "step": 9208 }, { "epoch": 1.1714794555400077, "ewc_loss": 0.05425563082098961, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002349391143070534, "grad_norm": 6.350489616394043, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8816328644752502, "num_tokens": 351316678.0, "step": 9209 }, { "epoch": 1.1716066658185982, "ewc_loss": 0.05409630015492439, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023334581055678427, "grad_norm": 6.4199628829956055, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8603576421737671, "num_tokens": 351352641.0, "step": 9210 }, { "epoch": 1.1717338760971887, "ewc_loss": 0.05405039340257645, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023288677039090544, "grad_norm": 6.414278507232666, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8556398153305054, "num_tokens": 351385845.0, "step": 9211 }, { "epoch": 1.1718610863757792, "ewc_loss": 0.05408690124750137, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023325181973632425, "grad_norm": 6.338706016540527, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8713805079460144, "num_tokens": 351425220.0, "step": 9212 }, { "epoch": 1.1719882966543698, "ewc_loss": 0.0540771558880806, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002331543801119551, "grad_norm": 6.3339314460754395, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8593328595161438, "num_tokens": 351467515.0, "step": 9213 }, { "epoch": 1.1721155069329603, "ewc_loss": 0.054025523364543915, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023263801995199174, "grad_norm": 6.358547210693359, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8619083166122437, "num_tokens": 351507571.0, "step": 9214 }, { "epoch": 1.1722427172115506, "ewc_loss": 0.05408812314271927, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023326404334511608, "grad_norm": 6.345907688140869, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8613793253898621, "num_tokens": 351546150.0, "step": 9215 }, { "epoch": 1.1723699274901411, "ewc_loss": 0.05397028475999832, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023208567290566862, "grad_norm": 6.331868648529053, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8571169972419739, "num_tokens": 351588574.0, "step": 9216 }, { "epoch": 1.1724971377687317, "ewc_loss": 0.0540471076965332, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023285389761440456, "grad_norm": 6.320309638977051, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8646538853645325, "num_tokens": 351626914.0, "step": 9217 }, { "epoch": 1.1726243480473222, "ewc_loss": 0.05398266017436981, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002322093932889402, "grad_norm": 6.349820137023926, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8535475730895996, "num_tokens": 351663574.0, "step": 9218 }, { "epoch": 1.1727515583259127, "ewc_loss": 0.05402865633368492, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023266937932930887, "grad_norm": 6.339898586273193, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8473742008209229, "num_tokens": 351703275.0, "step": 9219 }, { "epoch": 1.1728787686045032, "ewc_loss": 0.05407080054283142, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023309083189815283, "grad_norm": 6.321598529815674, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8543939590454102, "num_tokens": 351745713.0, "step": 9220 }, { "epoch": 1.1730059788830938, "ewc_loss": 0.054006852209568024, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023245134798344225, "grad_norm": 6.334049224853516, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8498060703277588, "num_tokens": 351780789.0, "step": 9221 }, { "epoch": 1.1731331891616843, "ewc_loss": 0.05400621145963669, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023244490148499608, "grad_norm": 6.270523548126221, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8734064102172852, "num_tokens": 351820352.0, "step": 9222 }, { "epoch": 1.1732603994402748, "ewc_loss": 0.05410407483577728, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023342356143984944, "grad_norm": 6.348207473754883, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8712917566299438, "num_tokens": 351860011.0, "step": 9223 }, { "epoch": 1.1733876097188654, "ewc_loss": 0.05406945198774338, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002330773277208209, "grad_norm": 6.3390092849731445, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8564757704734802, "num_tokens": 351898742.0, "step": 9224 }, { "epoch": 1.1735148199974559, "ewc_loss": 0.05407828092575073, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023316562874242663, "grad_norm": 6.296588897705078, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8511934280395508, "num_tokens": 351940201.0, "step": 9225 }, { "epoch": 1.1736420302760462, "ewc_loss": 0.054335035383701324, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002332917501917109, "grad_norm": 6.386130332946777, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8609204292297363, "num_tokens": 351971041.0, "step": 9226 }, { "epoch": 1.1737692405546367, "ewc_loss": 0.05406973510980606, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023308017989620566, "grad_norm": 6.338919639587402, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8456918001174927, "num_tokens": 352013002.0, "step": 9227 }, { "epoch": 1.1738964508332272, "ewc_loss": 0.0543040968477726, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002329823764739558, "grad_norm": 6.290796279907227, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8618990778923035, "num_tokens": 352051435.0, "step": 9228 }, { "epoch": 1.1740236611118178, "ewc_loss": 0.05428195372223854, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023276093997992575, "grad_norm": 6.349551200866699, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.869548499584198, "num_tokens": 352093295.0, "step": 9229 }, { "epoch": 1.1741508713904083, "ewc_loss": 0.054335251450538635, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023329391842707992, "grad_norm": 6.332478046417236, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8584967851638794, "num_tokens": 352136401.0, "step": 9230 }, { "epoch": 1.1742780816689988, "ewc_loss": 0.05426790565252304, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002326204557903111, "grad_norm": 6.317252159118652, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8742154836654663, "num_tokens": 352178696.0, "step": 9231 }, { "epoch": 1.1744052919475894, "ewc_loss": 0.054266899824142456, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023261038586497307, "grad_norm": 6.354048728942871, "learning_rate": 1e-06, "loss": 0.5263, "mean_token_accuracy": 0.8448274731636047, "num_tokens": 352214759.0, "step": 9232 }, { "epoch": 1.1745325022261799, "ewc_loss": 0.054263390600681305, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023257528664544225, "grad_norm": 6.339439392089844, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8760998249053955, "num_tokens": 352250520.0, "step": 9233 }, { "epoch": 1.1746597125047704, "ewc_loss": 0.05405988544225693, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023298166343010962, "grad_norm": 6.347465515136719, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8612291812896729, "num_tokens": 352288773.0, "step": 9234 }, { "epoch": 1.174786922783361, "ewc_loss": 0.05398843437433243, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023226714984048158, "grad_norm": 6.267544746398926, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8713325262069702, "num_tokens": 352326548.0, "step": 9235 }, { "epoch": 1.1749141330619515, "ewc_loss": 0.05433991551399231, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023334055731538683, "grad_norm": 6.3066086769104, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8730095624923706, "num_tokens": 352366537.0, "step": 9236 }, { "epoch": 1.175041343340542, "ewc_loss": 0.05424284189939499, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023236979905050248, "grad_norm": 6.358071804046631, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8470650911331177, "num_tokens": 352402390.0, "step": 9237 }, { "epoch": 1.1751685536191325, "ewc_loss": 0.054323598742485046, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002331774157937616, "grad_norm": 6.32708215713501, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8608410358428955, "num_tokens": 352442357.0, "step": 9238 }, { "epoch": 1.175295763897723, "ewc_loss": 0.05429434776306152, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023288486409001052, "grad_norm": 6.327502250671387, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.86159747838974, "num_tokens": 352476884.0, "step": 9239 }, { "epoch": 1.1754229741763134, "ewc_loss": 0.05436200648546219, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023356145538855344, "grad_norm": 6.316774368286133, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8643507957458496, "num_tokens": 352513448.0, "step": 9240 }, { "epoch": 1.175550184454904, "ewc_loss": 0.05433307960629463, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002332722069695592, "grad_norm": 6.305703639984131, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.883974552154541, "num_tokens": 352548420.0, "step": 9241 }, { "epoch": 1.1756773947334944, "ewc_loss": 0.054374516010284424, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023368658730760217, "grad_norm": 6.260311126708984, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8663414120674133, "num_tokens": 352592157.0, "step": 9242 }, { "epoch": 1.175804605012085, "ewc_loss": 0.05440214276313782, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023396284086629748, "grad_norm": 6.348437309265137, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8551496267318726, "num_tokens": 352626427.0, "step": 9243 }, { "epoch": 1.1759318152906755, "ewc_loss": 0.05436256527900696, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023356704332400113, "grad_norm": 6.292734146118164, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8736198544502258, "num_tokens": 352667075.0, "step": 9244 }, { "epoch": 1.176059025569266, "ewc_loss": 0.05448191612958908, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023476059141103178, "grad_norm": 6.385722637176514, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8611621856689453, "num_tokens": 352707114.0, "step": 9245 }, { "epoch": 1.1761862358478565, "ewc_loss": 0.05428525060415268, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023279391461983323, "grad_norm": 6.325185775756836, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8691157698631287, "num_tokens": 352738064.0, "step": 9246 }, { "epoch": 1.176313446126447, "ewc_loss": 0.0543876588344574, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023381799110211432, "grad_norm": 6.335139274597168, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.848244309425354, "num_tokens": 352779964.0, "step": 9247 }, { "epoch": 1.1764406564050376, "ewc_loss": 0.05433828756213188, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002333242737222463, "grad_norm": 6.296440601348877, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8718693256378174, "num_tokens": 352819574.0, "step": 9248 }, { "epoch": 1.1765678666836281, "ewc_loss": 0.054368384182453156, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002336252509849146, "grad_norm": 6.354188442230225, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8733524084091187, "num_tokens": 352859151.0, "step": 9249 }, { "epoch": 1.1766950769622184, "ewc_loss": 0.05426905304193497, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002326319518033415, "grad_norm": 6.3082427978515625, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8414173126220703, "num_tokens": 352901119.0, "step": 9250 }, { "epoch": 1.176822287240809, "ewc_loss": 0.05415588617324829, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023394165327772498, "grad_norm": 6.359058380126953, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8625832796096802, "num_tokens": 352945878.0, "step": 9251 }, { "epoch": 1.1769494975193995, "ewc_loss": 0.054268546402454376, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023262685863301158, "grad_norm": 6.775632381439209, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8579020500183105, "num_tokens": 352979819.0, "step": 9252 }, { "epoch": 1.17707670779799, "ewc_loss": 0.054186709225177765, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002318085025763139, "grad_norm": 6.2320637702941895, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8532615303993225, "num_tokens": 353023552.0, "step": 9253 }, { "epoch": 1.1772039180765805, "ewc_loss": 0.05427201837301254, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023266156495083123, "grad_norm": 6.400906085968018, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8643686175346375, "num_tokens": 353058178.0, "step": 9254 }, { "epoch": 1.177331128355171, "ewc_loss": 0.0541449710726738, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023139113909564912, "grad_norm": 6.2642502784729, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8728190660476685, "num_tokens": 353091428.0, "step": 9255 }, { "epoch": 1.1774583386337616, "ewc_loss": 0.054373204708099365, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023367346148006618, "grad_norm": 6.381557464599609, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8673146963119507, "num_tokens": 353131943.0, "step": 9256 }, { "epoch": 1.1775855489123521, "ewc_loss": 0.05398351699113846, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023221799347084016, "grad_norm": 6.30476713180542, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8553265333175659, "num_tokens": 353171213.0, "step": 9257 }, { "epoch": 1.1777127591909426, "ewc_loss": 0.054621271789073944, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023371272254735231, "grad_norm": 6.617276191711426, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8712790608406067, "num_tokens": 353205057.0, "step": 9258 }, { "epoch": 1.1778399694695332, "ewc_loss": 0.0539725162088871, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002321079809917137, "grad_norm": 6.231812000274658, "learning_rate": 1e-06, "loss": 0.5185, "mean_token_accuracy": 0.8498287796974182, "num_tokens": 353251164.0, "step": 9259 }, { "epoch": 1.1779671797481237, "ewc_loss": 0.05412915349006653, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002336743491468951, "grad_norm": 6.395938873291016, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8682878017425537, "num_tokens": 353288202.0, "step": 9260 }, { "epoch": 1.1780943900267142, "ewc_loss": 0.054042600095272064, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023280881578102708, "grad_norm": 6.389698028564453, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.862234354019165, "num_tokens": 353327396.0, "step": 9261 }, { "epoch": 1.1782216003053048, "ewc_loss": 0.05406178906559944, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023300069733522832, "grad_norm": 6.335652828216553, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8509016036987305, "num_tokens": 353366512.0, "step": 9262 }, { "epoch": 1.1783488105838953, "ewc_loss": 0.054563008248806, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023313007841352373, "grad_norm": 6.34193754196167, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8547053337097168, "num_tokens": 353408546.0, "step": 9263 }, { "epoch": 1.1784760208624856, "ewc_loss": 0.05457095801830292, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023320956097450107, "grad_norm": 6.362059593200684, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8709350824356079, "num_tokens": 353444950.0, "step": 9264 }, { "epoch": 1.1786032311410761, "ewc_loss": 0.0541081577539444, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023346439411398023, "grad_norm": 6.33342170715332, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8709321022033691, "num_tokens": 353483306.0, "step": 9265 }, { "epoch": 1.1787304414196667, "ewc_loss": 0.054097890853881836, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023336170124821365, "grad_norm": 6.329327583312988, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8622069358825684, "num_tokens": 353523038.0, "step": 9266 }, { "epoch": 1.1788576516982572, "ewc_loss": 0.05411609262228012, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002335437311558053, "grad_norm": 6.299605369567871, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8741284608840942, "num_tokens": 353564266.0, "step": 9267 }, { "epoch": 1.1789848619768477, "ewc_loss": 0.05407281965017319, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023311100085265934, "grad_norm": 6.284104824066162, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8661676049232483, "num_tokens": 353603558.0, "step": 9268 }, { "epoch": 1.1791120722554382, "ewc_loss": 0.05413226783275604, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.0002337054756935686, "grad_norm": 6.357113361358643, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8706110715866089, "num_tokens": 353648097.0, "step": 9269 }, { "epoch": 1.1792392825340288, "ewc_loss": 0.05409851670265198, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023336797312367707, "grad_norm": 6.324666500091553, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8489208221435547, "num_tokens": 353688970.0, "step": 9270 }, { "epoch": 1.1793664928126193, "ewc_loss": 0.05414725840091705, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023385541862808168, "grad_norm": 6.304689407348633, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.852764368057251, "num_tokens": 353732505.0, "step": 9271 }, { "epoch": 1.1794937030912098, "ewc_loss": 0.054131925106048584, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023370205599348992, "grad_norm": 6.306298732757568, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8716915845870972, "num_tokens": 353767460.0, "step": 9272 }, { "epoch": 1.1796209133698004, "ewc_loss": 0.05441112816333771, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002340526698390022, "grad_norm": 6.40761661529541, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8718289136886597, "num_tokens": 353801951.0, "step": 9273 }, { "epoch": 1.1797481236483909, "ewc_loss": 0.054363593459129333, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002335773315280676, "grad_norm": 6.310380458831787, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8705912828445435, "num_tokens": 353836815.0, "step": 9274 }, { "epoch": 1.1798753339269812, "ewc_loss": 0.054419465363025665, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023413606686517596, "grad_norm": 6.357429504394531, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8561598658561707, "num_tokens": 353876296.0, "step": 9275 }, { "epoch": 1.1800025442055717, "ewc_loss": 0.05409688502550125, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023335168953053653, "grad_norm": 6.339868068695068, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8467576503753662, "num_tokens": 353911792.0, "step": 9276 }, { "epoch": 1.1801297544841622, "ewc_loss": 0.054607465863227844, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023357468307949603, "grad_norm": 6.324110984802246, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8529985547065735, "num_tokens": 353949532.0, "step": 9277 }, { "epoch": 1.1802569647627528, "ewc_loss": 0.05411750078201294, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023355783196166158, "grad_norm": 6.278635025024414, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8503017425537109, "num_tokens": 353988622.0, "step": 9278 }, { "epoch": 1.1803841750413433, "ewc_loss": 0.0541442334651947, "ewc_loss_diag": 3.075599670410156e-05, "ewc_loss_parallel": 0.00023382512154057622, "grad_norm": 6.333656311035156, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8671295642852783, "num_tokens": 354028136.0, "step": 9279 }, { "epoch": 1.1805113853199338, "ewc_loss": 0.05464959889650345, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002339959901291877, "grad_norm": 9.602953910827637, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8638678193092346, "num_tokens": 354065029.0, "step": 9280 }, { "epoch": 1.1806385955985244, "ewc_loss": 0.058091409504413605, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002684140927158296, "grad_norm": 6.704272747039795, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8736703395843506, "num_tokens": 354099279.0, "step": 9281 }, { "epoch": 1.1807658058771149, "ewc_loss": 0.05415109544992447, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002290109550813213, "grad_norm": 6.34170389175415, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8587043285369873, "num_tokens": 354138870.0, "step": 9282 }, { "epoch": 1.1808930161557054, "ewc_loss": 0.055024538189172745, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002377453783992678, "grad_norm": 6.550346374511719, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8515200018882751, "num_tokens": 354176498.0, "step": 9283 }, { "epoch": 1.181020226434296, "ewc_loss": 0.05479840934276581, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023548409808427095, "grad_norm": 6.371063709259033, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8443296551704407, "num_tokens": 354210063.0, "step": 9284 }, { "epoch": 1.1811474367128865, "ewc_loss": 0.05469890683889389, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023448905267287046, "grad_norm": 6.499564170837402, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.856576144695282, "num_tokens": 354248095.0, "step": 9285 }, { "epoch": 1.181274646991477, "ewc_loss": 0.05456968769431114, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023319688625633717, "grad_norm": 6.302618980407715, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8600088953971863, "num_tokens": 354286897.0, "step": 9286 }, { "epoch": 1.1814018572700675, "ewc_loss": 0.054766975343227386, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023516977671533823, "grad_norm": 6.413570404052734, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8718684911727905, "num_tokens": 354330685.0, "step": 9287 }, { "epoch": 1.181529067548658, "ewc_loss": 0.05459971725940704, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002334971504751593, "grad_norm": 6.284724235534668, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8545271158218384, "num_tokens": 354371120.0, "step": 9288 }, { "epoch": 1.1816562778272484, "ewc_loss": 0.05469192564487457, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002344192616874352, "grad_norm": 6.361236572265625, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8609418869018555, "num_tokens": 354410825.0, "step": 9289 }, { "epoch": 1.1817834881058389, "ewc_loss": 0.05469539761543274, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002344539825571701, "grad_norm": 6.324649810791016, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8608983755111694, "num_tokens": 354453623.0, "step": 9290 }, { "epoch": 1.1819106983844294, "ewc_loss": 0.05473317950963974, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023483179393224418, "grad_norm": 6.387440204620361, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.861927330493927, "num_tokens": 354491854.0, "step": 9291 }, { "epoch": 1.18203790866302, "ewc_loss": 0.05474685877561569, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023496859648730606, "grad_norm": 6.340144157409668, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8488060235977173, "num_tokens": 354530788.0, "step": 9292 }, { "epoch": 1.1821651189416105, "ewc_loss": 0.05472111701965332, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023471117310691625, "grad_norm": 6.32456636428833, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8817014694213867, "num_tokens": 354568576.0, "step": 9293 }, { "epoch": 1.182292329220201, "ewc_loss": 0.05469720810651779, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023447205603588372, "grad_norm": 6.312428951263428, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8635883331298828, "num_tokens": 354611307.0, "step": 9294 }, { "epoch": 1.1824195394987915, "ewc_loss": 0.05470363050699234, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002345363172935322, "grad_norm": 6.367552757263184, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8700588345527649, "num_tokens": 354648810.0, "step": 9295 }, { "epoch": 1.182546749777382, "ewc_loss": 0.05472264438867569, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023472645261790603, "grad_norm": 6.318981647491455, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8658533096313477, "num_tokens": 354688411.0, "step": 9296 }, { "epoch": 1.1826739600559726, "ewc_loss": 0.054653190076351166, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002340319042559713, "grad_norm": 6.335143089294434, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8615881204605103, "num_tokens": 354734700.0, "step": 9297 }, { "epoch": 1.1828011703345631, "ewc_loss": 0.054409489035606384, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002340363134862855, "grad_norm": 6.38680362701416, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8604361414909363, "num_tokens": 354765517.0, "step": 9298 }, { "epoch": 1.1829283806131534, "ewc_loss": 0.05440571904182434, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002339986094739288, "grad_norm": 6.292114734649658, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8665351867675781, "num_tokens": 354808563.0, "step": 9299 }, { "epoch": 1.183055590891744, "ewc_loss": 0.054472748190164566, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023466888524126261, "grad_norm": 6.433363914489746, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8448189496994019, "num_tokens": 354846618.0, "step": 9300 }, { "epoch": 1.1831828011703345, "ewc_loss": 0.05438612028956413, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023380260972771794, "grad_norm": 6.345961093902588, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8646553158760071, "num_tokens": 354881293.0, "step": 9301 }, { "epoch": 1.183310011448925, "ewc_loss": 0.05439665913581848, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023390799469780177, "grad_norm": 6.318403244018555, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.860713005065918, "num_tokens": 354925239.0, "step": 9302 }, { "epoch": 1.1834372217275155, "ewc_loss": 0.054627738893032074, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023377736215479672, "grad_norm": 6.311011791229248, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8498116135597229, "num_tokens": 354967836.0, "step": 9303 }, { "epoch": 1.183564432006106, "ewc_loss": 0.05468465015292168, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023434650211129338, "grad_norm": 6.405726432800293, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8465108871459961, "num_tokens": 355003386.0, "step": 9304 }, { "epoch": 1.1836916422846966, "ewc_loss": 0.054694611579179764, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023444612452294677, "grad_norm": 6.339352607727051, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8411626815795898, "num_tokens": 355043391.0, "step": 9305 }, { "epoch": 1.1838188525632871, "ewc_loss": 0.05466195568442345, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000234119564993307, "grad_norm": 6.305688858032227, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8641242980957031, "num_tokens": 355082820.0, "step": 9306 }, { "epoch": 1.1839460628418776, "ewc_loss": 0.05471629276871681, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023466291895601898, "grad_norm": 6.370282173156738, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8834377527236938, "num_tokens": 355117531.0, "step": 9307 }, { "epoch": 1.1840732731204682, "ewc_loss": 0.054696060717105865, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000234460603678599, "grad_norm": 6.308474063873291, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8514208197593689, "num_tokens": 355159485.0, "step": 9308 }, { "epoch": 1.1842004833990587, "ewc_loss": 0.05477772653102875, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023527727171313018, "grad_norm": 6.379351615905762, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8575273752212524, "num_tokens": 355206051.0, "step": 9309 }, { "epoch": 1.1843276936776492, "ewc_loss": 0.054729510098695755, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023479510855395347, "grad_norm": 6.355953216552734, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8539803624153137, "num_tokens": 355242933.0, "step": 9310 }, { "epoch": 1.1844549039562398, "ewc_loss": 0.05478019267320633, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023530192265752703, "grad_norm": 6.292563438415527, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8742967844009399, "num_tokens": 355286311.0, "step": 9311 }, { "epoch": 1.1845821142348303, "ewc_loss": 0.054791711270809174, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023541713017039, "grad_norm": 6.323506832122803, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8621652722358704, "num_tokens": 355327844.0, "step": 9312 }, { "epoch": 1.1847093245134206, "ewc_loss": 0.05482139438390732, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023571393103338778, "grad_norm": 6.314252853393555, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8718698024749756, "num_tokens": 355368977.0, "step": 9313 }, { "epoch": 1.1848365347920111, "ewc_loss": 0.05482123792171478, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023571237397845834, "grad_norm": 6.3289713859558105, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.852938711643219, "num_tokens": 355411479.0, "step": 9314 }, { "epoch": 1.1849637450706016, "ewc_loss": 0.05485248565673828, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023602487635798752, "grad_norm": 6.380702018737793, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8812682628631592, "num_tokens": 355447370.0, "step": 9315 }, { "epoch": 1.1850909553491922, "ewc_loss": 0.05479025840759277, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023540257825516164, "grad_norm": 6.3036789894104, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8596770167350769, "num_tokens": 355490335.0, "step": 9316 }, { "epoch": 1.1852181656277827, "ewc_loss": 0.054855503141880035, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023605501337442547, "grad_norm": 6.361813068389893, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8628547191619873, "num_tokens": 355527843.0, "step": 9317 }, { "epoch": 1.1853453759063732, "ewc_loss": 0.05472744256258011, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002347744448343292, "grad_norm": 6.315343856811523, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8599326014518738, "num_tokens": 355566094.0, "step": 9318 }, { "epoch": 1.1854725861849638, "ewc_loss": 0.05484810471534729, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002359810605412349, "grad_norm": 6.411230087280273, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8704773187637329, "num_tokens": 355609505.0, "step": 9319 }, { "epoch": 1.1855997964635543, "ewc_loss": 0.05465957522392273, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023409575805999339, "grad_norm": 6.302804470062256, "learning_rate": 1e-06, "loss": 0.5348, "mean_token_accuracy": 0.838134765625, "num_tokens": 355650930.0, "step": 9320 }, { "epoch": 1.1857270067421448, "ewc_loss": 0.05462106317281723, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023615204554516822, "grad_norm": 6.404012203216553, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.872224748134613, "num_tokens": 355684075.0, "step": 9321 }, { "epoch": 1.1858542170207353, "ewc_loss": 0.05475478619337082, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002350478753214702, "grad_norm": 6.3117356300354, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8688628077507019, "num_tokens": 355723567.0, "step": 9322 }, { "epoch": 1.1859814272993259, "ewc_loss": 0.054503634572029114, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023497776419389993, "grad_norm": 6.38803243637085, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8480011224746704, "num_tokens": 355764915.0, "step": 9323 }, { "epoch": 1.1861086375779162, "ewc_loss": 0.05477160960435867, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023521609546151012, "grad_norm": 6.348849773406982, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8636051416397095, "num_tokens": 355804950.0, "step": 9324 }, { "epoch": 1.1862358478565067, "ewc_loss": 0.05477021634578705, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002352021838305518, "grad_norm": 6.357819557189941, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8705378174781799, "num_tokens": 355844155.0, "step": 9325 }, { "epoch": 1.1863630581350972, "ewc_loss": 0.05469953641295433, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023449536820407957, "grad_norm": 6.296313762664795, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8791161179542542, "num_tokens": 355883971.0, "step": 9326 }, { "epoch": 1.1864902684136878, "ewc_loss": 0.05452495068311691, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002351908915443346, "grad_norm": 6.403700828552246, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8574727773666382, "num_tokens": 355922271.0, "step": 9327 }, { "epoch": 1.1866174786922783, "ewc_loss": 0.05453643947839737, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023530577891506255, "grad_norm": 6.3381829261779785, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8607620000839233, "num_tokens": 355961087.0, "step": 9328 }, { "epoch": 1.1867446889708688, "ewc_loss": 0.054503876715898514, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002349801652599126, "grad_norm": 6.329809665679932, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8668558597564697, "num_tokens": 355999553.0, "step": 9329 }, { "epoch": 1.1868718992494594, "ewc_loss": 0.05454474315047264, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023538884124718606, "grad_norm": 6.385214328765869, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8673036098480225, "num_tokens": 356038503.0, "step": 9330 }, { "epoch": 1.1869991095280499, "ewc_loss": 0.05445230007171631, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023446438717655838, "grad_norm": 6.286166191101074, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8740493059158325, "num_tokens": 356078538.0, "step": 9331 }, { "epoch": 1.1871263198066404, "ewc_loss": 0.054583437740802765, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023577579122502357, "grad_norm": 6.391298770904541, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8749387264251709, "num_tokens": 356115031.0, "step": 9332 }, { "epoch": 1.187253530085231, "ewc_loss": 0.05453068017959595, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002352481969865039, "grad_norm": 6.296888828277588, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8774411678314209, "num_tokens": 356154751.0, "step": 9333 }, { "epoch": 1.1873807403638215, "ewc_loss": 0.054674454033374786, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023668596986681223, "grad_norm": 6.438258171081543, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.847313642501831, "num_tokens": 356192105.0, "step": 9334 }, { "epoch": 1.187507950642412, "ewc_loss": 0.05466672033071518, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002341672225156799, "grad_norm": 6.351507663726807, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8518929481506348, "num_tokens": 356224964.0, "step": 9335 }, { "epoch": 1.1876351609210025, "ewc_loss": 0.05460990220308304, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002360404032515362, "grad_norm": 6.404393196105957, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8677360415458679, "num_tokens": 356261565.0, "step": 9336 }, { "epoch": 1.187762371199593, "ewc_loss": 0.05448310077190399, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.00023477240756619722, "grad_norm": 6.317089080810547, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8656615018844604, "num_tokens": 356301886.0, "step": 9337 }, { "epoch": 1.1878895814781834, "ewc_loss": 0.05464300885796547, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.000236371488426812, "grad_norm": 6.413353443145752, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8712860941886902, "num_tokens": 356339481.0, "step": 9338 }, { "epoch": 1.1880167917567739, "ewc_loss": 0.054515279829502106, "ewc_loss_diag": 3.0994415283203125e-05, "ewc_loss_parallel": 0.0002350941940676421, "grad_norm": 6.344775676727295, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8402991890907288, "num_tokens": 356387038.0, "step": 9339 }, { "epoch": 1.1881440020353644, "ewc_loss": 0.054842106997966766, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023592106299474835, "grad_norm": 6.424922943115234, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8589642643928528, "num_tokens": 356433256.0, "step": 9340 }, { "epoch": 1.188271212313955, "ewc_loss": 0.05472983047366142, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002347983099753037, "grad_norm": 6.321232795715332, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.852920413017273, "num_tokens": 356471282.0, "step": 9341 }, { "epoch": 1.1883984225925455, "ewc_loss": 0.05485853552818298, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023608535411767662, "grad_norm": 6.3799262046813965, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8683441877365112, "num_tokens": 356508951.0, "step": 9342 }, { "epoch": 1.188525632871136, "ewc_loss": 0.05480019375681877, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023550193873234093, "grad_norm": 6.352054119110107, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8655555248260498, "num_tokens": 356544487.0, "step": 9343 }, { "epoch": 1.1886528431497265, "ewc_loss": 0.05488599091768265, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023635993420612067, "grad_norm": 6.3831281661987305, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8692211508750916, "num_tokens": 356580873.0, "step": 9344 }, { "epoch": 1.188780053428317, "ewc_loss": 0.05481908842921257, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023569088079966605, "grad_norm": 6.418584823608398, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8611223101615906, "num_tokens": 356613622.0, "step": 9345 }, { "epoch": 1.1889072637069076, "ewc_loss": 0.05480489879846573, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023554898507427424, "grad_norm": 6.361718654632568, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8399918079376221, "num_tokens": 356660058.0, "step": 9346 }, { "epoch": 1.189034473985498, "ewc_loss": 0.05483909696340561, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002358909696340561, "grad_norm": 6.347503662109375, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8614203929901123, "num_tokens": 356701614.0, "step": 9347 }, { "epoch": 1.1891616842640884, "ewc_loss": 0.054832734167575836, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023582731955684721, "grad_norm": 6.3321919441223145, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.877626359462738, "num_tokens": 356737105.0, "step": 9348 }, { "epoch": 1.189288894542679, "ewc_loss": 0.05484382063150406, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023593820515088737, "grad_norm": 6.392195701599121, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8534897565841675, "num_tokens": 356779382.0, "step": 9349 }, { "epoch": 1.1894161048212695, "ewc_loss": 0.05483188107609749, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023581880668643862, "grad_norm": 6.40451192855835, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8581468462944031, "num_tokens": 356813836.0, "step": 9350 }, { "epoch": 1.18954331509986, "ewc_loss": 0.05475437641143799, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023504378623329103, "grad_norm": 6.406801700592041, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8528926372528076, "num_tokens": 356852591.0, "step": 9351 }, { "epoch": 1.1896705253784505, "ewc_loss": 0.054834865033626556, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023584863811265677, "grad_norm": 9.578184127807617, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.861777663230896, "num_tokens": 356889940.0, "step": 9352 }, { "epoch": 1.189797735657041, "ewc_loss": 0.05811409652233124, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002686409861780703, "grad_norm": 6.724918365478516, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8655269145965576, "num_tokens": 356927878.0, "step": 9353 }, { "epoch": 1.1899249459356316, "ewc_loss": 0.05437504127621651, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023125042207539082, "grad_norm": 6.445993423461914, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.871631383895874, "num_tokens": 356963339.0, "step": 9354 }, { "epoch": 1.1900521562142221, "ewc_loss": 0.05514582246541977, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002389582514297217, "grad_norm": 6.4401960372924805, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8677196502685547, "num_tokens": 357002999.0, "step": 9355 }, { "epoch": 1.1901793664928126, "ewc_loss": 0.05500933527946472, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002375933254370466, "grad_norm": 6.418623924255371, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8657742142677307, "num_tokens": 357044305.0, "step": 9356 }, { "epoch": 1.1903065767714032, "ewc_loss": 0.054974265396595, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023724266793578863, "grad_norm": 6.412832736968994, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8644595146179199, "num_tokens": 357082197.0, "step": 9357 }, { "epoch": 1.1904337870499937, "ewc_loss": 0.05484946072101593, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002359945938223973, "grad_norm": 6.452147483825684, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8446648716926575, "num_tokens": 357119541.0, "step": 9358 }, { "epoch": 1.1905609973285842, "ewc_loss": 0.05492140352725983, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023671402595937252, "grad_norm": 6.418160915374756, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8842439651489258, "num_tokens": 357159176.0, "step": 9359 }, { "epoch": 1.1906882076071748, "ewc_loss": 0.0549149289727211, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023664929904043674, "grad_norm": 6.464008808135986, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8629037141799927, "num_tokens": 357199204.0, "step": 9360 }, { "epoch": 1.1908154178857653, "ewc_loss": 0.05481275916099548, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023562757996842265, "grad_norm": 6.362250804901123, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8781022429466248, "num_tokens": 357236900.0, "step": 9361 }, { "epoch": 1.1909426281643556, "ewc_loss": 0.05491705238819122, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023667050118092448, "grad_norm": 6.438540935516357, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8638436794281006, "num_tokens": 357271622.0, "step": 9362 }, { "epoch": 1.1910698384429461, "ewc_loss": 0.0548066645860672, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002355666656512767, "grad_norm": 6.357246398925781, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8737556338310242, "num_tokens": 357311145.0, "step": 9363 }, { "epoch": 1.1911970487215366, "ewc_loss": 0.054783932864665985, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023533933563157916, "grad_norm": 6.405243396759033, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8798308372497559, "num_tokens": 357344609.0, "step": 9364 }, { "epoch": 1.1913242590001272, "ewc_loss": 0.0547446683049202, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002349466667510569, "grad_norm": 6.401276111602783, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8601840734481812, "num_tokens": 357378896.0, "step": 9365 }, { "epoch": 1.1914514692787177, "ewc_loss": 0.05475790426135063, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023507904552388936, "grad_norm": 6.340201377868652, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8666449785232544, "num_tokens": 357424730.0, "step": 9366 }, { "epoch": 1.1915786795573082, "ewc_loss": 0.054784998297691345, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002353499730816111, "grad_norm": 6.424521446228027, "learning_rate": 1e-06, "loss": 0.5268, "mean_token_accuracy": 0.8452513217926025, "num_tokens": 357461152.0, "step": 9367 }, { "epoch": 1.1917058898358988, "ewc_loss": 0.054713498800992966, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023463497927878052, "grad_norm": 6.415173530578613, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.855712890625, "num_tokens": 357502305.0, "step": 9368 }, { "epoch": 1.1918331001144893, "ewc_loss": 0.05476023256778717, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023510234314016998, "grad_norm": 6.442309379577637, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8568955659866333, "num_tokens": 357539187.0, "step": 9369 }, { "epoch": 1.1919603103930798, "ewc_loss": 0.05473614111542702, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002348614070797339, "grad_norm": 6.418538570404053, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8519018292427063, "num_tokens": 357569520.0, "step": 9370 }, { "epoch": 1.1920875206716703, "ewc_loss": 0.05474498122930527, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000234949795412831, "grad_norm": 6.383213043212891, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8625596761703491, "num_tokens": 357605253.0, "step": 9371 }, { "epoch": 1.1922147309502609, "ewc_loss": 0.05474800616502762, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023498007794842124, "grad_norm": 6.414586544036865, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8735538125038147, "num_tokens": 357639701.0, "step": 9372 }, { "epoch": 1.1923419412288512, "ewc_loss": 0.054707251489162445, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023457252245862037, "grad_norm": 6.348640441894531, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8716932535171509, "num_tokens": 357675611.0, "step": 9373 }, { "epoch": 1.1924691515074417, "ewc_loss": 0.054805684834718704, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002355568576604128, "grad_norm": 6.429347515106201, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.874522864818573, "num_tokens": 357708521.0, "step": 9374 }, { "epoch": 1.1925963617860322, "ewc_loss": 0.05473107844591141, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023481081007048488, "grad_norm": 6.346776962280273, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8537495732307434, "num_tokens": 357749970.0, "step": 9375 }, { "epoch": 1.1927235720646228, "ewc_loss": 0.05475447326898575, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002350447466596961, "grad_norm": 6.432966709136963, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.873173713684082, "num_tokens": 357783287.0, "step": 9376 }, { "epoch": 1.1928507823432133, "ewc_loss": 0.05469835549592972, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023448355204891413, "grad_norm": 6.393184185028076, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8637211918830872, "num_tokens": 357818883.0, "step": 9377 }, { "epoch": 1.1929779926218038, "ewc_loss": 0.054714351892471313, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023464349214918911, "grad_norm": 6.315439701080322, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8767415881156921, "num_tokens": 357862862.0, "step": 9378 }, { "epoch": 1.1931052029003943, "ewc_loss": 0.054772716015577316, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002352271694689989, "grad_norm": 6.427871227264404, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8521772027015686, "num_tokens": 357901574.0, "step": 9379 }, { "epoch": 1.1932324131789849, "ewc_loss": 0.05469854921102524, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023448547290172428, "grad_norm": 6.366612911224365, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8629475235939026, "num_tokens": 357941822.0, "step": 9380 }, { "epoch": 1.1933596234575754, "ewc_loss": 0.054791077971458435, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023541077098343521, "grad_norm": 6.375030517578125, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8554238080978394, "num_tokens": 357982820.0, "step": 9381 }, { "epoch": 1.193486833736166, "ewc_loss": 0.054688192903995514, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023438193602487445, "grad_norm": 6.350055694580078, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8602153062820435, "num_tokens": 358020491.0, "step": 9382 }, { "epoch": 1.1936140440147565, "ewc_loss": 0.054797545075416565, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023547542514279485, "grad_norm": 6.368526935577393, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8645280003547668, "num_tokens": 358059775.0, "step": 9383 }, { "epoch": 1.193741254293347, "ewc_loss": 0.05475946143269539, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023509461607318372, "grad_norm": 6.328153610229492, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8712227940559387, "num_tokens": 358098672.0, "step": 9384 }, { "epoch": 1.1938684645719375, "ewc_loss": 0.054809801280498505, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002355980104766786, "grad_norm": 6.361833095550537, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8561428785324097, "num_tokens": 358140850.0, "step": 9385 }, { "epoch": 1.193995674850528, "ewc_loss": 0.054824162274599075, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023574162332806736, "grad_norm": 6.401158332824707, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8689268827438354, "num_tokens": 358175503.0, "step": 9386 }, { "epoch": 1.1941228851291183, "ewc_loss": 0.05476503074169159, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023515029170084745, "grad_norm": 6.381941318511963, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8651537299156189, "num_tokens": 358210615.0, "step": 9387 }, { "epoch": 1.1942500954077089, "ewc_loss": 0.05484432727098465, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002359432546654716, "grad_norm": 6.615309715270996, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8658725619316101, "num_tokens": 358246605.0, "step": 9388 }, { "epoch": 1.1943773056862994, "ewc_loss": 0.05474866181612015, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000234986626310274, "grad_norm": 6.311036586761475, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8602803945541382, "num_tokens": 358283136.0, "step": 9389 }, { "epoch": 1.19450451596489, "ewc_loss": 0.05482332780957222, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002357332850806415, "grad_norm": 6.395003318786621, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8721661567687988, "num_tokens": 358325437.0, "step": 9390 }, { "epoch": 1.1946317262434805, "ewc_loss": 0.054701317101716995, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002345131797483191, "grad_norm": 6.309840202331543, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8629984855651855, "num_tokens": 358365360.0, "step": 9391 }, { "epoch": 1.194758936522071, "ewc_loss": 0.054810576140880585, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002356057520955801, "grad_norm": 6.326606273651123, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8785336017608643, "num_tokens": 358406502.0, "step": 9392 }, { "epoch": 1.1948861468006615, "ewc_loss": 0.05482662096619606, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002357662160648033, "grad_norm": 6.460119247436523, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8593837022781372, "num_tokens": 358437220.0, "step": 9393 }, { "epoch": 1.195013357079252, "ewc_loss": 0.054831016808748245, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002358101774007082, "grad_norm": 6.3827691078186035, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8510738611221313, "num_tokens": 358475314.0, "step": 9394 }, { "epoch": 1.1951405673578426, "ewc_loss": 0.05482596158981323, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023575962404720485, "grad_norm": 6.3614068031311035, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8723199963569641, "num_tokens": 358520777.0, "step": 9395 }, { "epoch": 1.195267777636433, "ewc_loss": 0.054832275956869125, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002358227502554655, "grad_norm": 6.403806686401367, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.88328617811203, "num_tokens": 358557973.0, "step": 9396 }, { "epoch": 1.1953949879150234, "ewc_loss": 0.054823920130729675, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000235739178606309, "grad_norm": 6.370597839355469, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8707880973815918, "num_tokens": 358596795.0, "step": 9397 }, { "epoch": 1.195522198193614, "ewc_loss": 0.0548592135310173, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023609213531017303, "grad_norm": 6.400449752807617, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8639713525772095, "num_tokens": 358632146.0, "step": 9398 }, { "epoch": 1.1956494084722045, "ewc_loss": 0.05477199703454971, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023521995171904564, "grad_norm": 6.364461898803711, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8715728521347046, "num_tokens": 358672194.0, "step": 9399 }, { "epoch": 1.195776618750795, "ewc_loss": 0.05479194223880768, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002354194293729961, "grad_norm": 6.389700412750244, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8713024854660034, "num_tokens": 358710192.0, "step": 9400 }, { "epoch": 1.1959038290293855, "ewc_loss": 0.054767951369285583, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023517954105045646, "grad_norm": 6.450639724731445, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8499823808670044, "num_tokens": 358749284.0, "step": 9401 }, { "epoch": 1.196031039307976, "ewc_loss": 0.05471824109554291, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023468243307434022, "grad_norm": 6.432379245758057, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8594409227371216, "num_tokens": 358782830.0, "step": 9402 }, { "epoch": 1.1961582495865666, "ewc_loss": 0.05469191074371338, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023441911616828293, "grad_norm": 6.363259792327881, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8620379567146301, "num_tokens": 358820696.0, "step": 9403 }, { "epoch": 1.196285459865157, "ewc_loss": 0.05472876504063606, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023478765797335654, "grad_norm": 6.488631725311279, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8709560036659241, "num_tokens": 358853616.0, "step": 9404 }, { "epoch": 1.1964126701437476, "ewc_loss": 0.054671116173267365, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023421115474775434, "grad_norm": 6.2886128425598145, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8731279373168945, "num_tokens": 358893420.0, "step": 9405 }, { "epoch": 1.1965398804223382, "ewc_loss": 0.05476713925600052, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002351714065298438, "grad_norm": 6.450521469116211, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8657640814781189, "num_tokens": 358930704.0, "step": 9406 }, { "epoch": 1.1966670907009287, "ewc_loss": 0.054663725197315216, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023413724557030946, "grad_norm": 6.2754225730896, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8772957921028137, "num_tokens": 358967730.0, "step": 9407 }, { "epoch": 1.1967943009795192, "ewc_loss": 0.05485287308692932, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023602874716743827, "grad_norm": 6.442992210388184, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8761943578720093, "num_tokens": 359007241.0, "step": 9408 }, { "epoch": 1.1969215112581097, "ewc_loss": 0.05475418269634247, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023504185082856566, "grad_norm": 6.372720718383789, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8510562181472778, "num_tokens": 359045573.0, "step": 9409 }, { "epoch": 1.1970487215367003, "ewc_loss": 0.05487608164548874, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002362607920076698, "grad_norm": 6.38729190826416, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8662739992141724, "num_tokens": 359083420.0, "step": 9410 }, { "epoch": 1.1971759318152906, "ewc_loss": 0.054717227816581726, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002346722612855956, "grad_norm": 6.376492977142334, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8775055408477783, "num_tokens": 359117555.0, "step": 9411 }, { "epoch": 1.1973031420938811, "ewc_loss": 0.05482541769742966, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023575419618282467, "grad_norm": 6.377387046813965, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8608134984970093, "num_tokens": 359154597.0, "step": 9412 }, { "epoch": 1.1974303523724716, "ewc_loss": 0.05475320667028427, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023503208649344742, "grad_norm": 6.362587928771973, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8575799465179443, "num_tokens": 359190074.0, "step": 9413 }, { "epoch": 1.1975575626510622, "ewc_loss": 0.05478659272193909, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023536593653261662, "grad_norm": 6.372334003448486, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8516442775726318, "num_tokens": 359232711.0, "step": 9414 }, { "epoch": 1.1976847729296527, "ewc_loss": 0.05475038290023804, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002350038557779044, "grad_norm": 6.430867671966553, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8627746105194092, "num_tokens": 359265582.0, "step": 9415 }, { "epoch": 1.1978119832082432, "ewc_loss": 0.05478439852595329, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000235343977692537, "grad_norm": 6.308299541473389, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8723838329315186, "num_tokens": 359303177.0, "step": 9416 }, { "epoch": 1.1979391934868338, "ewc_loss": 0.05481221526861191, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002356221666559577, "grad_norm": 6.370903015136719, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8738611340522766, "num_tokens": 359338712.0, "step": 9417 }, { "epoch": 1.1980664037654243, "ewc_loss": 0.05480622500181198, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023556224186904728, "grad_norm": 6.390594005584717, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8715249300003052, "num_tokens": 359372849.0, "step": 9418 }, { "epoch": 1.1981936140440148, "ewc_loss": 0.05481070280075073, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023560704721603543, "grad_norm": 6.398818016052246, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8684097528457642, "num_tokens": 359407981.0, "step": 9419 }, { "epoch": 1.1983208243226053, "ewc_loss": 0.05482972785830498, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023579728440381587, "grad_norm": 6.353768348693848, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8435484170913696, "num_tokens": 359450180.0, "step": 9420 }, { "epoch": 1.1984480346011959, "ewc_loss": 0.054798781871795654, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023548779427073896, "grad_norm": 6.395026206970215, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8629986643791199, "num_tokens": 359487909.0, "step": 9421 }, { "epoch": 1.1985752448797862, "ewc_loss": 0.054864220321178436, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023614222300238907, "grad_norm": 6.363056659698486, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8750232458114624, "num_tokens": 359522609.0, "step": 9422 }, { "epoch": 1.1987024551583767, "ewc_loss": 0.054844096302986145, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023594097001478076, "grad_norm": 6.390039920806885, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8611814379692078, "num_tokens": 359560735.0, "step": 9423 }, { "epoch": 1.1988296654369672, "ewc_loss": 0.05484648048877716, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023596482060384005, "grad_norm": 6.312048435211182, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8578282594680786, "num_tokens": 359602994.0, "step": 9424 }, { "epoch": 1.1989568757155578, "ewc_loss": 0.054885655641555786, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023635652905795723, "grad_norm": 6.417677402496338, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8637250661849976, "num_tokens": 359638864.0, "step": 9425 }, { "epoch": 1.1990840859941483, "ewc_loss": 0.05491679906845093, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002366679982515052, "grad_norm": 6.485564231872559, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8717549443244934, "num_tokens": 359674202.0, "step": 9426 }, { "epoch": 1.1992112962727388, "ewc_loss": 0.05480428785085678, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002355428587179631, "grad_norm": 6.3859381675720215, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8688539862632751, "num_tokens": 359710408.0, "step": 9427 }, { "epoch": 1.1993385065513293, "ewc_loss": 0.054810672998428345, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023560671252198517, "grad_norm": 6.326126575469971, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8717590570449829, "num_tokens": 359749068.0, "step": 9428 }, { "epoch": 1.1994657168299199, "ewc_loss": 0.05484406650066376, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023594064987264574, "grad_norm": 6.375723838806152, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8645762205123901, "num_tokens": 359788449.0, "step": 9429 }, { "epoch": 1.1995929271085104, "ewc_loss": 0.05485520511865616, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023605207388754934, "grad_norm": 6.395941257476807, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.858667254447937, "num_tokens": 359824167.0, "step": 9430 }, { "epoch": 1.199720137387101, "ewc_loss": 0.054762452840805054, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023512450570706278, "grad_norm": 6.344940185546875, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8591954112052917, "num_tokens": 359861476.0, "step": 9431 }, { "epoch": 1.1998473476656915, "ewc_loss": 0.05489358305931091, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002364358224440366, "grad_norm": 6.370631694793701, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8624649047851562, "num_tokens": 359906463.0, "step": 9432 }, { "epoch": 1.199974557944282, "ewc_loss": 0.054873026907444, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002362302620895207, "grad_norm": 6.373773097991943, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8671472072601318, "num_tokens": 359940576.0, "step": 9433 }, { "epoch": 1.2001017682228725, "ewc_loss": 0.05492393672466278, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023673934629186988, "grad_norm": 6.372265815734863, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8743353486061096, "num_tokens": 359973616.0, "step": 9434 }, { "epoch": 1.200228978501463, "ewc_loss": 0.054881010204553604, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023631010844837874, "grad_norm": 6.379481315612793, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8763772249221802, "num_tokens": 360011059.0, "step": 9435 }, { "epoch": 1.2003561887800533, "ewc_loss": 0.054874442517757416, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002362444211030379, "grad_norm": 6.31818962097168, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8731728792190552, "num_tokens": 360048307.0, "step": 9436 }, { "epoch": 1.2004833990586439, "ewc_loss": 0.054937347769737244, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002368735003983602, "grad_norm": 6.420691967010498, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8641999363899231, "num_tokens": 360086946.0, "step": 9437 }, { "epoch": 1.2006106093372344, "ewc_loss": 0.05486258491873741, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023612585209775716, "grad_norm": 6.3730878829956055, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8684632778167725, "num_tokens": 360121908.0, "step": 9438 }, { "epoch": 1.200737819615825, "ewc_loss": 0.054891474545001984, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023641476582270116, "grad_norm": 6.354493141174316, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8564238548278809, "num_tokens": 360164349.0, "step": 9439 }, { "epoch": 1.2008650298944155, "ewc_loss": 0.054810814559459686, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023560815316159278, "grad_norm": 6.3281402587890625, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8793340921401978, "num_tokens": 360197660.0, "step": 9440 }, { "epoch": 1.200992240173006, "ewc_loss": 0.054889846593141556, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023639846767764539, "grad_norm": 6.406444549560547, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8428999185562134, "num_tokens": 360240297.0, "step": 9441 }, { "epoch": 1.2011194504515965, "ewc_loss": 0.05485118180513382, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002360118378419429, "grad_norm": 6.416927814483643, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8534945249557495, "num_tokens": 360281566.0, "step": 9442 }, { "epoch": 1.201246660730187, "ewc_loss": 0.05490340292453766, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023653404787182808, "grad_norm": 6.445544719696045, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8706539869308472, "num_tokens": 360313927.0, "step": 9443 }, { "epoch": 1.2013738710087776, "ewc_loss": 0.05482473969459534, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023574740043841302, "grad_norm": 6.374344348907471, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8679631948471069, "num_tokens": 360349369.0, "step": 9444 }, { "epoch": 1.201501081287368, "ewc_loss": 0.05489564687013626, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023645645705983043, "grad_norm": 6.413904190063477, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.863806962966919, "num_tokens": 360386996.0, "step": 9445 }, { "epoch": 1.2016282915659584, "ewc_loss": 0.054833050817251205, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023583050642628223, "grad_norm": 6.355177879333496, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8684444427490234, "num_tokens": 360424883.0, "step": 9446 }, { "epoch": 1.201755501844549, "ewc_loss": 0.05491621419787407, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002366621483815834, "grad_norm": 6.451212406158447, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8685860633850098, "num_tokens": 360457992.0, "step": 9447 }, { "epoch": 1.2018827121231395, "ewc_loss": 0.05485587567090988, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023605875321663916, "grad_norm": 6.465625762939453, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.849593997001648, "num_tokens": 360497934.0, "step": 9448 }, { "epoch": 1.20200992240173, "ewc_loss": 0.054854974150657654, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023604971647728235, "grad_norm": 6.3679518699646, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8609063625335693, "num_tokens": 360533928.0, "step": 9449 }, { "epoch": 1.2021371326803205, "ewc_loss": 0.05490725114941597, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023657250858377665, "grad_norm": 6.475634574890137, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8667120933532715, "num_tokens": 360571076.0, "step": 9450 }, { "epoch": 1.202264342958911, "ewc_loss": 0.05481225997209549, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023562260321341455, "grad_norm": 6.367668628692627, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8565254211425781, "num_tokens": 360611957.0, "step": 9451 }, { "epoch": 1.2023915532375016, "ewc_loss": 0.054893337190151215, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000236433363170363, "grad_norm": 6.410768985748291, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8575773239135742, "num_tokens": 360647823.0, "step": 9452 }, { "epoch": 1.202518763516092, "ewc_loss": 0.054848089814186096, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023598087136633694, "grad_norm": 6.374740123748779, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8565434813499451, "num_tokens": 360688341.0, "step": 9453 }, { "epoch": 1.2026459737946826, "ewc_loss": 0.05488027259707451, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023630273062735796, "grad_norm": 6.4781880378723145, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8560501337051392, "num_tokens": 360730417.0, "step": 9454 }, { "epoch": 1.2027731840732732, "ewc_loss": 0.054830800741910934, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023580800916533917, "grad_norm": 6.410229682922363, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.844070315361023, "num_tokens": 360771931.0, "step": 9455 }, { "epoch": 1.2029003943518637, "ewc_loss": 0.05483662337064743, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023586626048199832, "grad_norm": 6.381075859069824, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8621472120285034, "num_tokens": 360811460.0, "step": 9456 }, { "epoch": 1.2030276046304542, "ewc_loss": 0.054801154881715775, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023551154299639165, "grad_norm": 6.451333999633789, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.877556324005127, "num_tokens": 360849476.0, "step": 9457 }, { "epoch": 1.2031548149090447, "ewc_loss": 0.0548616386950016, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002361163788009435, "grad_norm": 6.412315368652344, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8536481857299805, "num_tokens": 360891178.0, "step": 9458 }, { "epoch": 1.2032820251876353, "ewc_loss": 0.05479276925325394, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002354276948608458, "grad_norm": 6.3704400062561035, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8603765964508057, "num_tokens": 360930503.0, "step": 9459 }, { "epoch": 1.2034092354662256, "ewc_loss": 0.05478159338235855, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023531592159997672, "grad_norm": 6.46185827255249, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8607707023620605, "num_tokens": 360963730.0, "step": 9460 }, { "epoch": 1.203536445744816, "ewc_loss": 0.054839953780174255, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023589951160829514, "grad_norm": 6.349710941314697, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8506393432617188, "num_tokens": 361003796.0, "step": 9461 }, { "epoch": 1.2036636560234066, "ewc_loss": 0.05489032715559006, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023640326980967075, "grad_norm": 6.432949066162109, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8634678721427917, "num_tokens": 361044818.0, "step": 9462 }, { "epoch": 1.2037908663019972, "ewc_loss": 0.054792869836091995, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023542869894299656, "grad_norm": 6.373003959655762, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8752491474151611, "num_tokens": 361080692.0, "step": 9463 }, { "epoch": 1.2039180765805877, "ewc_loss": 0.05494898557662964, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023698982840869576, "grad_norm": 6.457491397857666, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8543537855148315, "num_tokens": 361116609.0, "step": 9464 }, { "epoch": 1.2040452868591782, "ewc_loss": 0.054848022758960724, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023598024563398212, "grad_norm": 6.365379810333252, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8565747737884521, "num_tokens": 361154559.0, "step": 9465 }, { "epoch": 1.2041724971377687, "ewc_loss": 0.05492323637008667, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023673234682064503, "grad_norm": 6.417684555053711, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8727147579193115, "num_tokens": 361191005.0, "step": 9466 }, { "epoch": 1.2042997074163593, "ewc_loss": 0.054856590926647186, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023606591275893152, "grad_norm": 6.33068323135376, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8614749312400818, "num_tokens": 361230191.0, "step": 9467 }, { "epoch": 1.2044269176949498, "ewc_loss": 0.05495738983154297, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023707389482297003, "grad_norm": 6.464820384979248, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8564170598983765, "num_tokens": 361269132.0, "step": 9468 }, { "epoch": 1.2045541279735403, "ewc_loss": 0.05488801375031471, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023638014681637287, "grad_norm": 6.464224815368652, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8558394908905029, "num_tokens": 361306674.0, "step": 9469 }, { "epoch": 1.2046813382521309, "ewc_loss": 0.05489249899983406, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002364249958191067, "grad_norm": 6.333479404449463, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8651710152626038, "num_tokens": 361348309.0, "step": 9470 }, { "epoch": 1.2048085485307212, "ewc_loss": 0.05497588962316513, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023725887876935303, "grad_norm": 6.406379222869873, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8589010238647461, "num_tokens": 361386824.0, "step": 9471 }, { "epoch": 1.2049357588093117, "ewc_loss": 0.05483392998576164, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023583929578308016, "grad_norm": 6.379718780517578, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8442684412002563, "num_tokens": 361426590.0, "step": 9472 }, { "epoch": 1.2050629690879022, "ewc_loss": 0.05494242161512375, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002369242429267615, "grad_norm": 6.381949424743652, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8502242565155029, "num_tokens": 361463259.0, "step": 9473 }, { "epoch": 1.2051901793664928, "ewc_loss": 0.054903544485569, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023653543030377477, "grad_norm": 6.387330532073975, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8723136186599731, "num_tokens": 361498750.0, "step": 9474 }, { "epoch": 1.2053173896450833, "ewc_loss": 0.05496014654636383, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023710145615041256, "grad_norm": 6.420958995819092, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8534393310546875, "num_tokens": 361537101.0, "step": 9475 }, { "epoch": 1.2054445999236738, "ewc_loss": 0.05492393672466278, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002367393608437851, "grad_norm": 6.394403457641602, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8662078380584717, "num_tokens": 361572130.0, "step": 9476 }, { "epoch": 1.2055718102022643, "ewc_loss": 0.054913803935050964, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023663802130613476, "grad_norm": 6.387431621551514, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8790168166160583, "num_tokens": 361608484.0, "step": 9477 }, { "epoch": 1.2056990204808549, "ewc_loss": 0.055025555193424225, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023775556473992765, "grad_norm": 6.333186149597168, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.8808907270431519, "num_tokens": 361650361.0, "step": 9478 }, { "epoch": 1.2058262307594454, "ewc_loss": 0.054965417832136154, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023715417773928493, "grad_norm": 6.428996562957764, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8716689348220825, "num_tokens": 361680670.0, "step": 9479 }, { "epoch": 1.205953441038036, "ewc_loss": 0.05496997386217117, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023719973978586495, "grad_norm": 6.2833251953125, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8694949150085449, "num_tokens": 361723504.0, "step": 9480 }, { "epoch": 1.2060806513166265, "ewc_loss": 0.05511261522769928, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002386261330684647, "grad_norm": 6.3781280517578125, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8697179555892944, "num_tokens": 361765850.0, "step": 9481 }, { "epoch": 1.206207861595217, "ewc_loss": 0.054951105266809464, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023701105965301394, "grad_norm": 6.332077503204346, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8550039529800415, "num_tokens": 361804375.0, "step": 9482 }, { "epoch": 1.2063350718738075, "ewc_loss": 0.05510028451681137, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023850283469073474, "grad_norm": 6.4711151123046875, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8601793646812439, "num_tokens": 361834714.0, "step": 9483 }, { "epoch": 1.206462282152398, "ewc_loss": 0.054962195456027985, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023712193069513887, "grad_norm": 6.3593220710754395, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8634278178215027, "num_tokens": 361877568.0, "step": 9484 }, { "epoch": 1.2065894924309883, "ewc_loss": 0.055039986968040466, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023789986153133214, "grad_norm": 6.422599792480469, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8594439029693604, "num_tokens": 361914785.0, "step": 9485 }, { "epoch": 1.2067167027095789, "ewc_loss": 0.05490005761384964, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023650057846680284, "grad_norm": 6.3074951171875, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8711518049240112, "num_tokens": 361956781.0, "step": 9486 }, { "epoch": 1.2068439129881694, "ewc_loss": 0.055025987327098846, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023775988665875047, "grad_norm": 6.393278121948242, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8737362623214722, "num_tokens": 361997408.0, "step": 9487 }, { "epoch": 1.20697112326676, "ewc_loss": 0.05493296682834625, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023682967002969235, "grad_norm": 6.38985538482666, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8639175295829773, "num_tokens": 362030226.0, "step": 9488 }, { "epoch": 1.2070983335453505, "ewc_loss": 0.05498656630516052, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023736564617138356, "grad_norm": 6.328352928161621, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8749141097068787, "num_tokens": 362068370.0, "step": 9489 }, { "epoch": 1.207225543823941, "ewc_loss": 0.05503886565566063, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002378886565566063, "grad_norm": 6.3676557540893555, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8714940547943115, "num_tokens": 362108569.0, "step": 9490 }, { "epoch": 1.2073527541025315, "ewc_loss": 0.05495864897966385, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002370864967815578, "grad_norm": 6.429989337921143, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8553181290626526, "num_tokens": 362140476.0, "step": 9491 }, { "epoch": 1.207479964381122, "ewc_loss": 0.05493579059839249, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023685788619332016, "grad_norm": 6.426530838012695, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.870413064956665, "num_tokens": 362171132.0, "step": 9492 }, { "epoch": 1.2076071746597126, "ewc_loss": 0.05493482947349548, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002368482673773542, "grad_norm": 6.382329940795898, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8627535700798035, "num_tokens": 362204791.0, "step": 9493 }, { "epoch": 1.207734384938303, "ewc_loss": 0.054935701191425323, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023685701307840645, "grad_norm": 6.32211971282959, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8672131299972534, "num_tokens": 362242360.0, "step": 9494 }, { "epoch": 1.2078615952168934, "ewc_loss": 0.05497429519891739, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002372429589740932, "grad_norm": 6.393984317779541, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8666648864746094, "num_tokens": 362276558.0, "step": 9495 }, { "epoch": 1.207988805495484, "ewc_loss": 0.05499967187643051, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023749671527184546, "grad_norm": 6.476049423217773, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8501604795455933, "num_tokens": 362307055.0, "step": 9496 }, { "epoch": 1.2081160157740745, "ewc_loss": 0.05489404499530792, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023644044995307922, "grad_norm": 6.294323921203613, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8744800090789795, "num_tokens": 362342433.0, "step": 9497 }, { "epoch": 1.208243226052665, "ewc_loss": 0.055012233555316925, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002376223128521815, "grad_norm": 6.354586124420166, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8853422403335571, "num_tokens": 362377008.0, "step": 9498 }, { "epoch": 1.2083704363312555, "ewc_loss": 0.054944004863500595, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023694004630669951, "grad_norm": 6.433345317840576, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8779276609420776, "num_tokens": 362412089.0, "step": 9499 }, { "epoch": 1.208497646609846, "ewc_loss": 0.05494595319032669, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023695951676927507, "grad_norm": 6.299810886383057, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8558297157287598, "num_tokens": 362455817.0, "step": 9500 }, { "epoch": 1.2086248568884366, "ewc_loss": 0.05499175935983658, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002374175819568336, "grad_norm": 6.31204080581665, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8645709753036499, "num_tokens": 362498859.0, "step": 9501 }, { "epoch": 1.208752067167027, "ewc_loss": 0.055019766092300415, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023769764811731875, "grad_norm": 6.357493877410889, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8642131090164185, "num_tokens": 362542346.0, "step": 9502 }, { "epoch": 1.2088792774456176, "ewc_loss": 0.05498786270618439, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023737862647976726, "grad_norm": 6.385866165161133, "learning_rate": 1e-06, "loss": 0.382, "mean_token_accuracy": 0.8819608688354492, "num_tokens": 362575360.0, "step": 9503 }, { "epoch": 1.2090064877242082, "ewc_loss": 0.054958783090114594, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023708782100584358, "grad_norm": 6.339155197143555, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8643976449966431, "num_tokens": 362614737.0, "step": 9504 }, { "epoch": 1.2091336980027987, "ewc_loss": 0.0550004243850708, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023750423861201853, "grad_norm": 6.385622501373291, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8592553734779358, "num_tokens": 362653756.0, "step": 9505 }, { "epoch": 1.2092609082813892, "ewc_loss": 0.05493561923503876, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023685619817115366, "grad_norm": 6.3009185791015625, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8649762868881226, "num_tokens": 362690745.0, "step": 9506 }, { "epoch": 1.2093881185599797, "ewc_loss": 0.05501360446214676, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023763604986015707, "grad_norm": 6.350281238555908, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8731175661087036, "num_tokens": 362726328.0, "step": 9507 }, { "epoch": 1.2095153288385703, "ewc_loss": 0.05494212359189987, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023692123068030924, "grad_norm": 6.304900169372559, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8672724962234497, "num_tokens": 362764285.0, "step": 9508 }, { "epoch": 1.2096425391171606, "ewc_loss": 0.05494215711951256, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023692157992627472, "grad_norm": 6.461305618286133, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8451336622238159, "num_tokens": 362794697.0, "step": 9509 }, { "epoch": 1.209769749395751, "ewc_loss": 0.05490368604660034, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023653685639146715, "grad_norm": 6.327328681945801, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8617801070213318, "num_tokens": 362831618.0, "step": 9510 }, { "epoch": 1.2098969596743416, "ewc_loss": 0.055036626756191254, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023786627571098506, "grad_norm": 6.4715657234191895, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8726829290390015, "num_tokens": 362866404.0, "step": 9511 }, { "epoch": 1.2100241699529322, "ewc_loss": 0.054947271943092346, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023697270080447197, "grad_norm": 6.284635543823242, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.8801617622375488, "num_tokens": 362902554.0, "step": 9512 }, { "epoch": 1.2101513802315227, "ewc_loss": 0.05507161468267441, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023821613285690546, "grad_norm": 6.348212718963623, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8583650588989258, "num_tokens": 362944169.0, "step": 9513 }, { "epoch": 1.2102785905101132, "ewc_loss": 0.05505302548408508, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023803023213986307, "grad_norm": 6.353036403656006, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.843647301197052, "num_tokens": 362983789.0, "step": 9514 }, { "epoch": 1.2104058007887037, "ewc_loss": 0.05498809367418289, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023738091113045812, "grad_norm": 6.371888160705566, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8704203963279724, "num_tokens": 363021551.0, "step": 9515 }, { "epoch": 1.2105330110672943, "ewc_loss": 0.05500609427690506, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002375609619775787, "grad_norm": 6.34039306640625, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.846642255783081, "num_tokens": 363064951.0, "step": 9516 }, { "epoch": 1.2106602213458848, "ewc_loss": 0.05501329153776169, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023763289209455252, "grad_norm": 6.330800533294678, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8631500005722046, "num_tokens": 363110876.0, "step": 9517 }, { "epoch": 1.2107874316244753, "ewc_loss": 0.05504274368286133, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023792742285877466, "grad_norm": 6.327198028564453, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8617511987686157, "num_tokens": 363155310.0, "step": 9518 }, { "epoch": 1.2109146419030659, "ewc_loss": 0.05500340834259987, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002375340845901519, "grad_norm": 6.37258768081665, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8752639293670654, "num_tokens": 363193166.0, "step": 9519 }, { "epoch": 1.2110418521816562, "ewc_loss": 0.055039092898368835, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023789092665538192, "grad_norm": 6.450920104980469, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8446491956710815, "num_tokens": 363225163.0, "step": 9520 }, { "epoch": 1.2111690624602467, "ewc_loss": 0.05500604212284088, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023756042355671525, "grad_norm": 6.470128059387207, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8625301718711853, "num_tokens": 363264720.0, "step": 9521 }, { "epoch": 1.2112962727388372, "ewc_loss": 0.05493086203932762, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023680862796027213, "grad_norm": 6.339151859283447, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.849219560623169, "num_tokens": 363305029.0, "step": 9522 }, { "epoch": 1.2114234830174277, "ewc_loss": 0.05498585104942322, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023735851573292166, "grad_norm": 6.349630832672119, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8709973692893982, "num_tokens": 363344128.0, "step": 9523 }, { "epoch": 1.2115506932960183, "ewc_loss": 0.05491142347455025, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002366142434766516, "grad_norm": 6.351657867431641, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8634131550788879, "num_tokens": 363385903.0, "step": 9524 }, { "epoch": 1.2116779035746088, "ewc_loss": 0.055025532841682434, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023775534646119922, "grad_norm": 6.373415946960449, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8651678562164307, "num_tokens": 363425216.0, "step": 9525 }, { "epoch": 1.2118051138531993, "ewc_loss": 0.05486270785331726, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023612705990672112, "grad_norm": 6.346798419952393, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8746147155761719, "num_tokens": 363462691.0, "step": 9526 }, { "epoch": 1.2119323241317899, "ewc_loss": 0.05500845983624458, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023758459428790957, "grad_norm": 6.466947555541992, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8493369817733765, "num_tokens": 363495849.0, "step": 9527 }, { "epoch": 1.2120595344103804, "ewc_loss": 0.054908573627471924, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000236585721722804, "grad_norm": 6.336573123931885, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8621773719787598, "num_tokens": 363535268.0, "step": 9528 }, { "epoch": 1.212186744688971, "ewc_loss": 0.055018916726112366, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023768914979882538, "grad_norm": 6.418911933898926, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8469905257225037, "num_tokens": 363573523.0, "step": 9529 }, { "epoch": 1.2123139549675614, "ewc_loss": 0.054951563477516174, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023701562895439565, "grad_norm": 6.37688684463501, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8637827038764954, "num_tokens": 363607545.0, "step": 9530 }, { "epoch": 1.212441165246152, "ewc_loss": 0.05500396341085434, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023753962886985391, "grad_norm": 6.346238136291504, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8657139539718628, "num_tokens": 363648456.0, "step": 9531 }, { "epoch": 1.2125683755247425, "ewc_loss": 0.05493047088384628, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023680469894316047, "grad_norm": 6.357194900512695, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8733133673667908, "num_tokens": 363688726.0, "step": 9532 }, { "epoch": 1.212695585803333, "ewc_loss": 0.054958097636699677, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002370809525018558, "grad_norm": 6.427066802978516, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8573469519615173, "num_tokens": 363728557.0, "step": 9533 }, { "epoch": 1.2128227960819233, "ewc_loss": 0.05494072660803795, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023690726084169, "grad_norm": 6.3272199630737305, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8643069267272949, "num_tokens": 363764191.0, "step": 9534 }, { "epoch": 1.2129500063605139, "ewc_loss": 0.05499044805765152, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023740445612929761, "grad_norm": 6.44509744644165, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8521857261657715, "num_tokens": 363798126.0, "step": 9535 }, { "epoch": 1.2130772166391044, "ewc_loss": 0.0549321174621582, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023682118626311421, "grad_norm": 6.349900245666504, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8473714590072632, "num_tokens": 363836037.0, "step": 9536 }, { "epoch": 1.213204426917695, "ewc_loss": 0.05498676747083664, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023736769799143076, "grad_norm": 6.354394912719727, "learning_rate": 1e-06, "loss": 0.3911, "mean_token_accuracy": 0.8856380581855774, "num_tokens": 363872359.0, "step": 9537 }, { "epoch": 1.2133316371962855, "ewc_loss": 0.054963864386081696, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023713863629382104, "grad_norm": 6.3575520515441895, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.858344554901123, "num_tokens": 363914206.0, "step": 9538 }, { "epoch": 1.213458847474876, "ewc_loss": 0.0550101175904274, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023760116891935468, "grad_norm": 6.351347923278809, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8648836612701416, "num_tokens": 363956147.0, "step": 9539 }, { "epoch": 1.2135860577534665, "ewc_loss": 0.054925721138715744, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002367572160437703, "grad_norm": 6.393953800201416, "learning_rate": 1e-06, "loss": 0.3979, "mean_token_accuracy": 0.8803646564483643, "num_tokens": 363986328.0, "step": 9540 }, { "epoch": 1.213713268032057, "ewc_loss": 0.054948411881923676, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000236984109506011, "grad_norm": 6.331252098083496, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.860776424407959, "num_tokens": 364024905.0, "step": 9541 }, { "epoch": 1.2138404783106476, "ewc_loss": 0.05503014475107193, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023780146148055792, "grad_norm": 6.411421775817871, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8648986220359802, "num_tokens": 364068628.0, "step": 9542 }, { "epoch": 1.213967688589238, "ewc_loss": 0.05487487465143204, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023624875757377595, "grad_norm": 6.361931800842285, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8602230548858643, "num_tokens": 364104261.0, "step": 9543 }, { "epoch": 1.2140948988678284, "ewc_loss": 0.055016010999679565, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023766008962411433, "grad_norm": 6.394720077514648, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8603836894035339, "num_tokens": 364147524.0, "step": 9544 }, { "epoch": 1.214222109146419, "ewc_loss": 0.05495268851518631, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023702687758486718, "grad_norm": 6.340231418609619, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8650497198104858, "num_tokens": 364187715.0, "step": 9545 }, { "epoch": 1.2143493194250095, "ewc_loss": 0.054997965693473816, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023747963132336736, "grad_norm": 6.378727436065674, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8741762638092041, "num_tokens": 364226081.0, "step": 9546 }, { "epoch": 1.2144765297036, "ewc_loss": 0.055017173290252686, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023767173115629703, "grad_norm": 6.389359474182129, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8641613125801086, "num_tokens": 364268390.0, "step": 9547 }, { "epoch": 1.2146037399821905, "ewc_loss": 0.055024512112140656, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023774511646479368, "grad_norm": 6.383645534515381, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8661248683929443, "num_tokens": 364310176.0, "step": 9548 }, { "epoch": 1.214730950260781, "ewc_loss": 0.05491362884640694, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023663628962822258, "grad_norm": 6.389795303344727, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8751779794692993, "num_tokens": 364345652.0, "step": 9549 }, { "epoch": 1.2148581605393716, "ewc_loss": 0.05498456954956055, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023734569549560547, "grad_norm": 6.433358192443848, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8597566485404968, "num_tokens": 364383810.0, "step": 9550 }, { "epoch": 1.214985370817962, "ewc_loss": 0.05490882694721222, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023658825375605375, "grad_norm": 6.348334312438965, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.868093729019165, "num_tokens": 364421169.0, "step": 9551 }, { "epoch": 1.2151125810965526, "ewc_loss": 0.05497945100069046, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023729453096166253, "grad_norm": 6.429158687591553, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8581703305244446, "num_tokens": 364452552.0, "step": 9552 }, { "epoch": 1.2152397913751432, "ewc_loss": 0.05487558990716934, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002362559171160683, "grad_norm": 6.404360771179199, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8631751537322998, "num_tokens": 364486779.0, "step": 9553 }, { "epoch": 1.2153670016537337, "ewc_loss": 0.054938506335020065, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023688505461905152, "grad_norm": 6.349274635314941, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8741336464881897, "num_tokens": 364526237.0, "step": 9554 }, { "epoch": 1.2154942119323242, "ewc_loss": 0.05493185296654701, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023681853781454265, "grad_norm": 6.3748064041137695, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.869896650314331, "num_tokens": 364568446.0, "step": 9555 }, { "epoch": 1.2156214222109147, "ewc_loss": 0.05492163449525833, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002367163688177243, "grad_norm": 6.382294178009033, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8704989552497864, "num_tokens": 364604448.0, "step": 9556 }, { "epoch": 1.2157486324895053, "ewc_loss": 0.05505703017115593, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000238070308114402, "grad_norm": 6.395904064178467, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8619282245635986, "num_tokens": 364642707.0, "step": 9557 }, { "epoch": 1.2158758427680956, "ewc_loss": 0.054957035928964615, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023707035870756954, "grad_norm": 6.3926615715026855, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8590447306632996, "num_tokens": 364678428.0, "step": 9558 }, { "epoch": 1.216003053046686, "ewc_loss": 0.05501789227128029, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023767891980241984, "grad_norm": 6.384087085723877, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8789336681365967, "num_tokens": 364718915.0, "step": 9559 }, { "epoch": 1.2161302633252766, "ewc_loss": 0.05506526678800583, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023815265740267932, "grad_norm": 6.456577777862549, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8796544671058655, "num_tokens": 364756030.0, "step": 9560 }, { "epoch": 1.2162574736038672, "ewc_loss": 0.05492035672068596, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023670356313232332, "grad_norm": 6.391458034515381, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8616209030151367, "num_tokens": 364796158.0, "step": 9561 }, { "epoch": 1.2163846838824577, "ewc_loss": 0.055005788803100586, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023755786241963506, "grad_norm": 6.39216947555542, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8715068101882935, "num_tokens": 364832722.0, "step": 9562 }, { "epoch": 1.2165118941610482, "ewc_loss": 0.0549636147916317, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023713614791631699, "grad_norm": 6.400944232940674, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.863625168800354, "num_tokens": 364874120.0, "step": 9563 }, { "epoch": 1.2166391044396387, "ewc_loss": 0.05493248626589775, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023682485334575176, "grad_norm": 6.396157741546631, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.866179883480072, "num_tokens": 364913166.0, "step": 9564 }, { "epoch": 1.2167663147182293, "ewc_loss": 0.05490626394748688, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023656261328142136, "grad_norm": 6.468342304229736, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8463963270187378, "num_tokens": 364949586.0, "step": 9565 }, { "epoch": 1.2168935249968198, "ewc_loss": 0.05491996929049492, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023669969232287258, "grad_norm": 6.496925354003906, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8460544347763062, "num_tokens": 364989070.0, "step": 9566 }, { "epoch": 1.2170207352754103, "ewc_loss": 0.05490952730178833, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023659529688302428, "grad_norm": 6.37960147857666, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8684018850326538, "num_tokens": 365027050.0, "step": 9567 }, { "epoch": 1.2171479455540009, "ewc_loss": 0.05487315356731415, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023623155720997602, "grad_norm": 6.461963653564453, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8633428812026978, "num_tokens": 365060050.0, "step": 9568 }, { "epoch": 1.2172751558325912, "ewc_loss": 0.054918091744184494, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023668092035222799, "grad_norm": 6.458099365234375, "learning_rate": 1e-06, "loss": 0.5476, "mean_token_accuracy": 0.8329436779022217, "num_tokens": 365098030.0, "step": 9569 }, { "epoch": 1.2174023661111817, "ewc_loss": 0.05492802709341049, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023678025172557682, "grad_norm": 6.36546516418457, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8659067153930664, "num_tokens": 365141143.0, "step": 9570 }, { "epoch": 1.2175295763897722, "ewc_loss": 0.05490553006529808, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002365552936680615, "grad_norm": 6.460541248321533, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8416301012039185, "num_tokens": 365174204.0, "step": 9571 }, { "epoch": 1.2176567866683627, "ewc_loss": 0.0549408495426178, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023690849775448442, "grad_norm": 6.371253967285156, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.863513171672821, "num_tokens": 365207170.0, "step": 9572 }, { "epoch": 1.2177839969469533, "ewc_loss": 0.0549631267786026, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002371312875766307, "grad_norm": 6.385193824768066, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8665966987609863, "num_tokens": 365248926.0, "step": 9573 }, { "epoch": 1.2179112072255438, "ewc_loss": 0.0549679771065712, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002371797600062564, "grad_norm": 6.404481410980225, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8668337464332581, "num_tokens": 365283588.0, "step": 9574 }, { "epoch": 1.2180384175041343, "ewc_loss": 0.05496310442686081, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023713105474598706, "grad_norm": 6.342498779296875, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.861059308052063, "num_tokens": 365324869.0, "step": 9575 }, { "epoch": 1.2181656277827249, "ewc_loss": 0.055028803646564484, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023778803006280214, "grad_norm": 6.37226676940918, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8606792688369751, "num_tokens": 365366832.0, "step": 9576 }, { "epoch": 1.2182928380613154, "ewc_loss": 0.05497013032436371, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002372013113927096, "grad_norm": 6.402749061584473, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8606255054473877, "num_tokens": 365401148.0, "step": 9577 }, { "epoch": 1.218420048339906, "ewc_loss": 0.055054254829883575, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002380425576120615, "grad_norm": 6.424431324005127, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8662432432174683, "num_tokens": 365442303.0, "step": 9578 }, { "epoch": 1.2185472586184964, "ewc_loss": 0.05496693402528763, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002371693408349529, "grad_norm": 6.381301403045654, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8757420182228088, "num_tokens": 365477877.0, "step": 9579 }, { "epoch": 1.218674468897087, "ewc_loss": 0.05501130223274231, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023761299962643534, "grad_norm": 6.407932758331299, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8522515892982483, "num_tokens": 365520648.0, "step": 9580 }, { "epoch": 1.2188016791756775, "ewc_loss": 0.054968081414699554, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023718079319223762, "grad_norm": 6.468234062194824, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8454820513725281, "num_tokens": 365555129.0, "step": 9581 }, { "epoch": 1.218928889454268, "ewc_loss": 0.05495548993349075, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023705490457359701, "grad_norm": 6.418150424957275, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8690370917320251, "num_tokens": 365594368.0, "step": 9582 }, { "epoch": 1.2190560997328583, "ewc_loss": 0.054984644055366516, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023734646674711257, "grad_norm": 6.412326335906982, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8634064197540283, "num_tokens": 365634763.0, "step": 9583 }, { "epoch": 1.2191833100114489, "ewc_loss": 0.054862961173057556, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002361296210438013, "grad_norm": 6.359299182891846, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8798124194145203, "num_tokens": 365675951.0, "step": 9584 }, { "epoch": 1.2193105202900394, "ewc_loss": 0.054972559213638306, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023722558398731053, "grad_norm": 6.41586446762085, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8710322380065918, "num_tokens": 365710348.0, "step": 9585 }, { "epoch": 1.21943773056863, "ewc_loss": 0.0549798309803009, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023729832901153713, "grad_norm": 6.432123184204102, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8668699264526367, "num_tokens": 365745166.0, "step": 9586 }, { "epoch": 1.2195649408472204, "ewc_loss": 0.05494621396064758, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002369621506659314, "grad_norm": 6.4324212074279785, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8771795034408569, "num_tokens": 365780122.0, "step": 9587 }, { "epoch": 1.219692151125811, "ewc_loss": 0.05491109937429428, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023661096929572523, "grad_norm": 6.457675933837891, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8626923561096191, "num_tokens": 365816326.0, "step": 9588 }, { "epoch": 1.2198193614044015, "ewc_loss": 0.054928943514823914, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023678944853600115, "grad_norm": 6.4231438636779785, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.8379449248313904, "num_tokens": 365858938.0, "step": 9589 }, { "epoch": 1.219946571682992, "ewc_loss": 0.054914288222789764, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002366428670939058, "grad_norm": 6.451262950897217, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8524259924888611, "num_tokens": 365893892.0, "step": 9590 }, { "epoch": 1.2200737819615826, "ewc_loss": 0.054850563406944275, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002360056241741404, "grad_norm": 6.415273189544678, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8538316488265991, "num_tokens": 365938787.0, "step": 9591 }, { "epoch": 1.220200992240173, "ewc_loss": 0.05492439121007919, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023674393014516681, "grad_norm": 6.463421821594238, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8637481927871704, "num_tokens": 365972664.0, "step": 9592 }, { "epoch": 1.2203282025187634, "ewc_loss": 0.05483861267566681, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002358861529501155, "grad_norm": 6.390589714050293, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8701226115226746, "num_tokens": 366013367.0, "step": 9593 }, { "epoch": 1.220455412797354, "ewc_loss": 0.054905928671360016, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023655928089283407, "grad_norm": 6.506171703338623, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8700157403945923, "num_tokens": 366051637.0, "step": 9594 }, { "epoch": 1.2205826230759445, "ewc_loss": 0.0548412948846817, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023591294302605093, "grad_norm": 6.373742580413818, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8603478074073792, "num_tokens": 366092713.0, "step": 9595 }, { "epoch": 1.220709833354535, "ewc_loss": 0.05495942384004593, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002370942384004593, "grad_norm": 6.476861000061035, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8639678359031677, "num_tokens": 366131594.0, "step": 9596 }, { "epoch": 1.2208370436331255, "ewc_loss": 0.05484064295887947, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023590642376802862, "grad_norm": 6.399991989135742, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8588876724243164, "num_tokens": 366170871.0, "step": 9597 }, { "epoch": 1.220964253911716, "ewc_loss": 0.05494684353470802, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023696843709331006, "grad_norm": 6.41247034072876, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8528736233711243, "num_tokens": 366212974.0, "step": 9598 }, { "epoch": 1.2210914641903066, "ewc_loss": 0.054903171956539154, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023653173411730677, "grad_norm": 6.4442877769470215, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8553829789161682, "num_tokens": 366255904.0, "step": 9599 }, { "epoch": 1.221218674468897, "ewc_loss": 0.054965317249298096, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023715315910521895, "grad_norm": 6.398303508758545, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8685369491577148, "num_tokens": 366288887.0, "step": 9600 }, { "epoch": 1.2213458847474876, "ewc_loss": 0.054923154413700104, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023673151736147702, "grad_norm": 6.396271705627441, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8771371841430664, "num_tokens": 366328796.0, "step": 9601 }, { "epoch": 1.2214730950260781, "ewc_loss": 0.0549553781747818, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023705378407612443, "grad_norm": 6.381298542022705, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8609292507171631, "num_tokens": 366373093.0, "step": 9602 }, { "epoch": 1.2216003053046687, "ewc_loss": 0.05493283271789551, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023682833125349134, "grad_norm": 6.412696838378906, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.872422456741333, "num_tokens": 366414353.0, "step": 9603 }, { "epoch": 1.2217275155832592, "ewc_loss": 0.054960791021585464, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023710790264885873, "grad_norm": 6.435763835906982, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8771635293960571, "num_tokens": 366450683.0, "step": 9604 }, { "epoch": 1.2218547258618497, "ewc_loss": 0.05489998310804367, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023649985087104142, "grad_norm": 6.386372089385986, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8684544563293457, "num_tokens": 366487158.0, "step": 9605 }, { "epoch": 1.2219819361404403, "ewc_loss": 0.05508788675069809, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002383788669249043, "grad_norm": 6.432679176330566, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8645623326301575, "num_tokens": 366529984.0, "step": 9606 }, { "epoch": 1.2221091464190306, "ewc_loss": 0.05498743802309036, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002373743918724358, "grad_norm": 6.430395126342773, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8673198223114014, "num_tokens": 366564100.0, "step": 9607 }, { "epoch": 1.222236356697621, "ewc_loss": 0.05500330030918121, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023753299319650978, "grad_norm": 6.4217209815979, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8706711530685425, "num_tokens": 366603752.0, "step": 9608 }, { "epoch": 1.2223635669762116, "ewc_loss": 0.05502288416028023, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023772883287165314, "grad_norm": 6.363307476043701, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8658479452133179, "num_tokens": 366642170.0, "step": 9609 }, { "epoch": 1.2224907772548022, "ewc_loss": 0.055045947432518005, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023795949527993798, "grad_norm": 6.440277576446533, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8493797183036804, "num_tokens": 366684038.0, "step": 9610 }, { "epoch": 1.2226179875333927, "ewc_loss": 0.055078890174627304, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002382888924330473, "grad_norm": 6.411852836608887, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8604997992515564, "num_tokens": 366717832.0, "step": 9611 }, { "epoch": 1.2227451978119832, "ewc_loss": 0.05512535572052002, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023875353508628905, "grad_norm": 6.437093257904053, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8576064109802246, "num_tokens": 366757999.0, "step": 9612 }, { "epoch": 1.2228724080905737, "ewc_loss": 0.05510875582695007, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023858757049310952, "grad_norm": 6.4131951332092285, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8730639815330505, "num_tokens": 366798533.0, "step": 9613 }, { "epoch": 1.2229996183691643, "ewc_loss": 0.05521347373723984, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023963474086485803, "grad_norm": 6.4268317222595215, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8618826270103455, "num_tokens": 366843066.0, "step": 9614 }, { "epoch": 1.2231268286477548, "ewc_loss": 0.05506712570786476, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023817125475034118, "grad_norm": 6.380283832550049, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8794944286346436, "num_tokens": 366883190.0, "step": 9615 }, { "epoch": 1.2232540389263453, "ewc_loss": 0.05517011135816574, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023920113744679838, "grad_norm": 6.486340522766113, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8674481511116028, "num_tokens": 366920642.0, "step": 9616 }, { "epoch": 1.2233812492049359, "ewc_loss": 0.05504177510738373, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023791777493897825, "grad_norm": 6.403450012207031, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8645449876785278, "num_tokens": 366954468.0, "step": 9617 }, { "epoch": 1.2235084594835262, "ewc_loss": 0.05512743443250656, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002387743443250656, "grad_norm": 6.488929748535156, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8476244211196899, "num_tokens": 366991170.0, "step": 9618 }, { "epoch": 1.2236356697621167, "ewc_loss": 0.05508718639612198, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023837188200559467, "grad_norm": 6.384047508239746, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8691333532333374, "num_tokens": 367031833.0, "step": 9619 }, { "epoch": 1.2237628800407072, "ewc_loss": 0.055097565054893494, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023847565171308815, "grad_norm": 6.44600772857666, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8588578701019287, "num_tokens": 367074709.0, "step": 9620 }, { "epoch": 1.2238900903192977, "ewc_loss": 0.05504956841468811, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023799565678928047, "grad_norm": 6.427292346954346, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8615742921829224, "num_tokens": 367106820.0, "step": 9621 }, { "epoch": 1.2240173005978883, "ewc_loss": 0.05513725429773331, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002388725260971114, "grad_norm": 6.429145336151123, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8781554698944092, "num_tokens": 367142298.0, "step": 9622 }, { "epoch": 1.2241445108764788, "ewc_loss": 0.05505603924393654, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023806038370821625, "grad_norm": 6.418158531188965, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.853941023349762, "num_tokens": 367179957.0, "step": 9623 }, { "epoch": 1.2242717211550693, "ewc_loss": 0.05506131798028946, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023811319260858, "grad_norm": 6.37677001953125, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8660454750061035, "num_tokens": 367226800.0, "step": 9624 }, { "epoch": 1.2243989314336599, "ewc_loss": 0.055120013654232025, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002387001586612314, "grad_norm": 6.4408721923828125, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8684756755828857, "num_tokens": 367257168.0, "step": 9625 }, { "epoch": 1.2245261417122504, "ewc_loss": 0.05512901395559311, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023879014770500362, "grad_norm": 6.4345479011535645, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8655226826667786, "num_tokens": 367295254.0, "step": 9626 }, { "epoch": 1.224653351990841, "ewc_loss": 0.05517091602087021, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023920917010400444, "grad_norm": 6.406749248504639, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.852019190788269, "num_tokens": 367335814.0, "step": 9627 }, { "epoch": 1.2247805622694314, "ewc_loss": 0.05512574315071106, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023875744955148548, "grad_norm": 6.408323764801025, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8608449101448059, "num_tokens": 367377592.0, "step": 9628 }, { "epoch": 1.224907772548022, "ewc_loss": 0.05510127544403076, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023851277364883572, "grad_norm": 6.4735612869262695, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8547635078430176, "num_tokens": 367408069.0, "step": 9629 }, { "epoch": 1.2250349828266125, "ewc_loss": 0.055143486708402634, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023893486650194973, "grad_norm": 6.429880619049072, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.864890456199646, "num_tokens": 367444787.0, "step": 9630 }, { "epoch": 1.225162193105203, "ewc_loss": 0.05515553802251816, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002390553563600406, "grad_norm": 6.437496185302734, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8496250510215759, "num_tokens": 367486848.0, "step": 9631 }, { "epoch": 1.2252894033837933, "ewc_loss": 0.055069927126169205, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023819926718715578, "grad_norm": 6.4060959815979, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8740814924240112, "num_tokens": 367530456.0, "step": 9632 }, { "epoch": 1.2254166136623839, "ewc_loss": 0.05506405979394913, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023814060841687024, "grad_norm": 6.412961006164551, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8677183985710144, "num_tokens": 367566513.0, "step": 9633 }, { "epoch": 1.2255438239409744, "ewc_loss": 0.05507165193557739, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023821654031053185, "grad_norm": 6.450489521026611, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8425559401512146, "num_tokens": 367607757.0, "step": 9634 }, { "epoch": 1.225671034219565, "ewc_loss": 0.05505475029349327, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023804750526323915, "grad_norm": 6.401331424713135, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8667422533035278, "num_tokens": 367645525.0, "step": 9635 }, { "epoch": 1.2257982444981554, "ewc_loss": 0.055087439715862274, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023837438493501395, "grad_norm": 6.440439224243164, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8731849193572998, "num_tokens": 367680840.0, "step": 9636 }, { "epoch": 1.225925454776746, "ewc_loss": 0.055083200335502625, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023833200975786895, "grad_norm": 6.444583415985107, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8721853494644165, "num_tokens": 367717444.0, "step": 9637 }, { "epoch": 1.2260526650553365, "ewc_loss": 0.05507485568523407, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002382485690759495, "grad_norm": 6.420788288116455, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8439013361930847, "num_tokens": 367756548.0, "step": 9638 }, { "epoch": 1.226179875333927, "ewc_loss": 0.055112797766923904, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002386279811616987, "grad_norm": 6.407842636108398, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8584628105163574, "num_tokens": 367798341.0, "step": 9639 }, { "epoch": 1.2263070856125176, "ewc_loss": 0.05508562922477722, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023835626780055463, "grad_norm": 6.410439491271973, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8710423707962036, "num_tokens": 367835087.0, "step": 9640 }, { "epoch": 1.226434295891108, "ewc_loss": 0.055147938430309296, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023897936625871807, "grad_norm": 6.434402942657471, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8658843636512756, "num_tokens": 367873816.0, "step": 9641 }, { "epoch": 1.2265615061696984, "ewc_loss": 0.05511976778507233, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023869768483564258, "grad_norm": 6.414012908935547, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8587782382965088, "num_tokens": 367913621.0, "step": 9642 }, { "epoch": 1.226688716448289, "ewc_loss": 0.05512941628694534, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023879414948169142, "grad_norm": 6.4947733879089355, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8543227910995483, "num_tokens": 367946881.0, "step": 9643 }, { "epoch": 1.2268159267268794, "ewc_loss": 0.05512738972902298, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023877389321569353, "grad_norm": 6.429317951202393, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8671011924743652, "num_tokens": 367980806.0, "step": 9644 }, { "epoch": 1.22694313700547, "ewc_loss": 0.055153995752334595, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023903993132989854, "grad_norm": 6.437366008758545, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8712510466575623, "num_tokens": 368016041.0, "step": 9645 }, { "epoch": 1.2270703472840605, "ewc_loss": 0.05509290099143982, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023842898372095078, "grad_norm": 6.398902416229248, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8616400957107544, "num_tokens": 368052331.0, "step": 9646 }, { "epoch": 1.227197557562651, "ewc_loss": 0.0551363043487072, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023886305280029774, "grad_norm": 6.455018043518066, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8717168569564819, "num_tokens": 368091458.0, "step": 9647 }, { "epoch": 1.2273247678412416, "ewc_loss": 0.05508367717266083, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023833678278606385, "grad_norm": 6.391439437866211, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8524487614631653, "num_tokens": 368132433.0, "step": 9648 }, { "epoch": 1.227451978119832, "ewc_loss": 0.05505269020795822, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000238026914303191, "grad_norm": 6.395968914031982, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.864134669303894, "num_tokens": 368171030.0, "step": 9649 }, { "epoch": 1.2275791883984226, "ewc_loss": 0.055174484848976135, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002392448514001444, "grad_norm": 6.468290328979492, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8517128229141235, "num_tokens": 368211555.0, "step": 9650 }, { "epoch": 1.2277063986770131, "ewc_loss": 0.05506249517202377, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023812495055608451, "grad_norm": 6.43931245803833, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8699224591255188, "num_tokens": 368252076.0, "step": 9651 }, { "epoch": 1.2278336089556037, "ewc_loss": 0.055138759315013885, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023888761643320322, "grad_norm": 6.423470497131348, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8556805849075317, "num_tokens": 368291265.0, "step": 9652 }, { "epoch": 1.2279608192341942, "ewc_loss": 0.05514279752969742, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002389279834460467, "grad_norm": 6.481629371643066, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.856444776058197, "num_tokens": 368332178.0, "step": 9653 }, { "epoch": 1.2280880295127847, "ewc_loss": 0.05509261041879654, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023842608788982034, "grad_norm": 6.382518291473389, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8614585399627686, "num_tokens": 368374103.0, "step": 9654 }, { "epoch": 1.2282152397913753, "ewc_loss": 0.05516091734170914, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002391091693425551, "grad_norm": 6.487318515777588, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8662332892417908, "num_tokens": 368411928.0, "step": 9655 }, { "epoch": 1.2283424500699656, "ewc_loss": 0.055062804371118546, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023812805011402816, "grad_norm": 6.468042850494385, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.838326632976532, "num_tokens": 368449200.0, "step": 9656 }, { "epoch": 1.228469660348556, "ewc_loss": 0.05509239807724953, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023842397786211222, "grad_norm": 6.472422122955322, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8459571599960327, "num_tokens": 368489935.0, "step": 9657 }, { "epoch": 1.2285968706271466, "ewc_loss": 0.05499580129981041, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000237458007177338, "grad_norm": 6.421785831451416, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.875994086265564, "num_tokens": 368522227.0, "step": 9658 }, { "epoch": 1.2287240809057371, "ewc_loss": 0.05510324239730835, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023853243328630924, "grad_norm": 6.4633893966674805, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8634729385375977, "num_tokens": 368555084.0, "step": 9659 }, { "epoch": 1.2288512911843277, "ewc_loss": 0.05505433678627014, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023804335796739906, "grad_norm": 6.432985305786133, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8595966100692749, "num_tokens": 368591845.0, "step": 9660 }, { "epoch": 1.2289785014629182, "ewc_loss": 0.05509256571531296, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023842568043619394, "grad_norm": 6.4231486320495605, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8535110354423523, "num_tokens": 368637841.0, "step": 9661 }, { "epoch": 1.2291057117415087, "ewc_loss": 0.05501836538314819, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023768364917486906, "grad_norm": 6.4305009841918945, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8789648413658142, "num_tokens": 368677470.0, "step": 9662 }, { "epoch": 1.2292329220200993, "ewc_loss": 0.055057067424058914, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002380706719122827, "grad_norm": 6.404026985168457, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8773170113563538, "num_tokens": 368718412.0, "step": 9663 }, { "epoch": 1.2293601322986898, "ewc_loss": 0.05508556216955185, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023835559841245413, "grad_norm": 6.460278034210205, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8519821763038635, "num_tokens": 368759267.0, "step": 9664 }, { "epoch": 1.2294873425772803, "ewc_loss": 0.055100247263908386, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023850248544476926, "grad_norm": 6.45735502243042, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.85333251953125, "num_tokens": 368797586.0, "step": 9665 }, { "epoch": 1.2296145528558708, "ewc_loss": 0.05506562814116478, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002381562808295712, "grad_norm": 6.44580602645874, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8548640012741089, "num_tokens": 368833801.0, "step": 9666 }, { "epoch": 1.2297417631344612, "ewc_loss": 0.055077988654375076, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023827988479752094, "grad_norm": 6.42829704284668, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8472887277603149, "num_tokens": 368878513.0, "step": 9667 }, { "epoch": 1.2298689734130517, "ewc_loss": 0.05510937422513962, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002385937696089968, "grad_norm": 6.532331943511963, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8553454875946045, "num_tokens": 368917815.0, "step": 9668 }, { "epoch": 1.2299961836916422, "ewc_loss": 0.05511303246021271, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023863030946813524, "grad_norm": 6.519779682159424, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.853249728679657, "num_tokens": 368954971.0, "step": 9669 }, { "epoch": 1.2301233939702327, "ewc_loss": 0.05500473827123642, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023754738504067063, "grad_norm": 6.3458356857299805, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8691434860229492, "num_tokens": 368996212.0, "step": 9670 }, { "epoch": 1.2302506042488233, "ewc_loss": 0.05510400980710983, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023854011669754982, "grad_norm": 6.479215621948242, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8763538599014282, "num_tokens": 369032186.0, "step": 9671 }, { "epoch": 1.2303778145274138, "ewc_loss": 0.05514519661664963, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023895196500234306, "grad_norm": 6.459802150726318, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8682264089584351, "num_tokens": 369077565.0, "step": 9672 }, { "epoch": 1.2305050248060043, "ewc_loss": 0.055109355598688126, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023859355133026838, "grad_norm": 6.51769495010376, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8698453903198242, "num_tokens": 369111935.0, "step": 9673 }, { "epoch": 1.2306322350845948, "ewc_loss": 0.0549844354391098, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023734435671940446, "grad_norm": 6.422422409057617, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8595052361488342, "num_tokens": 369154374.0, "step": 9674 }, { "epoch": 1.2307594453631854, "ewc_loss": 0.0551220178604126, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023872019664850086, "grad_norm": 6.501133918762207, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8544768691062927, "num_tokens": 369198557.0, "step": 9675 }, { "epoch": 1.230886655641776, "ewc_loss": 0.054933786392211914, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002368378482060507, "grad_norm": 6.424278736114502, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8460005521774292, "num_tokens": 369237756.0, "step": 9676 }, { "epoch": 1.2310138659203664, "ewc_loss": 0.055076055228710175, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002382605307502672, "grad_norm": 6.753290176391602, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8497296571731567, "num_tokens": 369270155.0, "step": 9677 }, { "epoch": 1.231141076198957, "ewc_loss": 0.05489096790552139, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023640968720428646, "grad_norm": 6.3584699630737305, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8570406436920166, "num_tokens": 369310870.0, "step": 9678 }, { "epoch": 1.2312682864775475, "ewc_loss": 0.055077195167541504, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023827195400372148, "grad_norm": 6.573646068572998, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8464328050613403, "num_tokens": 369344679.0, "step": 9679 }, { "epoch": 1.231395496756138, "ewc_loss": 0.05493313819169998, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002368313871556893, "grad_norm": 6.410242080688477, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8627172112464905, "num_tokens": 369380388.0, "step": 9680 }, { "epoch": 1.2315227070347283, "ewc_loss": 0.05502394586801529, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023773945576976985, "grad_norm": 6.4652419090271, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8735857009887695, "num_tokens": 369422327.0, "step": 9681 }, { "epoch": 1.2316499173133189, "ewc_loss": 0.05500517040491104, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000237551677855663, "grad_norm": 6.47383451461792, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8687982559204102, "num_tokens": 369457160.0, "step": 9682 }, { "epoch": 1.2317771275919094, "ewc_loss": 0.05502551048994064, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023775509907864034, "grad_norm": 6.425307750701904, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8630744218826294, "num_tokens": 369497883.0, "step": 9683 }, { "epoch": 1.2319043378705, "ewc_loss": 0.05501525476574898, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023765255173202604, "grad_norm": 6.403995513916016, "learning_rate": 1e-06, "loss": 0.3651, "mean_token_accuracy": 0.8928225040435791, "num_tokens": 369540014.0, "step": 9684 }, { "epoch": 1.2320315481490904, "ewc_loss": 0.05504726991057396, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023797270841896534, "grad_norm": 6.468104362487793, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8629215359687805, "num_tokens": 369580424.0, "step": 9685 }, { "epoch": 1.232158758427681, "ewc_loss": 0.05503137409687042, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023781372874509543, "grad_norm": 6.489501953125, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8591949343681335, "num_tokens": 369623615.0, "step": 9686 }, { "epoch": 1.2322859687062715, "ewc_loss": 0.054971467703580856, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023721468460280448, "grad_norm": 6.413852691650391, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8504180312156677, "num_tokens": 369658408.0, "step": 9687 }, { "epoch": 1.232413178984862, "ewc_loss": 0.055034391582012177, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023784389486536384, "grad_norm": 6.504759788513184, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8590602874755859, "num_tokens": 369696311.0, "step": 9688 }, { "epoch": 1.2325403892634526, "ewc_loss": 0.05510019510984421, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023850197612773627, "grad_norm": 6.494410991668701, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8505669236183167, "num_tokens": 369737473.0, "step": 9689 }, { "epoch": 1.232667599542043, "ewc_loss": 0.05505139380693436, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023801391944289207, "grad_norm": 6.483002185821533, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8605732917785645, "num_tokens": 369770151.0, "step": 9690 }, { "epoch": 1.2327948098206334, "ewc_loss": 0.05508222430944443, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023832221631892025, "grad_norm": 6.476646900177002, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8810397982597351, "num_tokens": 369813503.0, "step": 9691 }, { "epoch": 1.232922020099224, "ewc_loss": 0.0550188273191452, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023768824758008122, "grad_norm": 6.398514270782471, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8641231060028076, "num_tokens": 369848768.0, "step": 9692 }, { "epoch": 1.2330492303778144, "ewc_loss": 0.05518146604299545, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023931465693749487, "grad_norm": 6.774903774261475, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8658588528633118, "num_tokens": 369892510.0, "step": 9693 }, { "epoch": 1.233176440656405, "ewc_loss": 0.05490165948867798, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023651658557355404, "grad_norm": 6.413568019866943, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8702318668365479, "num_tokens": 369929638.0, "step": 9694 }, { "epoch": 1.2333036509349955, "ewc_loss": 0.05517720431089401, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023927204892970622, "grad_norm": 6.727204322814941, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8836055994033813, "num_tokens": 369971201.0, "step": 9695 }, { "epoch": 1.233430861213586, "ewc_loss": 0.054823022335767746, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023573022917844355, "grad_norm": 6.343116760253906, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8580164313316345, "num_tokens": 370011713.0, "step": 9696 }, { "epoch": 1.2335580714921766, "ewc_loss": 0.05522356182336807, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023973561474122107, "grad_norm": 6.9812445640563965, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8414719104766846, "num_tokens": 370042647.0, "step": 9697 }, { "epoch": 1.233685281770767, "ewc_loss": 0.054949045181274414, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023699045414105058, "grad_norm": 6.2787017822265625, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8611488938331604, "num_tokens": 370083976.0, "step": 9698 }, { "epoch": 1.2338124920493576, "ewc_loss": 0.05543535202741623, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024185352958738804, "grad_norm": 7.170551300048828, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8599426746368408, "num_tokens": 370126435.0, "step": 9699 }, { "epoch": 1.2339397023279481, "ewc_loss": 0.05525089055299759, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024000892881304026, "grad_norm": 6.328073501586914, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.871801495552063, "num_tokens": 370165038.0, "step": 9700 }, { "epoch": 1.2340669126065387, "ewc_loss": 0.05553304776549339, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024283048696815968, "grad_norm": 6.646290302276611, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8544389009475708, "num_tokens": 370203438.0, "step": 9701 }, { "epoch": 1.2341941228851292, "ewc_loss": 0.055257219821214676, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002400722005404532, "grad_norm": 6.693172931671143, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8773722648620605, "num_tokens": 370235144.0, "step": 9702 }, { "epoch": 1.2343213331637197, "ewc_loss": 0.0551779605448246, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023927960137370974, "grad_norm": 6.42970609664917, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8582168817520142, "num_tokens": 370278936.0, "step": 9703 }, { "epoch": 1.2344485434423103, "ewc_loss": 0.055210746824741364, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023960745602380484, "grad_norm": 6.552018642425537, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.866752028465271, "num_tokens": 370312595.0, "step": 9704 }, { "epoch": 1.2345757537209006, "ewc_loss": 0.05503598600625992, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023785985831636935, "grad_norm": 6.51334810256958, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8602721691131592, "num_tokens": 370351697.0, "step": 9705 }, { "epoch": 1.234702963999491, "ewc_loss": 0.05507705360651016, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023827055701985955, "grad_norm": 6.429398536682129, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8771985769271851, "num_tokens": 370391264.0, "step": 9706 }, { "epoch": 1.2348301742780816, "ewc_loss": 0.05509825795888901, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023848256387282163, "grad_norm": 6.466335296630859, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8710604310035706, "num_tokens": 370425248.0, "step": 9707 }, { "epoch": 1.2349573845566721, "ewc_loss": 0.055103104561567307, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023853105085436255, "grad_norm": 6.551255702972412, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8427557945251465, "num_tokens": 370462670.0, "step": 9708 }, { "epoch": 1.2350845948352627, "ewc_loss": 0.055030256509780884, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002378025819780305, "grad_norm": 6.4105329513549805, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8437751531600952, "num_tokens": 370506126.0, "step": 9709 }, { "epoch": 1.2352118051138532, "ewc_loss": 0.05513904243707657, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002388904249528423, "grad_norm": 6.443882465362549, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8725336790084839, "num_tokens": 370546895.0, "step": 9710 }, { "epoch": 1.2353390153924437, "ewc_loss": 0.05505567789077759, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023805678938515484, "grad_norm": 6.402404308319092, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8667418360710144, "num_tokens": 370590645.0, "step": 9711 }, { "epoch": 1.2354662256710343, "ewc_loss": 0.055204153060913086, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023954150674398988, "grad_norm": 6.486572265625, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8660017251968384, "num_tokens": 370628614.0, "step": 9712 }, { "epoch": 1.2355934359496248, "ewc_loss": 0.05513106286525726, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023881063680164516, "grad_norm": 6.467518329620361, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.865032434463501, "num_tokens": 370666082.0, "step": 9713 }, { "epoch": 1.2357206462282153, "ewc_loss": 0.05521026998758316, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023960272665135562, "grad_norm": 6.479020118713379, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8482670783996582, "num_tokens": 370705780.0, "step": 9714 }, { "epoch": 1.2358478565068058, "ewc_loss": 0.055114537477493286, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023864535614848137, "grad_norm": 6.424837589263916, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8679631948471069, "num_tokens": 370742550.0, "step": 9715 }, { "epoch": 1.2359750667853961, "ewc_loss": 0.05519115552306175, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023941155814100057, "grad_norm": 6.470602512359619, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8667229413986206, "num_tokens": 370784406.0, "step": 9716 }, { "epoch": 1.2361022770639867, "ewc_loss": 0.05522362142801285, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002397361968178302, "grad_norm": 6.443571090698242, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8592970371246338, "num_tokens": 370820911.0, "step": 9717 }, { "epoch": 1.2362294873425772, "ewc_loss": 0.05520786717534065, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023957867233548313, "grad_norm": 6.459909915924072, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8740426898002625, "num_tokens": 370857427.0, "step": 9718 }, { "epoch": 1.2363566976211677, "ewc_loss": 0.05522219091653824, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023972190683707595, "grad_norm": 6.408350467681885, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8533050417900085, "num_tokens": 370897989.0, "step": 9719 }, { "epoch": 1.2364839078997583, "ewc_loss": 0.055243268609046936, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023993270588107407, "grad_norm": 6.50166654586792, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8502791523933411, "num_tokens": 370932672.0, "step": 9720 }, { "epoch": 1.2366111181783488, "ewc_loss": 0.05522792413830757, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023977924138307571, "grad_norm": 6.454080104827881, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8537789583206177, "num_tokens": 370971027.0, "step": 9721 }, { "epoch": 1.2367383284569393, "ewc_loss": 0.055358026176691055, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024108026991598308, "grad_norm": 6.376946926116943, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.873343825340271, "num_tokens": 371014742.0, "step": 9722 }, { "epoch": 1.2368655387355298, "ewc_loss": 0.05531427264213562, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000240642752032727, "grad_norm": 6.475098609924316, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8473855257034302, "num_tokens": 371052065.0, "step": 9723 }, { "epoch": 1.2369927490141204, "ewc_loss": 0.055348534137010574, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024098533322103322, "grad_norm": 6.420226573944092, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8418065309524536, "num_tokens": 371093897.0, "step": 9724 }, { "epoch": 1.237119959292711, "ewc_loss": 0.05537565052509308, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024125647905748338, "grad_norm": 6.444880485534668, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8616492748260498, "num_tokens": 371132390.0, "step": 9725 }, { "epoch": 1.2372471695713014, "ewc_loss": 0.05541940778493881, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002416940697003156, "grad_norm": 6.4521379470825195, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8738460540771484, "num_tokens": 371170468.0, "step": 9726 }, { "epoch": 1.237374379849892, "ewc_loss": 0.05538364499807358, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024133644183166325, "grad_norm": 6.420973300933838, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8599033355712891, "num_tokens": 371209323.0, "step": 9727 }, { "epoch": 1.2375015901284825, "ewc_loss": 0.055420536547899246, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002417053619865328, "grad_norm": 6.45262336730957, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8628799915313721, "num_tokens": 371246526.0, "step": 9728 }, { "epoch": 1.237628800407073, "ewc_loss": 0.05539855360984802, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024148551165126264, "grad_norm": 6.435266017913818, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8663461804389954, "num_tokens": 371282850.0, "step": 9729 }, { "epoch": 1.2377560106856633, "ewc_loss": 0.055415429174900055, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002416543138679117, "grad_norm": 6.481492519378662, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.872504711151123, "num_tokens": 371312614.0, "step": 9730 }, { "epoch": 1.2378832209642538, "ewc_loss": 0.055357739329338074, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002410774031886831, "grad_norm": 6.4458465576171875, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8647600412368774, "num_tokens": 371350170.0, "step": 9731 }, { "epoch": 1.2380104312428444, "ewc_loss": 0.05540041998028755, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024150419631041586, "grad_norm": 6.447884559631348, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8698309063911438, "num_tokens": 371389295.0, "step": 9732 }, { "epoch": 1.238137641521435, "ewc_loss": 0.05538519471883774, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002413519541732967, "grad_norm": 6.47946834564209, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8632738590240479, "num_tokens": 371430579.0, "step": 9733 }, { "epoch": 1.2382648518000254, "ewc_loss": 0.05529925599694252, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024049256171565503, "grad_norm": 6.493478298187256, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.853496253490448, "num_tokens": 371465087.0, "step": 9734 }, { "epoch": 1.238392062078616, "ewc_loss": 0.05535674840211868, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002410675078863278, "grad_norm": 6.473363876342773, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8711740374565125, "num_tokens": 371503549.0, "step": 9735 }, { "epoch": 1.2385192723572065, "ewc_loss": 0.055297575891017914, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024047575425356627, "grad_norm": 6.410086154937744, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8829711675643921, "num_tokens": 371542953.0, "step": 9736 }, { "epoch": 1.238646482635797, "ewc_loss": 0.05535785108804703, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024107850913424045, "grad_norm": 6.489360809326172, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8673410415649414, "num_tokens": 371582541.0, "step": 9737 }, { "epoch": 1.2387736929143875, "ewc_loss": 0.05529892444610596, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024048924387898296, "grad_norm": 6.4609856605529785, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8729788064956665, "num_tokens": 371617221.0, "step": 9738 }, { "epoch": 1.238900903192978, "ewc_loss": 0.05536819249391556, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000241181900491938, "grad_norm": 6.47820520401001, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8669037818908691, "num_tokens": 371660773.0, "step": 9739 }, { "epoch": 1.2390281134715684, "ewc_loss": 0.05523459240794182, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002398459182586521, "grad_norm": 6.42529296875, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8576644659042358, "num_tokens": 371694763.0, "step": 9740 }, { "epoch": 1.239155323750159, "ewc_loss": 0.05530749261379242, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024057492555584759, "grad_norm": 6.509673118591309, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8622881770133972, "num_tokens": 371734793.0, "step": 9741 }, { "epoch": 1.2392825340287494, "ewc_loss": 0.055290911346673965, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024040912103373557, "grad_norm": 6.4770097732543945, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8584468364715576, "num_tokens": 371775691.0, "step": 9742 }, { "epoch": 1.23940974430734, "ewc_loss": 0.055349815636873245, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002409981534583494, "grad_norm": 6.455944538116455, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8552384972572327, "num_tokens": 371817319.0, "step": 9743 }, { "epoch": 1.2395369545859305, "ewc_loss": 0.05527304485440254, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024023045261856169, "grad_norm": 6.51568603515625, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.862005889415741, "num_tokens": 371847431.0, "step": 9744 }, { "epoch": 1.239664164864521, "ewc_loss": 0.055176716297864914, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023926715948618948, "grad_norm": 6.433347702026367, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8755754232406616, "num_tokens": 371881788.0, "step": 9745 }, { "epoch": 1.2397913751431116, "ewc_loss": 0.055338822305202484, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024088824284262955, "grad_norm": 6.499377727508545, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.858359694480896, "num_tokens": 371917388.0, "step": 9746 }, { "epoch": 1.239918585421702, "ewc_loss": 0.05524829775094986, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002399829973001033, "grad_norm": 6.431890964508057, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8726105093955994, "num_tokens": 371957561.0, "step": 9747 }, { "epoch": 1.2400457957002926, "ewc_loss": 0.05532792583107948, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002407792635494843, "grad_norm": 6.546321392059326, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8561602830886841, "num_tokens": 371987322.0, "step": 9748 }, { "epoch": 1.2401730059788831, "ewc_loss": 0.055212002247571945, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023962001432664692, "grad_norm": 6.400834560394287, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8651649355888367, "num_tokens": 372027263.0, "step": 9749 }, { "epoch": 1.2403002162574737, "ewc_loss": 0.05538420379161835, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024134202976711094, "grad_norm": 6.473628044128418, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8673295378684998, "num_tokens": 372061441.0, "step": 9750 }, { "epoch": 1.2404274265360642, "ewc_loss": 0.05529139190912247, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024041393771767616, "grad_norm": 6.408335208892822, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8690537214279175, "num_tokens": 372099085.0, "step": 9751 }, { "epoch": 1.2405546368146547, "ewc_loss": 0.05541269853711128, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024162698537111282, "grad_norm": 6.541849136352539, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8653934597969055, "num_tokens": 372134106.0, "step": 9752 }, { "epoch": 1.2406818470932452, "ewc_loss": 0.055357497185468674, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024107497301883996, "grad_norm": 6.477118015289307, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8719674944877625, "num_tokens": 372164635.0, "step": 9753 }, { "epoch": 1.2408090573718356, "ewc_loss": 0.05536159873008728, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024111600941978395, "grad_norm": 6.404112339019775, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8693199157714844, "num_tokens": 372204924.0, "step": 9754 }, { "epoch": 1.240936267650426, "ewc_loss": 0.05539122223854065, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024141221365425736, "grad_norm": 6.58256721496582, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8637180328369141, "num_tokens": 372240939.0, "step": 9755 }, { "epoch": 1.2410634779290166, "ewc_loss": 0.05533991754055023, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024089917133096606, "grad_norm": 6.4620184898376465, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8584948778152466, "num_tokens": 372272355.0, "step": 9756 }, { "epoch": 1.2411906882076071, "ewc_loss": 0.05537661537528038, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024126615608111024, "grad_norm": 6.401969909667969, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8612862229347229, "num_tokens": 372309566.0, "step": 9757 }, { "epoch": 1.2413178984861977, "ewc_loss": 0.055367667227983475, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000241176676354371, "grad_norm": 6.4233903884887695, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8573935031890869, "num_tokens": 372347768.0, "step": 9758 }, { "epoch": 1.2414451087647882, "ewc_loss": 0.05539138987660408, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024141390167642385, "grad_norm": 6.493595600128174, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8714529275894165, "num_tokens": 372381020.0, "step": 9759 }, { "epoch": 1.2415723190433787, "ewc_loss": 0.05540426820516586, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002415426861261949, "grad_norm": 6.462802886962891, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8520122170448303, "num_tokens": 372422434.0, "step": 9760 }, { "epoch": 1.2416995293219693, "ewc_loss": 0.05537434294819832, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024124342598952353, "grad_norm": 6.4640398025512695, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.855281412601471, "num_tokens": 372459342.0, "step": 9761 }, { "epoch": 1.2418267396005598, "ewc_loss": 0.055340833961963654, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024090833903755993, "grad_norm": 6.4261155128479, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8662706613540649, "num_tokens": 372496637.0, "step": 9762 }, { "epoch": 1.2419539498791503, "ewc_loss": 0.05542664974927902, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024176652368623763, "grad_norm": 6.4309539794921875, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8761287331581116, "num_tokens": 372532068.0, "step": 9763 }, { "epoch": 1.2420811601577408, "ewc_loss": 0.05534707009792328, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024097067944239825, "grad_norm": 6.434706211090088, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8721330165863037, "num_tokens": 372565460.0, "step": 9764 }, { "epoch": 1.2422083704363311, "ewc_loss": 0.05544501915574074, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024195018340833485, "grad_norm": 6.44415807723999, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8671982288360596, "num_tokens": 372605998.0, "step": 9765 }, { "epoch": 1.2423355807149217, "ewc_loss": 0.05533391609787941, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024083915923256427, "grad_norm": 6.439010143280029, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8531789779663086, "num_tokens": 372642038.0, "step": 9766 }, { "epoch": 1.2424627909935122, "ewc_loss": 0.055387504398822784, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002413750480627641, "grad_norm": 6.408257484436035, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8586751222610474, "num_tokens": 372678192.0, "step": 9767 }, { "epoch": 1.2425900012721027, "ewc_loss": 0.05540095269680023, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024150953686330467, "grad_norm": 6.474456787109375, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.85939621925354, "num_tokens": 372712867.0, "step": 9768 }, { "epoch": 1.2427172115506933, "ewc_loss": 0.055344924330711365, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024094924447126687, "grad_norm": 6.502166271209717, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.846416711807251, "num_tokens": 372745790.0, "step": 9769 }, { "epoch": 1.2428444218292838, "ewc_loss": 0.05537498742341995, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024124990159180015, "grad_norm": 6.47086763381958, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8729659914970398, "num_tokens": 372777634.0, "step": 9770 }, { "epoch": 1.2429716321078743, "ewc_loss": 0.05529477447271347, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024044772726483643, "grad_norm": 6.381338119506836, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8481018543243408, "num_tokens": 372820527.0, "step": 9771 }, { "epoch": 1.2430988423864648, "ewc_loss": 0.055323339998722076, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024073338136076927, "grad_norm": 6.441667556762695, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8736355900764465, "num_tokens": 372858479.0, "step": 9772 }, { "epoch": 1.2432260526650554, "ewc_loss": 0.055291276425123215, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024041275901254267, "grad_norm": 6.40609073638916, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8691507577896118, "num_tokens": 372895219.0, "step": 9773 }, { "epoch": 1.243353262943646, "ewc_loss": 0.0553717277944088, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024121727619785815, "grad_norm": 6.4183502197265625, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8754305839538574, "num_tokens": 372934491.0, "step": 9774 }, { "epoch": 1.2434804732222364, "ewc_loss": 0.055261820554733276, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002401182136964053, "grad_norm": 6.457305431365967, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8544621467590332, "num_tokens": 372972892.0, "step": 9775 }, { "epoch": 1.243607683500827, "ewc_loss": 0.05534828454256058, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024098283029161394, "grad_norm": 6.460018634796143, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8670955300331116, "num_tokens": 373011564.0, "step": 9776 }, { "epoch": 1.2437348937794175, "ewc_loss": 0.055280901491642, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024030901840887964, "grad_norm": 6.48091983795166, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8658030033111572, "num_tokens": 373049444.0, "step": 9777 }, { "epoch": 1.243862104058008, "ewc_loss": 0.055311813950538635, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024061811564024538, "grad_norm": 6.506881237030029, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8631347417831421, "num_tokens": 373085466.0, "step": 9778 }, { "epoch": 1.2439893143365983, "ewc_loss": 0.05522293224930763, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002397293137619272, "grad_norm": 6.477173328399658, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.8400776386260986, "num_tokens": 373124712.0, "step": 9779 }, { "epoch": 1.2441165246151888, "ewc_loss": 0.055274803191423416, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024024803133215755, "grad_norm": 6.493474960327148, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8685780167579651, "num_tokens": 373160008.0, "step": 9780 }, { "epoch": 1.2442437348937794, "ewc_loss": 0.0551895946264267, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023939594393596053, "grad_norm": 6.528628349304199, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8640503287315369, "num_tokens": 373192110.0, "step": 9781 }, { "epoch": 1.24437094517237, "ewc_loss": 0.055127739906311035, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023877738567534834, "grad_norm": 6.41918420791626, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8648263812065125, "num_tokens": 373234432.0, "step": 9782 }, { "epoch": 1.2444981554509604, "ewc_loss": 0.05522705987095833, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023977059754543006, "grad_norm": 6.5099568367004395, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8531879186630249, "num_tokens": 373270725.0, "step": 9783 }, { "epoch": 1.244625365729551, "ewc_loss": 0.05522078275680542, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023970783513505012, "grad_norm": 6.506477355957031, "learning_rate": 1e-06, "loss": 0.5394, "mean_token_accuracy": 0.8415569067001343, "num_tokens": 373307531.0, "step": 9784 }, { "epoch": 1.2447525760081415, "ewc_loss": 0.05520375072956085, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023953749041538686, "grad_norm": 6.46058464050293, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8678150177001953, "num_tokens": 373351657.0, "step": 9785 }, { "epoch": 1.244879786286732, "ewc_loss": 0.05520722270011902, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023957221128512174, "grad_norm": 6.488779067993164, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8683922290802002, "num_tokens": 373388811.0, "step": 9786 }, { "epoch": 1.2450069965653225, "ewc_loss": 0.055191945284605026, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023941944527905434, "grad_norm": 6.429408550262451, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8734480142593384, "num_tokens": 373423378.0, "step": 9787 }, { "epoch": 1.245134206843913, "ewc_loss": 0.05521109700202942, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023961096303537488, "grad_norm": 6.437554359436035, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8640532493591309, "num_tokens": 373460982.0, "step": 9788 }, { "epoch": 1.2452614171225034, "ewc_loss": 0.055241212248802185, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002399121003691107, "grad_norm": 6.474215507507324, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8702986240386963, "num_tokens": 373501474.0, "step": 9789 }, { "epoch": 1.245388627401094, "ewc_loss": 0.0551891066133976, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023939106904435903, "grad_norm": 6.467901706695557, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8676787614822388, "num_tokens": 373537830.0, "step": 9790 }, { "epoch": 1.2455158376796844, "ewc_loss": 0.055199794471263885, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002394979674136266, "grad_norm": 6.463197231292725, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8561330437660217, "num_tokens": 373573975.0, "step": 9791 }, { "epoch": 1.245643047958275, "ewc_loss": 0.05522198975086212, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002397198841208592, "grad_norm": 6.4674482345581055, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8424826264381409, "num_tokens": 373612157.0, "step": 9792 }, { "epoch": 1.2457702582368655, "ewc_loss": 0.05523676797747612, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023986768792383373, "grad_norm": 6.435002326965332, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8824365139007568, "num_tokens": 373645065.0, "step": 9793 }, { "epoch": 1.245897468515456, "ewc_loss": 0.05530789494514465, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024057892733253539, "grad_norm": 6.4375152587890625, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8578332662582397, "num_tokens": 373687658.0, "step": 9794 }, { "epoch": 1.2460246787940465, "ewc_loss": 0.05529491603374481, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024044913880061358, "grad_norm": 6.427379608154297, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.865494966506958, "num_tokens": 373732262.0, "step": 9795 }, { "epoch": 1.246151889072637, "ewc_loss": 0.05537714064121246, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024127140932250768, "grad_norm": 6.50324010848999, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8441397547721863, "num_tokens": 373772410.0, "step": 9796 }, { "epoch": 1.2462790993512276, "ewc_loss": 0.05521038919687271, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023960387625265867, "grad_norm": 6.414519309997559, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.863594114780426, "num_tokens": 373808773.0, "step": 9797 }, { "epoch": 1.2464063096298181, "ewc_loss": 0.05538160353899002, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024131604004651308, "grad_norm": 6.511429786682129, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8597005605697632, "num_tokens": 373848945.0, "step": 9798 }, { "epoch": 1.2465335199084087, "ewc_loss": 0.05530665069818497, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024056649999693036, "grad_norm": 6.47755241394043, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8582003116607666, "num_tokens": 373892716.0, "step": 9799 }, { "epoch": 1.2466607301869992, "ewc_loss": 0.05527323856949806, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024023238802328706, "grad_norm": 6.430894374847412, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8797590732574463, "num_tokens": 373929443.0, "step": 9800 }, { "epoch": 1.2467879404655897, "ewc_loss": 0.055257413536310196, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024007413594517857, "grad_norm": 6.510020732879639, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8647663593292236, "num_tokens": 373968882.0, "step": 9801 }, { "epoch": 1.2469151507441802, "ewc_loss": 0.05525166541337967, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002400166413281113, "grad_norm": 6.432750701904297, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8767480850219727, "num_tokens": 374009265.0, "step": 9802 }, { "epoch": 1.2470423610227706, "ewc_loss": 0.05528583377599716, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002403583494015038, "grad_norm": 6.505712032318115, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8480110168457031, "num_tokens": 374047152.0, "step": 9803 }, { "epoch": 1.247169571301361, "ewc_loss": 0.055254627019166946, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024004626902751625, "grad_norm": 6.497344493865967, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8591963052749634, "num_tokens": 374083127.0, "step": 9804 }, { "epoch": 1.2472967815799516, "ewc_loss": 0.05521862208843231, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023968619643710554, "grad_norm": 6.404719352722168, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8723621368408203, "num_tokens": 374127086.0, "step": 9805 }, { "epoch": 1.2474239918585421, "ewc_loss": 0.05531734228134155, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024067341291811317, "grad_norm": 6.517861843109131, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.857921302318573, "num_tokens": 374163879.0, "step": 9806 }, { "epoch": 1.2475512021371327, "ewc_loss": 0.055207572877407074, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023957570374477655, "grad_norm": 6.433631896972656, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8679237365722656, "num_tokens": 374206639.0, "step": 9807 }, { "epoch": 1.2476784124157232, "ewc_loss": 0.055203087627887726, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023953086929395795, "grad_norm": 6.472338676452637, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8606714606285095, "num_tokens": 374244106.0, "step": 9808 }, { "epoch": 1.2478056226943137, "ewc_loss": 0.05524080619215965, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023990805493667722, "grad_norm": 6.437314987182617, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8607186079025269, "num_tokens": 374285776.0, "step": 9809 }, { "epoch": 1.2479328329729042, "ewc_loss": 0.055247411131858826, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023997409152798355, "grad_norm": 6.474464416503906, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8723446726799011, "num_tokens": 374323036.0, "step": 9810 }, { "epoch": 1.2480600432514948, "ewc_loss": 0.05522594600915909, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023975945077836514, "grad_norm": 6.43264627456665, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8696836233139038, "num_tokens": 374360204.0, "step": 9811 }, { "epoch": 1.2481872535300853, "ewc_loss": 0.05523674935102463, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023986748419702053, "grad_norm": 6.5098981857299805, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8734736442565918, "num_tokens": 374394961.0, "step": 9812 }, { "epoch": 1.2483144638086758, "ewc_loss": 0.05526772141456604, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024017719260882586, "grad_norm": 6.427328109741211, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8658587336540222, "num_tokens": 374436142.0, "step": 9813 }, { "epoch": 1.2484416740872661, "ewc_loss": 0.055287592113018036, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024037591356318444, "grad_norm": 6.47618293762207, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8594251871109009, "num_tokens": 374474263.0, "step": 9814 }, { "epoch": 1.2485688843658567, "ewc_loss": 0.0552719309926033, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024021930585149676, "grad_norm": 6.501720428466797, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8586263656616211, "num_tokens": 374510250.0, "step": 9815 }, { "epoch": 1.2486960946444472, "ewc_loss": 0.05528394877910614, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024033946101553738, "grad_norm": 6.455989837646484, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.881967306137085, "num_tokens": 374549579.0, "step": 9816 }, { "epoch": 1.2488233049230377, "ewc_loss": 0.0553702712059021, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024120269517879933, "grad_norm": 6.468408584594727, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8615782260894775, "num_tokens": 374588631.0, "step": 9817 }, { "epoch": 1.2489505152016283, "ewc_loss": 0.055269017815589905, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024019017291720957, "grad_norm": 6.4754133224487305, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8587591648101807, "num_tokens": 374627092.0, "step": 9818 }, { "epoch": 1.2490777254802188, "ewc_loss": 0.055386416614055634, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024136417778208852, "grad_norm": 6.53612756729126, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8700208067893982, "num_tokens": 374657797.0, "step": 9819 }, { "epoch": 1.2492049357588093, "ewc_loss": 0.05526798218488693, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024017984105739743, "grad_norm": 6.4444732666015625, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8680152297019958, "num_tokens": 374694011.0, "step": 9820 }, { "epoch": 1.2493321460373998, "ewc_loss": 0.05544189363718033, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024191894044633955, "grad_norm": 6.532413482666016, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8662588596343994, "num_tokens": 374735127.0, "step": 9821 }, { "epoch": 1.2494593563159904, "ewc_loss": 0.055237285792827606, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023987283930182457, "grad_norm": 6.435934066772461, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8611342906951904, "num_tokens": 374774170.0, "step": 9822 }, { "epoch": 1.249586566594581, "ewc_loss": 0.055407412350177765, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024157414736691862, "grad_norm": 6.567610740661621, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8646934628486633, "num_tokens": 374809162.0, "step": 9823 }, { "epoch": 1.2497137768731714, "ewc_loss": 0.05524274706840515, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023992749629542232, "grad_norm": 6.446444988250732, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8552902936935425, "num_tokens": 374847974.0, "step": 9824 }, { "epoch": 1.249840987151762, "ewc_loss": 0.055357642471790314, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024107639910653234, "grad_norm": 6.422325611114502, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8789441585540771, "num_tokens": 374894911.0, "step": 9825 }, { "epoch": 1.2499681974303525, "ewc_loss": 0.05539286136627197, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002414285991108045, "grad_norm": 6.602059364318848, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8586385250091553, "num_tokens": 374933534.0, "step": 9826 }, { "epoch": 1.250095407708943, "ewc_loss": 0.055331505835056305, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024081507581286132, "grad_norm": 6.434334754943848, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8506170511245728, "num_tokens": 374968183.0, "step": 9827 }, { "epoch": 1.2502226179875333, "ewc_loss": 0.05547550320625305, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024225504603236914, "grad_norm": 6.507551193237305, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8576444983482361, "num_tokens": 375005973.0, "step": 9828 }, { "epoch": 1.2503498282661238, "ewc_loss": 0.055318333208560944, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024068330822046846, "grad_norm": 6.472782611846924, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8569605350494385, "num_tokens": 375043446.0, "step": 9829 }, { "epoch": 1.2504770385447144, "ewc_loss": 0.05547378212213516, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024223780201282352, "grad_norm": 6.490257740020752, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8665655255317688, "num_tokens": 375081518.0, "step": 9830 }, { "epoch": 1.250604248823305, "ewc_loss": 0.0553518608212471, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024101862800307572, "grad_norm": 6.509465217590332, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8568247556686401, "num_tokens": 375117767.0, "step": 9831 }, { "epoch": 1.2507314591018954, "ewc_loss": 0.0553886815905571, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024138682056218386, "grad_norm": 6.524421215057373, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8387011289596558, "num_tokens": 375152061.0, "step": 9832 }, { "epoch": 1.250858669380486, "ewc_loss": 0.05537758022546768, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024127580400090665, "grad_norm": 6.420269966125488, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8562096357345581, "num_tokens": 375195677.0, "step": 9833 }, { "epoch": 1.2509858796590765, "ewc_loss": 0.055449821054935455, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024199823383241892, "grad_norm": 6.560299873352051, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8558661937713623, "num_tokens": 375230319.0, "step": 9834 }, { "epoch": 1.251113089937667, "ewc_loss": 0.05536922812461853, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024119227600749582, "grad_norm": 6.384515762329102, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8678576946258545, "num_tokens": 375268970.0, "step": 9835 }, { "epoch": 1.2512403002162575, "ewc_loss": 0.055489979684352875, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024239980848506093, "grad_norm": 6.514870643615723, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8697422742843628, "num_tokens": 375303661.0, "step": 9836 }, { "epoch": 1.2513675104948478, "ewc_loss": 0.05534598231315613, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024095980916172266, "grad_norm": 6.427767753601074, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8707135319709778, "num_tokens": 375343278.0, "step": 9837 }, { "epoch": 1.2514947207734384, "ewc_loss": 0.05547965690493584, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024229656264651567, "grad_norm": 6.489011287689209, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.870022177696228, "num_tokens": 375378574.0, "step": 9838 }, { "epoch": 1.251621931052029, "ewc_loss": 0.055474814027547836, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002422481484245509, "grad_norm": 7.2676100730896, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8580484390258789, "num_tokens": 375412781.0, "step": 9839 }, { "epoch": 1.2517491413306194, "ewc_loss": 0.05535513162612915, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002410513407085091, "grad_norm": 6.365684986114502, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.875038206577301, "num_tokens": 375450431.0, "step": 9840 }, { "epoch": 1.25187635160921, "ewc_loss": 0.05550351366400719, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024253514129668474, "grad_norm": 6.551374912261963, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8567563891410828, "num_tokens": 375490692.0, "step": 9841 }, { "epoch": 1.2520035618878005, "ewc_loss": 0.05517105013132095, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023921052343212068, "grad_norm": 6.357603073120117, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8679594993591309, "num_tokens": 375535278.0, "step": 9842 }, { "epoch": 1.252130772166391, "ewc_loss": 0.055529817938804626, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002427981817163527, "grad_norm": 6.558678150177002, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8566117286682129, "num_tokens": 375575859.0, "step": 9843 }, { "epoch": 1.2522579824449815, "ewc_loss": 0.055328018963336945, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024078019487205893, "grad_norm": 6.432018280029297, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8563021421432495, "num_tokens": 375611084.0, "step": 9844 }, { "epoch": 1.252385192723572, "ewc_loss": 0.05545498803257942, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002420498785795644, "grad_norm": 6.596659183502197, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8733944892883301, "num_tokens": 375644926.0, "step": 9845 }, { "epoch": 1.2525124030021626, "ewc_loss": 0.05534861981868744, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002409861917840317, "grad_norm": 6.443844318389893, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8712210655212402, "num_tokens": 375686970.0, "step": 9846 }, { "epoch": 1.2526396132807531, "ewc_loss": 0.05540591478347778, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002415591588942334, "grad_norm": 6.495680809020996, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8756726384162903, "num_tokens": 375727372.0, "step": 9847 }, { "epoch": 1.2527668235593437, "ewc_loss": 0.05532171204686165, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024071712687145919, "grad_norm": 7.292117118835449, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8515338897705078, "num_tokens": 375763568.0, "step": 9848 }, { "epoch": 1.2528940338379342, "ewc_loss": 0.05527248978614807, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024022489378694445, "grad_norm": 6.363112926483154, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8470386862754822, "num_tokens": 375797954.0, "step": 9849 }, { "epoch": 1.2530212441165247, "ewc_loss": 0.05548948794603348, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024239488993771374, "grad_norm": 6.623711585998535, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.861193060874939, "num_tokens": 375836362.0, "step": 9850 }, { "epoch": 1.2531484543951152, "ewc_loss": 0.055099967867136, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002384996769251302, "grad_norm": 6.3467793464660645, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8590555787086487, "num_tokens": 375879403.0, "step": 9851 }, { "epoch": 1.2532756646737058, "ewc_loss": 0.055574432015419006, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024324429978150874, "grad_norm": 6.555063247680664, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8655111789703369, "num_tokens": 375916728.0, "step": 9852 }, { "epoch": 1.253402874952296, "ewc_loss": 0.05532379075884819, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002407379070064053, "grad_norm": 6.497832775115967, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8578934073448181, "num_tokens": 375951743.0, "step": 9853 }, { "epoch": 1.2535300852308866, "ewc_loss": 0.055354565382003784, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002410456509096548, "grad_norm": 6.559662818908691, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.837834358215332, "num_tokens": 375988872.0, "step": 9854 }, { "epoch": 1.2536572955094771, "ewc_loss": 0.05530912056565285, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024059120914898813, "grad_norm": 6.446816921234131, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8523620367050171, "num_tokens": 376030388.0, "step": 9855 }, { "epoch": 1.2537845057880677, "ewc_loss": 0.05544079467654228, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024190795375034213, "grad_norm": 6.459749698638916, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8731709718704224, "num_tokens": 376062975.0, "step": 9856 }, { "epoch": 1.2539117160666582, "ewc_loss": 0.055402837693691254, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002415283815935254, "grad_norm": 6.454240322113037, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8559347987174988, "num_tokens": 376101873.0, "step": 9857 }, { "epoch": 1.2540389263452487, "ewc_loss": 0.05538617819547653, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002413618058199063, "grad_norm": 6.5406389236450195, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8411821722984314, "num_tokens": 376138774.0, "step": 9858 }, { "epoch": 1.2541661366238392, "ewc_loss": 0.05545925348997116, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024209254479501396, "grad_norm": 6.529302597045898, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8548951148986816, "num_tokens": 376172357.0, "step": 9859 }, { "epoch": 1.2542933469024298, "ewc_loss": 0.055349767208099365, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024099767324514687, "grad_norm": 6.475632190704346, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8532766103744507, "num_tokens": 376212977.0, "step": 9860 }, { "epoch": 1.2544205571810203, "ewc_loss": 0.055414535105228424, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024164536444004625, "grad_norm": 6.478423595428467, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8527551889419556, "num_tokens": 376256989.0, "step": 9861 }, { "epoch": 1.2545477674596106, "ewc_loss": 0.055388517677783966, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024138519074767828, "grad_norm": 6.532221794128418, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8675763607025146, "num_tokens": 376300130.0, "step": 9862 }, { "epoch": 1.2546749777382011, "ewc_loss": 0.055340781807899475, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024090780061669648, "grad_norm": 6.4252238273620605, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8665489554405212, "num_tokens": 376337475.0, "step": 9863 }, { "epoch": 1.2548021880167917, "ewc_loss": 0.05540004372596741, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002415004128124565, "grad_norm": 6.4760637283325195, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8538708090782166, "num_tokens": 376376485.0, "step": 9864 }, { "epoch": 1.2549293982953822, "ewc_loss": 0.055351436138153076, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002410143642919138, "grad_norm": 6.491803169250488, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8683764934539795, "num_tokens": 376413393.0, "step": 9865 }, { "epoch": 1.2550566085739727, "ewc_loss": 0.05529884994029999, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024048851628322154, "grad_norm": 6.44590425491333, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.852774977684021, "num_tokens": 376448192.0, "step": 9866 }, { "epoch": 1.2551838188525632, "ewc_loss": 0.0554543174803257, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024204317014664412, "grad_norm": 6.542722702026367, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.865121603012085, "num_tokens": 376489382.0, "step": 9867 }, { "epoch": 1.2553110291311538, "ewc_loss": 0.05534471571445465, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024094713444355875, "grad_norm": 6.453952789306641, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8669947385787964, "num_tokens": 376527221.0, "step": 9868 }, { "epoch": 1.2554382394097443, "ewc_loss": 0.055441588163375854, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024191585544031113, "grad_norm": 6.616876602172852, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8592503070831299, "num_tokens": 376567045.0, "step": 9869 }, { "epoch": 1.2555654496883348, "ewc_loss": 0.055197980254888535, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002394798066234216, "grad_norm": 6.361963272094727, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8700991272926331, "num_tokens": 376607039.0, "step": 9870 }, { "epoch": 1.2556926599669254, "ewc_loss": 0.055515311658382416, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002426531136734411, "grad_norm": 6.684870719909668, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8653788566589355, "num_tokens": 376643220.0, "step": 9871 }, { "epoch": 1.255819870245516, "ewc_loss": 0.05521819740533829, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023968199093360454, "grad_norm": 6.356846332550049, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8818882703781128, "num_tokens": 376681417.0, "step": 9872 }, { "epoch": 1.2559470805241064, "ewc_loss": 0.0555562749505043, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002430627355352044, "grad_norm": 6.703619480133057, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8813700079917908, "num_tokens": 376722836.0, "step": 9873 }, { "epoch": 1.256074290802697, "ewc_loss": 0.055198170244693756, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00023948171292431653, "grad_norm": 6.379009246826172, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.855840802192688, "num_tokens": 376763194.0, "step": 9874 }, { "epoch": 1.2562015010812875, "ewc_loss": 0.05549997091293335, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024249973648693413, "grad_norm": 6.63226842880249, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8589754700660706, "num_tokens": 376803701.0, "step": 9875 }, { "epoch": 1.256328711359878, "ewc_loss": 0.05530389025807381, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024053889501374215, "grad_norm": 6.448309898376465, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8712400794029236, "num_tokens": 376835296.0, "step": 9876 }, { "epoch": 1.2564559216384683, "ewc_loss": 0.055372513830661774, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024122516333591193, "grad_norm": 6.492832660675049, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8656998872756958, "num_tokens": 376876317.0, "step": 9877 }, { "epoch": 1.2565831319170588, "ewc_loss": 0.055384837090969086, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024134834529832006, "grad_norm": 6.570416450500488, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8620975613594055, "num_tokens": 376916403.0, "step": 9878 }, { "epoch": 1.2567103421956494, "ewc_loss": 0.05532878637313843, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024078784917946905, "grad_norm": 6.431764602661133, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8728551864624023, "num_tokens": 376956079.0, "step": 9879 }, { "epoch": 1.25683755247424, "ewc_loss": 0.05540042743086815, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024150428362190723, "grad_norm": 6.606250286102295, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8539848923683167, "num_tokens": 376991618.0, "step": 9880 }, { "epoch": 1.2569647627528304, "ewc_loss": 0.055252350866794586, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002400235243840143, "grad_norm": 6.437809944152832, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8686021566390991, "num_tokens": 377033964.0, "step": 9881 }, { "epoch": 1.257091973031421, "ewc_loss": 0.055382341146469116, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024132341786753386, "grad_norm": 6.484837532043457, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8708866834640503, "num_tokens": 377070091.0, "step": 9882 }, { "epoch": 1.2572191833100115, "ewc_loss": 0.05533697456121445, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024086971825454384, "grad_norm": 6.507392883300781, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.871162474155426, "num_tokens": 377111185.0, "step": 9883 }, { "epoch": 1.257346393588602, "ewc_loss": 0.055404797196388245, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024154799757525325, "grad_norm": 6.485567569732666, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8679465651512146, "num_tokens": 377147637.0, "step": 9884 }, { "epoch": 1.2574736038671925, "ewc_loss": 0.05533721670508385, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002408721629763022, "grad_norm": 6.446245193481445, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.862848162651062, "num_tokens": 377181918.0, "step": 9885 }, { "epoch": 1.2576008141457828, "ewc_loss": 0.05540570616722107, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002415570488665253, "grad_norm": 6.526898384094238, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8572278022766113, "num_tokens": 377222900.0, "step": 9886 }, { "epoch": 1.2577280244243734, "ewc_loss": 0.055425453931093216, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024175453290808946, "grad_norm": 6.4359564781188965, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8774106502532959, "num_tokens": 377263107.0, "step": 9887 }, { "epoch": 1.257855234702964, "ewc_loss": 0.055515509098768234, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024265509273391217, "grad_norm": 6.535653114318848, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8783576488494873, "num_tokens": 377301384.0, "step": 9888 }, { "epoch": 1.2579824449815544, "ewc_loss": 0.05542292073369026, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002417292125755921, "grad_norm": 6.507286071777344, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8566920161247253, "num_tokens": 377339865.0, "step": 9889 }, { "epoch": 1.258109655260145, "ewc_loss": 0.0554194450378418, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024169444805011153, "grad_norm": 6.472839832305908, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8722785711288452, "num_tokens": 377381582.0, "step": 9890 }, { "epoch": 1.2582368655387355, "ewc_loss": 0.055474914610385895, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024224913795478642, "grad_norm": 6.512482166290283, "learning_rate": 1e-06, "loss": 0.5298, "mean_token_accuracy": 0.8405612707138062, "num_tokens": 377425548.0, "step": 9891 }, { "epoch": 1.258364075817326, "ewc_loss": 0.055423758924007416, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002417375799268484, "grad_norm": 6.483428001403809, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8620012402534485, "num_tokens": 377462298.0, "step": 9892 }, { "epoch": 1.2584912860959165, "ewc_loss": 0.05546005070209503, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002421005192445591, "grad_norm": 6.504159450531006, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8571180105209351, "num_tokens": 377503873.0, "step": 9893 }, { "epoch": 1.258618496374507, "ewc_loss": 0.0554826557636261, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002423265832476318, "grad_norm": 6.493587493896484, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8779536485671997, "num_tokens": 377540497.0, "step": 9894 }, { "epoch": 1.2587457066530976, "ewc_loss": 0.05542461574077606, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024174618010874838, "grad_norm": 6.492591381072998, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.864305853843689, "num_tokens": 377582242.0, "step": 9895 }, { "epoch": 1.2588729169316881, "ewc_loss": 0.055512987077236176, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024262984516099095, "grad_norm": 6.512325763702393, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8781747817993164, "num_tokens": 377614972.0, "step": 9896 }, { "epoch": 1.2590001272102787, "ewc_loss": 0.05543893575668335, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024188935640268028, "grad_norm": 6.50330114364624, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8607275485992432, "num_tokens": 377645392.0, "step": 9897 }, { "epoch": 1.2591273374888692, "ewc_loss": 0.05557304993271828, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024323049001395702, "grad_norm": 6.544248104095459, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8717098236083984, "num_tokens": 377680317.0, "step": 9898 }, { "epoch": 1.2592545477674597, "ewc_loss": 0.05547897890210152, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024228979600593448, "grad_norm": 6.497802734375, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8636190891265869, "num_tokens": 377714879.0, "step": 9899 }, { "epoch": 1.2593817580460502, "ewc_loss": 0.05553314834833145, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024283149105031043, "grad_norm": 6.512887001037598, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8494489192962646, "num_tokens": 377751404.0, "step": 9900 }, { "epoch": 1.2595089683246408, "ewc_loss": 0.05557151138782501, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002432150940876454, "grad_norm": 6.520977020263672, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8683012127876282, "num_tokens": 377788354.0, "step": 9901 }, { "epoch": 1.259636178603231, "ewc_loss": 0.05556207150220871, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024312072491738945, "grad_norm": 6.480711936950684, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.852310299873352, "num_tokens": 377827259.0, "step": 9902 }, { "epoch": 1.2597633888818216, "ewc_loss": 0.055566899478435516, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024316899362020195, "grad_norm": 6.4565510749816895, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8560824394226074, "num_tokens": 377872477.0, "step": 9903 }, { "epoch": 1.2598905991604121, "ewc_loss": 0.05560547113418579, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002435546921333298, "grad_norm": 6.525681495666504, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8551939725875854, "num_tokens": 377911159.0, "step": 9904 }, { "epoch": 1.2600178094390027, "ewc_loss": 0.055574290454387665, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024324293190147728, "grad_norm": 7.321803092956543, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8654271960258484, "num_tokens": 377951919.0, "step": 9905 }, { "epoch": 1.2601450197175932, "ewc_loss": 0.05559636652469635, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024346366990357637, "grad_norm": 6.366241455078125, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.846070408821106, "num_tokens": 377997567.0, "step": 9906 }, { "epoch": 1.2602722299961837, "ewc_loss": 0.05578335002064705, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024533350369893014, "grad_norm": 6.648382186889648, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8623014092445374, "num_tokens": 378033811.0, "step": 9907 }, { "epoch": 1.2603994402747742, "ewc_loss": 0.05542299523949623, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024172995472326875, "grad_norm": 6.44980001449585, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8509052991867065, "num_tokens": 378070136.0, "step": 9908 }, { "epoch": 1.2605266505533648, "ewc_loss": 0.05575159564614296, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002450159518048167, "grad_norm": 6.5892534255981445, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8606802225112915, "num_tokens": 378106902.0, "step": 9909 }, { "epoch": 1.2606538608319553, "ewc_loss": 0.05554389953613281, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002429390006000176, "grad_norm": 6.4029927253723145, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8661475777626038, "num_tokens": 378144638.0, "step": 9910 }, { "epoch": 1.2607810711105456, "ewc_loss": 0.055718906223773956, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024468908668495715, "grad_norm": 6.5180253982543945, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8723412752151489, "num_tokens": 378179059.0, "step": 9911 }, { "epoch": 1.2609082813891361, "ewc_loss": 0.055694110691547394, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002444410929456353, "grad_norm": 6.550306797027588, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8499914407730103, "num_tokens": 378210346.0, "step": 9912 }, { "epoch": 1.2610354916677267, "ewc_loss": 0.05564114451408386, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000243911417783238, "grad_norm": 6.493053436279297, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8673833608627319, "num_tokens": 378242687.0, "step": 9913 }, { "epoch": 1.2611627019463172, "ewc_loss": 0.05576220154762268, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024512200616300106, "grad_norm": 6.498435974121094, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8686439394950867, "num_tokens": 378279421.0, "step": 9914 }, { "epoch": 1.2612899122249077, "ewc_loss": 0.055655136704444885, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024405134900007397, "grad_norm": 6.41448974609375, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8592209815979004, "num_tokens": 378318133.0, "step": 9915 }, { "epoch": 1.2614171225034982, "ewc_loss": 0.0557544082403183, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024504406610503793, "grad_norm": 6.465902328491211, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8454549312591553, "num_tokens": 378362987.0, "step": 9916 }, { "epoch": 1.2615443327820888, "ewc_loss": 0.055600862950086594, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002435086207697168, "grad_norm": 6.455149173736572, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8806741237640381, "num_tokens": 378396197.0, "step": 9917 }, { "epoch": 1.2616715430606793, "ewc_loss": 0.055758148431777954, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024508146452717483, "grad_norm": 6.471632957458496, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8706954717636108, "num_tokens": 378433691.0, "step": 9918 }, { "epoch": 1.2617987533392698, "ewc_loss": 0.05569145083427429, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024441449204459786, "grad_norm": 6.431743144989014, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8669267892837524, "num_tokens": 378470277.0, "step": 9919 }, { "epoch": 1.2619259636178604, "ewc_loss": 0.055689193308353424, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002443919365759939, "grad_norm": 6.423607349395752, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8570554256439209, "num_tokens": 378510500.0, "step": 9920 }, { "epoch": 1.2620531738964509, "ewc_loss": 0.05571562051773071, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024465619935654104, "grad_norm": 6.476255416870117, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8604800701141357, "num_tokens": 378552893.0, "step": 9921 }, { "epoch": 1.2621803841750414, "ewc_loss": 0.055720504373311996, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024470503558404744, "grad_norm": 6.442230701446533, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8697408437728882, "num_tokens": 378590917.0, "step": 9922 }, { "epoch": 1.262307594453632, "ewc_loss": 0.05567707121372223, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002442707191221416, "grad_norm": 6.389595985412598, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8718032836914062, "num_tokens": 378637764.0, "step": 9923 }, { "epoch": 1.2624348047322225, "ewc_loss": 0.05572609603404999, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002447609440423548, "grad_norm": 6.478745460510254, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8595724105834961, "num_tokens": 378675497.0, "step": 9924 }, { "epoch": 1.262562015010813, "ewc_loss": 0.055682629346847534, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024432630743831396, "grad_norm": 6.476421356201172, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8540523648262024, "num_tokens": 378710470.0, "step": 9925 }, { "epoch": 1.2626892252894033, "ewc_loss": 0.05558574199676514, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002433573972666636, "grad_norm": 6.349721908569336, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8592928051948547, "num_tokens": 378758116.0, "step": 9926 }, { "epoch": 1.2628164355679938, "ewc_loss": 0.05571380630135536, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002446380676701665, "grad_norm": 6.450516700744629, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8591117262840271, "num_tokens": 378798166.0, "step": 9927 }, { "epoch": 1.2629436458465844, "ewc_loss": 0.055577173829078674, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024327174469362944, "grad_norm": 6.3474626541137695, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8784819841384888, "num_tokens": 378839899.0, "step": 9928 }, { "epoch": 1.263070856125175, "ewc_loss": 0.05574779212474823, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024497791309840977, "grad_norm": 6.45581579208374, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8638848066329956, "num_tokens": 378886022.0, "step": 9929 }, { "epoch": 1.2631980664037654, "ewc_loss": 0.05563797056674957, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002438797237118706, "grad_norm": 6.387653827667236, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8522814512252808, "num_tokens": 378925097.0, "step": 9930 }, { "epoch": 1.263325276682356, "ewc_loss": 0.05580113083124161, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002455113281030208, "grad_norm": 6.444072246551514, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8748904466629028, "num_tokens": 378965242.0, "step": 9931 }, { "epoch": 1.2634524869609465, "ewc_loss": 0.055661365389823914, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024411366030108184, "grad_norm": 6.433056354522705, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8431220650672913, "num_tokens": 379005932.0, "step": 9932 }, { "epoch": 1.263579697239537, "ewc_loss": 0.055781856179237366, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002453185443300754, "grad_norm": 6.4700517654418945, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8604379296302795, "num_tokens": 379045817.0, "step": 9933 }, { "epoch": 1.2637069075181275, "ewc_loss": 0.055690497159957886, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024440494598820806, "grad_norm": 6.402358531951904, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8561685681343079, "num_tokens": 379086481.0, "step": 9934 }, { "epoch": 1.2638341177967178, "ewc_loss": 0.055732958018779755, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002448295708745718, "grad_norm": 6.419938087463379, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8468788266181946, "num_tokens": 379129782.0, "step": 9935 }, { "epoch": 1.2639613280753084, "ewc_loss": 0.05577303469181061, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000245230330619961, "grad_norm": 6.400022506713867, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8609025478363037, "num_tokens": 379171525.0, "step": 9936 }, { "epoch": 1.264088538353899, "ewc_loss": 0.0557587593793869, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002450876054354012, "grad_norm": 6.437164783477783, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8565531969070435, "num_tokens": 379213887.0, "step": 9937 }, { "epoch": 1.2642157486324894, "ewc_loss": 0.05577368289232254, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002452368207741529, "grad_norm": 6.44995641708374, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8637057542800903, "num_tokens": 379252634.0, "step": 9938 }, { "epoch": 1.26434295891108, "ewc_loss": 0.05578543618321419, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002453543711453676, "grad_norm": 6.503547668457031, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8619197607040405, "num_tokens": 379287402.0, "step": 9939 }, { "epoch": 1.2644701691896705, "ewc_loss": 0.05568347126245499, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024433471844531596, "grad_norm": 6.381480693817139, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8691436052322388, "num_tokens": 379325592.0, "step": 9940 }, { "epoch": 1.264597379468261, "ewc_loss": 0.05577624961733818, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002452624903526157, "grad_norm": 6.506368160247803, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8500363826751709, "num_tokens": 379359010.0, "step": 9941 }, { "epoch": 1.2647245897468515, "ewc_loss": 0.055695973336696625, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024445971939712763, "grad_norm": 6.4152512550354, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8553608655929565, "num_tokens": 379399783.0, "step": 9942 }, { "epoch": 1.264851800025442, "ewc_loss": 0.05587078630924225, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024620784097351134, "grad_norm": 6.457388877868652, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8657755851745605, "num_tokens": 379442207.0, "step": 9943 }, { "epoch": 1.2649790103040326, "ewc_loss": 0.05574852228164673, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002449852181598544, "grad_norm": 6.42738151550293, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8619240522384644, "num_tokens": 379479561.0, "step": 9944 }, { "epoch": 1.2651062205826231, "ewc_loss": 0.05578774958848953, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002453775086905807, "grad_norm": 6.490963935852051, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8686951398849487, "num_tokens": 379516413.0, "step": 9945 }, { "epoch": 1.2652334308612136, "ewc_loss": 0.05573689192533493, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002448689192533493, "grad_norm": 6.3720703125, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8536235690116882, "num_tokens": 379557993.0, "step": 9946 }, { "epoch": 1.2653606411398042, "ewc_loss": 0.05589235574007034, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024642355856485665, "grad_norm": 6.551653861999512, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8444998264312744, "num_tokens": 379594028.0, "step": 9947 }, { "epoch": 1.2654878514183947, "ewc_loss": 0.05574573948979378, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002449573948979378, "grad_norm": 6.440521717071533, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8567109107971191, "num_tokens": 379631235.0, "step": 9948 }, { "epoch": 1.2656150616969852, "ewc_loss": 0.05586308240890503, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002461308322381228, "grad_norm": 6.475985050201416, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8507846593856812, "num_tokens": 379668714.0, "step": 9949 }, { "epoch": 1.2657422719755758, "ewc_loss": 0.055764008313417435, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002451400796417147, "grad_norm": 6.431134223937988, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8679047226905823, "num_tokens": 379710233.0, "step": 9950 }, { "epoch": 1.265869482254166, "ewc_loss": 0.05583472177386284, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024584721541032195, "grad_norm": 6.498123645782471, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8585518002510071, "num_tokens": 379746208.0, "step": 9951 }, { "epoch": 1.2659966925327566, "ewc_loss": 0.05573759227991104, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002448759332764894, "grad_norm": 6.385631084442139, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8712350130081177, "num_tokens": 379796420.0, "step": 9952 }, { "epoch": 1.2661239028113471, "ewc_loss": 0.055852554738521576, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024602553457953036, "grad_norm": 6.542855739593506, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8637205958366394, "num_tokens": 379830027.0, "step": 9953 }, { "epoch": 1.2662511130899377, "ewc_loss": 0.055683426558971405, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002443342818878591, "grad_norm": 6.391633033752441, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.861333966255188, "num_tokens": 379871751.0, "step": 9954 }, { "epoch": 1.2663783233685282, "ewc_loss": 0.055880315601825714, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024630315601825714, "grad_norm": 6.539773464202881, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8570464849472046, "num_tokens": 379908255.0, "step": 9955 }, { "epoch": 1.2665055336471187, "ewc_loss": 0.05571167171001434, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002446167345624417, "grad_norm": 6.467402458190918, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8651759624481201, "num_tokens": 379944502.0, "step": 9956 }, { "epoch": 1.2666327439257092, "ewc_loss": 0.055792346596717834, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002454234636388719, "grad_norm": 6.514867782592773, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8584647178649902, "num_tokens": 379981929.0, "step": 9957 }, { "epoch": 1.2667599542042998, "ewc_loss": 0.05569510906934738, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024445110466331244, "grad_norm": 6.438652515411377, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8588352203369141, "num_tokens": 380018876.0, "step": 9958 }, { "epoch": 1.2668871644828903, "ewc_loss": 0.055764246731996536, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024514246615581214, "grad_norm": 6.4994072914123535, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8579913377761841, "num_tokens": 380054153.0, "step": 9959 }, { "epoch": 1.2670143747614806, "ewc_loss": 0.05564308166503906, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024393080093432218, "grad_norm": 6.540317535400391, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8544269800186157, "num_tokens": 380086550.0, "step": 9960 }, { "epoch": 1.2671415850400711, "ewc_loss": 0.05565083026885986, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024400828988291323, "grad_norm": 6.493062973022461, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.857951283454895, "num_tokens": 380120308.0, "step": 9961 }, { "epoch": 1.2672687953186617, "ewc_loss": 0.055685050785541534, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002443504927214235, "grad_norm": 6.519208908081055, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8456834554672241, "num_tokens": 380160204.0, "step": 9962 }, { "epoch": 1.2673960055972522, "ewc_loss": 0.05554652214050293, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024296523770317435, "grad_norm": 6.435834884643555, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8653218746185303, "num_tokens": 380196687.0, "step": 9963 }, { "epoch": 1.2675232158758427, "ewc_loss": 0.05569695681333542, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000244469556491822, "grad_norm": 6.438300609588623, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8774890899658203, "num_tokens": 380237451.0, "step": 9964 }, { "epoch": 1.2676504261544332, "ewc_loss": 0.055628105998039246, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024378104717470706, "grad_norm": 6.47916316986084, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8586268424987793, "num_tokens": 380276631.0, "step": 9965 }, { "epoch": 1.2677776364330238, "ewc_loss": 0.05560358241200447, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024353583285119385, "grad_norm": 6.425276756286621, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8671936988830566, "num_tokens": 380319432.0, "step": 9966 }, { "epoch": 1.2679048467116143, "ewc_loss": 0.055749546736478806, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002449954627081752, "grad_norm": 6.486374378204346, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8704196810722351, "num_tokens": 380356666.0, "step": 9967 }, { "epoch": 1.2680320569902048, "ewc_loss": 0.055675581097602844, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024425581796094775, "grad_norm": 6.523105144500732, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8711330890655518, "num_tokens": 380390347.0, "step": 9968 }, { "epoch": 1.2681592672687954, "ewc_loss": 0.05566743016242981, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024417429813183844, "grad_norm": 6.480523586273193, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8660733103752136, "num_tokens": 380424083.0, "step": 9969 }, { "epoch": 1.2682864775473859, "ewc_loss": 0.05562529340386391, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024375293287448585, "grad_norm": 6.491023540496826, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8574280738830566, "num_tokens": 380471179.0, "step": 9970 }, { "epoch": 1.2684136878259764, "ewc_loss": 0.05564738065004349, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024397378729190677, "grad_norm": 6.452602386474609, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8638361692428589, "num_tokens": 380510291.0, "step": 9971 }, { "epoch": 1.268540898104567, "ewc_loss": 0.055586762726306915, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024336764181498438, "grad_norm": 6.421208381652832, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8665317296981812, "num_tokens": 380556297.0, "step": 9972 }, { "epoch": 1.2686681083831575, "ewc_loss": 0.05561424046754837, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002436424110783264, "grad_norm": 6.48090934753418, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8729547262191772, "num_tokens": 380592113.0, "step": 9973 }, { "epoch": 1.268795318661748, "ewc_loss": 0.055593397468328476, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002434339839965105, "grad_norm": 6.446449279785156, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8754748106002808, "num_tokens": 380623942.0, "step": 9974 }, { "epoch": 1.2689225289403383, "ewc_loss": 0.05561768636107445, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024367687001358718, "grad_norm": 6.494925498962402, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.856560230255127, "num_tokens": 380662300.0, "step": 9975 }, { "epoch": 1.2690497392189288, "ewc_loss": 0.055684495717287064, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024434496299363673, "grad_norm": 6.436961650848389, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8650472164154053, "num_tokens": 380701452.0, "step": 9976 }, { "epoch": 1.2691769494975194, "ewc_loss": 0.05562172457575798, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024371723702643067, "grad_norm": 6.411442756652832, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8740424513816833, "num_tokens": 380741248.0, "step": 9977 }, { "epoch": 1.2693041597761099, "ewc_loss": 0.05566021427512169, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024410213518422097, "grad_norm": 6.443476676940918, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8562618494033813, "num_tokens": 380781455.0, "step": 9978 }, { "epoch": 1.2694313700547004, "ewc_loss": 0.05565289780497551, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024402898270636797, "grad_norm": 6.462397575378418, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8536931872367859, "num_tokens": 380818139.0, "step": 9979 }, { "epoch": 1.269558580333291, "ewc_loss": 0.055712029337882996, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002446203143335879, "grad_norm": 6.422102928161621, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8680533170700073, "num_tokens": 380860530.0, "step": 9980 }, { "epoch": 1.2696857906118815, "ewc_loss": 0.05574958771467209, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024499589926563203, "grad_norm": 6.450129985809326, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.875708818435669, "num_tokens": 380900825.0, "step": 9981 }, { "epoch": 1.269813000890472, "ewc_loss": 0.05574885755777359, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024498856510035694, "grad_norm": 6.447195053100586, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8602527379989624, "num_tokens": 380939270.0, "step": 9982 }, { "epoch": 1.2699402111690625, "ewc_loss": 0.05575508624315262, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024505084729753435, "grad_norm": 6.506259441375732, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8406209945678711, "num_tokens": 380981708.0, "step": 9983 }, { "epoch": 1.2700674214476528, "ewc_loss": 0.05572948604822159, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002447948500048369, "grad_norm": 6.492215633392334, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8684774041175842, "num_tokens": 381015186.0, "step": 9984 }, { "epoch": 1.2701946317262434, "ewc_loss": 0.05567061901092529, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024420616682618856, "grad_norm": 6.476994037628174, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8730455636978149, "num_tokens": 381055503.0, "step": 9985 }, { "epoch": 1.270321842004834, "ewc_loss": 0.05568881332874298, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002443881530780345, "grad_norm": 6.468214511871338, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8753080368041992, "num_tokens": 381091935.0, "step": 9986 }, { "epoch": 1.2704490522834244, "ewc_loss": 0.055648088455200195, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000243980874074623, "grad_norm": 6.486158847808838, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8673858046531677, "num_tokens": 381127951.0, "step": 9987 }, { "epoch": 1.270576262562015, "ewc_loss": 0.05559977889060974, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002434977941447869, "grad_norm": 6.439770698547363, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8578649163246155, "num_tokens": 381161918.0, "step": 9988 }, { "epoch": 1.2707034728406055, "ewc_loss": 0.055633287876844406, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002438328810967505, "grad_norm": 6.496949672698975, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8503031730651855, "num_tokens": 381198747.0, "step": 9989 }, { "epoch": 1.270830683119196, "ewc_loss": 0.05568598955869675, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002443599223624915, "grad_norm": 6.4507646560668945, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8640196323394775, "num_tokens": 381242714.0, "step": 9990 }, { "epoch": 1.2709578933977865, "ewc_loss": 0.05575288087129593, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002450288156978786, "grad_norm": 6.462863445281982, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8733329772949219, "num_tokens": 381274667.0, "step": 9991 }, { "epoch": 1.271085103676377, "ewc_loss": 0.055708542466163635, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024458541884087026, "grad_norm": 6.457192420959473, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8719100952148438, "num_tokens": 381318110.0, "step": 9992 }, { "epoch": 1.2712123139549676, "ewc_loss": 0.05573482811450958, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002448482846375555, "grad_norm": 6.4660258293151855, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.862966001033783, "num_tokens": 381356694.0, "step": 9993 }, { "epoch": 1.2713395242335581, "ewc_loss": 0.055729616433382034, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024479615967720747, "grad_norm": 6.530697822570801, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8516143560409546, "num_tokens": 381395218.0, "step": 9994 }, { "epoch": 1.2714667345121486, "ewc_loss": 0.055666305124759674, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002441630349494517, "grad_norm": 6.501678466796875, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8763618469238281, "num_tokens": 381429195.0, "step": 9995 }, { "epoch": 1.2715939447907392, "ewc_loss": 0.05566217005252838, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024412172206211835, "grad_norm": 6.478830814361572, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8566718101501465, "num_tokens": 381468276.0, "step": 9996 }, { "epoch": 1.2717211550693297, "ewc_loss": 0.05559414625167847, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024344147823285311, "grad_norm": 6.4570465087890625, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8710525035858154, "num_tokens": 381503400.0, "step": 9997 }, { "epoch": 1.2718483653479202, "ewc_loss": 0.05572522431612015, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024475224199704826, "grad_norm": 6.474669456481934, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8683663606643677, "num_tokens": 381542126.0, "step": 9998 }, { "epoch": 1.2719755756265108, "ewc_loss": 0.05571817606687546, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024468175251968205, "grad_norm": 6.491690158843994, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8713557124137878, "num_tokens": 381579169.0, "step": 9999 }, { "epoch": 1.272102785905101, "ewc_loss": 0.05567369610071182, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002442369586788118, "grad_norm": 6.505271911621094, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8489371538162231, "num_tokens": 381615501.0, "step": 10000 }, { "epoch": 1.2722299961836916, "ewc_loss": 0.05565338209271431, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002440338139422238, "grad_norm": 6.5030622482299805, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8575178384780884, "num_tokens": 381649737.0, "step": 10001 }, { "epoch": 1.2723572064622821, "ewc_loss": 0.0556335523724556, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024383551499340683, "grad_norm": 6.470930576324463, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8661589026451111, "num_tokens": 381693776.0, "step": 10002 }, { "epoch": 1.2724844167408726, "ewc_loss": 0.055672481656074524, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002442247932776809, "grad_norm": 6.4737162590026855, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8732650279998779, "num_tokens": 381733324.0, "step": 10003 }, { "epoch": 1.2726116270194632, "ewc_loss": 0.0556858591735363, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002443585835862905, "grad_norm": 6.537078857421875, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8579856157302856, "num_tokens": 381765893.0, "step": 10004 }, { "epoch": 1.2727388372980537, "ewc_loss": 0.055698733776807785, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024448733893223107, "grad_norm": 6.489076614379883, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8602169156074524, "num_tokens": 381802782.0, "step": 10005 }, { "epoch": 1.2728660475766442, "ewc_loss": 0.055620454251766205, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002437045332044363, "grad_norm": 6.482354164123535, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8661285638809204, "num_tokens": 381843448.0, "step": 10006 }, { "epoch": 1.2729932578552348, "ewc_loss": 0.05563412606716156, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002438412484480068, "grad_norm": 6.486546039581299, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8523952960968018, "num_tokens": 381878850.0, "step": 10007 }, { "epoch": 1.2731204681338253, "ewc_loss": 0.055689968168735504, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024439970729872584, "grad_norm": 6.562427997589111, "learning_rate": 1e-06, "loss": 0.5295, "mean_token_accuracy": 0.8416264653205872, "num_tokens": 381915508.0, "step": 10008 }, { "epoch": 1.2732476784124156, "ewc_loss": 0.05555998533964157, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024309985747095197, "grad_norm": 6.445784568786621, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8540187478065491, "num_tokens": 381952699.0, "step": 10009 }, { "epoch": 1.2733748886910061, "ewc_loss": 0.055721260607242584, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024471263168379664, "grad_norm": 6.476444244384766, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.85680091381073, "num_tokens": 381989947.0, "step": 10010 }, { "epoch": 1.2735020989695967, "ewc_loss": 0.05557370185852051, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024323702382389456, "grad_norm": 6.398998260498047, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8622558116912842, "num_tokens": 382027701.0, "step": 10011 }, { "epoch": 1.2736293092481872, "ewc_loss": 0.05570598319172859, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445598365738988, "grad_norm": 6.485803604125977, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.861399233341217, "num_tokens": 382066415.0, "step": 10012 }, { "epoch": 1.2737565195267777, "ewc_loss": 0.055730875581502914, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024480876163579524, "grad_norm": 6.479773044586182, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8579902648925781, "num_tokens": 382108402.0, "step": 10013 }, { "epoch": 1.2738837298053682, "ewc_loss": 0.055644966661930084, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002439496893202886, "grad_norm": 6.459664344787598, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.864362359046936, "num_tokens": 382146865.0, "step": 10014 }, { "epoch": 1.2740109400839588, "ewc_loss": 0.055667709559202194, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002441770920995623, "grad_norm": 6.467737197875977, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8599418997764587, "num_tokens": 382186746.0, "step": 10015 }, { "epoch": 1.2741381503625493, "ewc_loss": 0.05574236065149307, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002449236053507775, "grad_norm": 6.528221607208252, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.875996470451355, "num_tokens": 382220693.0, "step": 10016 }, { "epoch": 1.2742653606411398, "ewc_loss": 0.05571503937244415, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002446503785904497, "grad_norm": 6.465940475463867, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8564373254776001, "num_tokens": 382259549.0, "step": 10017 }, { "epoch": 1.2743925709197303, "ewc_loss": 0.05568311735987663, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024433116777800024, "grad_norm": 6.482197284698486, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8725833892822266, "num_tokens": 382299164.0, "step": 10018 }, { "epoch": 1.2745197811983209, "ewc_loss": 0.05570906400680542, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445906575303525, "grad_norm": 6.498335838317871, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8769142627716064, "num_tokens": 382333596.0, "step": 10019 }, { "epoch": 1.2746469914769114, "ewc_loss": 0.05564756691455841, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000243975649937056, "grad_norm": 6.466017723083496, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.864637017250061, "num_tokens": 382372321.0, "step": 10020 }, { "epoch": 1.274774201755502, "ewc_loss": 0.055682145059108734, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002443214470986277, "grad_norm": 6.476762294769287, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8659058213233948, "num_tokens": 382411125.0, "step": 10021 }, { "epoch": 1.2749014120340925, "ewc_loss": 0.05562363564968109, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024373635824304074, "grad_norm": 6.4599714279174805, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8520463705062866, "num_tokens": 382450935.0, "step": 10022 }, { "epoch": 1.275028622312683, "ewc_loss": 0.05575665831565857, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024506659246981144, "grad_norm": 6.506831169128418, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8611317276954651, "num_tokens": 382488232.0, "step": 10023 }, { "epoch": 1.2751558325912733, "ewc_loss": 0.055608753114938736, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024358753580600023, "grad_norm": 6.490664482116699, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8594757318496704, "num_tokens": 382522280.0, "step": 10024 }, { "epoch": 1.2752830428698638, "ewc_loss": 0.05569097772240639, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002444097772240639, "grad_norm": 6.436425685882568, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8727531433105469, "num_tokens": 382562788.0, "step": 10025 }, { "epoch": 1.2754102531484544, "ewc_loss": 0.05570407211780548, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445407153572887, "grad_norm": 6.490357398986816, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8553813695907593, "num_tokens": 382598534.0, "step": 10026 }, { "epoch": 1.2755374634270449, "ewc_loss": 0.05565633624792099, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002440633688820526, "grad_norm": 6.455605983734131, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8569186925888062, "num_tokens": 382641803.0, "step": 10027 }, { "epoch": 1.2756646737056354, "ewc_loss": 0.05570336803793907, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445336722303182, "grad_norm": 6.461455345153809, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8731560707092285, "num_tokens": 382677871.0, "step": 10028 }, { "epoch": 1.275791883984226, "ewc_loss": 0.05565113574266434, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024401136033702642, "grad_norm": 6.456869602203369, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8716336488723755, "num_tokens": 382714989.0, "step": 10029 }, { "epoch": 1.2759190942628165, "ewc_loss": 0.05567609518766403, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002442609693389386, "grad_norm": 6.5159525871276855, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8655009269714355, "num_tokens": 382751919.0, "step": 10030 }, { "epoch": 1.276046304541407, "ewc_loss": 0.05564267933368683, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024392678460571915, "grad_norm": 6.421144485473633, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8736004829406738, "num_tokens": 382793550.0, "step": 10031 }, { "epoch": 1.2761735148199975, "ewc_loss": 0.05567300692200661, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024423006107099354, "grad_norm": 6.469505310058594, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8717505931854248, "num_tokens": 382833142.0, "step": 10032 }, { "epoch": 1.2763007250985878, "ewc_loss": 0.055655382573604584, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024405385192949325, "grad_norm": 6.446176052093506, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8713976144790649, "num_tokens": 382869934.0, "step": 10033 }, { "epoch": 1.2764279353771784, "ewc_loss": 0.055654071271419525, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024404072610195726, "grad_norm": 6.5167694091796875, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8493000268936157, "num_tokens": 382909533.0, "step": 10034 }, { "epoch": 1.2765551456557689, "ewc_loss": 0.05566352978348732, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024413529899902642, "grad_norm": 6.476864337921143, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8695728778839111, "num_tokens": 382947462.0, "step": 10035 }, { "epoch": 1.2766823559343594, "ewc_loss": 0.05562792345881462, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024377924273721874, "grad_norm": 6.405048370361328, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8644428849220276, "num_tokens": 382989973.0, "step": 10036 }, { "epoch": 1.27680956621295, "ewc_loss": 0.0557563379406929, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024506336194463074, "grad_norm": 6.513017654418945, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8620933890342712, "num_tokens": 383033052.0, "step": 10037 }, { "epoch": 1.2769367764915405, "ewc_loss": 0.0556158721446991, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002436587237752974, "grad_norm": 6.4295148849487305, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8613229990005493, "num_tokens": 383075981.0, "step": 10038 }, { "epoch": 1.277063986770131, "ewc_loss": 0.05580350011587143, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002455349895171821, "grad_norm": 6.497549533843994, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8789327144622803, "num_tokens": 383111984.0, "step": 10039 }, { "epoch": 1.2771911970487215, "ewc_loss": 0.05563775449991226, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024387752637267113, "grad_norm": 6.42921781539917, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8646039962768555, "num_tokens": 383155323.0, "step": 10040 }, { "epoch": 1.277318407327312, "ewc_loss": 0.05581645667552948, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002456645597703755, "grad_norm": 6.632211685180664, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8614555597305298, "num_tokens": 383190069.0, "step": 10041 }, { "epoch": 1.2774456176059026, "ewc_loss": 0.0556614026427269, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024411400954704732, "grad_norm": 6.487884521484375, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8546339273452759, "num_tokens": 383224999.0, "step": 10042 }, { "epoch": 1.2775728278844931, "ewc_loss": 0.0556376576423645, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024387655139435083, "grad_norm": 6.494829177856445, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8561453819274902, "num_tokens": 383267764.0, "step": 10043 }, { "epoch": 1.2777000381630836, "ewc_loss": 0.05567629635334015, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024426294839940965, "grad_norm": 6.495726585388184, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8731099367141724, "num_tokens": 383306148.0, "step": 10044 }, { "epoch": 1.2778272484416742, "ewc_loss": 0.055650174617767334, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024400174152106047, "grad_norm": 6.485746383666992, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8670376539230347, "num_tokens": 383343408.0, "step": 10045 }, { "epoch": 1.2779544587202647, "ewc_loss": 0.05563074350357056, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024380742979701608, "grad_norm": 6.425263404846191, "learning_rate": 1e-06, "loss": 0.378, "mean_token_accuracy": 0.8864976167678833, "num_tokens": 383383626.0, "step": 10046 }, { "epoch": 1.2780816689988552, "ewc_loss": 0.05570925772190094, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445925783831626, "grad_norm": 6.508584022521973, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8474838733673096, "num_tokens": 383423218.0, "step": 10047 }, { "epoch": 1.2782088792774458, "ewc_loss": 0.05573297291994095, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002448297163937241, "grad_norm": 6.472472190856934, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.870743989944458, "num_tokens": 383461126.0, "step": 10048 }, { "epoch": 1.278336089556036, "ewc_loss": 0.05572438985109329, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024474391830153763, "grad_norm": 6.539913654327393, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8549466729164124, "num_tokens": 383501808.0, "step": 10049 }, { "epoch": 1.2784632998346266, "ewc_loss": 0.05571674555540085, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002446674625389278, "grad_norm": 6.443551063537598, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8834401965141296, "num_tokens": 383542395.0, "step": 10050 }, { "epoch": 1.2785905101132171, "ewc_loss": 0.05580513924360275, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002455514040775597, "grad_norm": 6.5378546714782715, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8600916862487793, "num_tokens": 383579968.0, "step": 10051 }, { "epoch": 1.2787177203918076, "ewc_loss": 0.055705759674310684, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445575955789536, "grad_norm": 6.439341068267822, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8629142045974731, "num_tokens": 383617861.0, "step": 10052 }, { "epoch": 1.2788449306703982, "ewc_loss": 0.055828843265771866, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024578842567279935, "grad_norm": 6.569368362426758, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8582427501678467, "num_tokens": 383652534.0, "step": 10053 }, { "epoch": 1.2789721409489887, "ewc_loss": 0.0557057224214077, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445572172291577, "grad_norm": 6.450908660888672, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8584365248680115, "num_tokens": 383693120.0, "step": 10054 }, { "epoch": 1.2790993512275792, "ewc_loss": 0.055802538990974426, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024552541435696185, "grad_norm": 6.509825229644775, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8541953563690186, "num_tokens": 383734958.0, "step": 10055 }, { "epoch": 1.2792265615061698, "ewc_loss": 0.05568385124206543, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024433850194327533, "grad_norm": 6.456945419311523, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8547873497009277, "num_tokens": 383768873.0, "step": 10056 }, { "epoch": 1.2793537717847603, "ewc_loss": 0.055813148617744446, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024563149781897664, "grad_norm": 6.528748989105225, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8648037910461426, "num_tokens": 383804063.0, "step": 10057 }, { "epoch": 1.2794809820633506, "ewc_loss": 0.055743999779224396, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002449400199111551, "grad_norm": 6.421741962432861, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8767479658126831, "num_tokens": 383843098.0, "step": 10058 }, { "epoch": 1.2796081923419411, "ewc_loss": 0.055908724665641785, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002465872385073453, "grad_norm": 6.6059980392456055, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8725258111953735, "num_tokens": 383879443.0, "step": 10059 }, { "epoch": 1.2797354026205316, "ewc_loss": 0.055753208696842194, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002450321044307202, "grad_norm": 6.477258205413818, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8625586032867432, "num_tokens": 383918071.0, "step": 10060 }, { "epoch": 1.2798626128991222, "ewc_loss": 0.05588017404079437, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024630172993056476, "grad_norm": 6.54963493347168, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8560423254966736, "num_tokens": 383953052.0, "step": 10061 }, { "epoch": 1.2799898231777127, "ewc_loss": 0.05572265386581421, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024472654331475496, "grad_norm": 6.468538761138916, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8614868521690369, "num_tokens": 383992835.0, "step": 10062 }, { "epoch": 1.2801170334563032, "ewc_loss": 0.05583227053284645, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002458227099850774, "grad_norm": 6.564488887786865, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8503525257110596, "num_tokens": 384031234.0, "step": 10063 }, { "epoch": 1.2802442437348938, "ewc_loss": 0.05573880672454834, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002448880404699594, "grad_norm": 6.49627685546875, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8545249104499817, "num_tokens": 384063611.0, "step": 10064 }, { "epoch": 1.2803714540134843, "ewc_loss": 0.05576314032077789, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024513137759640813, "grad_norm": 6.398510932922363, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8715636730194092, "num_tokens": 384106757.0, "step": 10065 }, { "epoch": 1.2804986642920748, "ewc_loss": 0.055862776935100555, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002461277472320944, "grad_norm": 6.4905548095703125, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8664898872375488, "num_tokens": 384143094.0, "step": 10066 }, { "epoch": 1.2806258745706653, "ewc_loss": 0.055843427777290344, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002459342940710485, "grad_norm": 6.478648662567139, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8629198670387268, "num_tokens": 384181498.0, "step": 10067 }, { "epoch": 1.2807530848492559, "ewc_loss": 0.055904291570186615, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002465429133735597, "grad_norm": 6.58373498916626, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8672869205474854, "num_tokens": 384209784.0, "step": 10068 }, { "epoch": 1.2808802951278464, "ewc_loss": 0.05575457960367203, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024504581233486533, "grad_norm": 6.474959850311279, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8688015937805176, "num_tokens": 384248000.0, "step": 10069 }, { "epoch": 1.281007505406437, "ewc_loss": 0.055971406400203705, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000247214047703892, "grad_norm": 6.4683098793029785, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8547176122665405, "num_tokens": 384290897.0, "step": 10070 }, { "epoch": 1.2811347156850275, "ewc_loss": 0.055808812379837036, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002455881331115961, "grad_norm": 6.47695779800415, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8550018072128296, "num_tokens": 384329464.0, "step": 10071 }, { "epoch": 1.281261925963618, "ewc_loss": 0.056018173694610596, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002476817462593317, "grad_norm": 6.506507873535156, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8660078048706055, "num_tokens": 384368915.0, "step": 10072 }, { "epoch": 1.2813891362422083, "ewc_loss": 0.055839456617832184, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024589456734247506, "grad_norm": 6.456558704376221, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8729838132858276, "num_tokens": 384407445.0, "step": 10073 }, { "epoch": 1.2815163465207988, "ewc_loss": 0.05598386377096176, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002473386120982468, "grad_norm": 6.530679702758789, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8589478135108948, "num_tokens": 384444384.0, "step": 10074 }, { "epoch": 1.2816435567993893, "ewc_loss": 0.055834487080574036, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024584485800005496, "grad_norm": 6.515586853027344, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8590317368507385, "num_tokens": 384479670.0, "step": 10075 }, { "epoch": 1.2817707670779799, "ewc_loss": 0.05583912506699562, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000245891249505803, "grad_norm": 6.463953971862793, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8518620729446411, "num_tokens": 384522436.0, "step": 10076 }, { "epoch": 1.2818979773565704, "ewc_loss": 0.05585866793990135, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024608668172731996, "grad_norm": 6.536041259765625, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8550238609313965, "num_tokens": 384559754.0, "step": 10077 }, { "epoch": 1.282025187635161, "ewc_loss": 0.0557769313454628, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002452693006489426, "grad_norm": 6.493919849395752, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8591042757034302, "num_tokens": 384595153.0, "step": 10078 }, { "epoch": 1.2821523979137515, "ewc_loss": 0.05584178492426872, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024591785040684044, "grad_norm": 6.5124077796936035, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8624104261398315, "num_tokens": 384628345.0, "step": 10079 }, { "epoch": 1.282279608192342, "ewc_loss": 0.055797725915908813, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002454772766213864, "grad_norm": 6.503598213195801, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8583730459213257, "num_tokens": 384666725.0, "step": 10080 }, { "epoch": 1.2824068184709325, "ewc_loss": 0.055706873536109924, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445687132421881, "grad_norm": 6.495655536651611, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8536261320114136, "num_tokens": 384706217.0, "step": 10081 }, { "epoch": 1.2825340287495228, "ewc_loss": 0.05576696991920471, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002451696782372892, "grad_norm": 6.53362512588501, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8589056730270386, "num_tokens": 384741835.0, "step": 10082 }, { "epoch": 1.2826612390281134, "ewc_loss": 0.05564390867948532, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024393908097408712, "grad_norm": 6.492555141448975, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8643433451652527, "num_tokens": 384777273.0, "step": 10083 }, { "epoch": 1.2827884493067039, "ewc_loss": 0.05576565861701965, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002451566106174141, "grad_norm": 6.487013339996338, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8760998845100403, "num_tokens": 384814950.0, "step": 10084 }, { "epoch": 1.2829156595852944, "ewc_loss": 0.05571550130844116, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024465503520332277, "grad_norm": 6.527397632598877, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8547155857086182, "num_tokens": 384848088.0, "step": 10085 }, { "epoch": 1.283042869863885, "ewc_loss": 0.05578497052192688, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024534971453249454, "grad_norm": 6.508557319641113, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8541544675827026, "num_tokens": 384887361.0, "step": 10086 }, { "epoch": 1.2831700801424755, "ewc_loss": 0.055696457624435425, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002444645797368139, "grad_norm": 6.452357292175293, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8690100312232971, "num_tokens": 384930459.0, "step": 10087 }, { "epoch": 1.283297290421066, "ewc_loss": 0.05577363073825836, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024523629690520465, "grad_norm": 6.49790096282959, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8570672273635864, "num_tokens": 384969506.0, "step": 10088 }, { "epoch": 1.2834245006996565, "ewc_loss": 0.05571881681680679, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000244688184466213, "grad_norm": 6.516749382019043, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8686612844467163, "num_tokens": 385004368.0, "step": 10089 }, { "epoch": 1.283551710978247, "ewc_loss": 0.05575171858072281, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024501720326952636, "grad_norm": 6.454049587249756, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8642106652259827, "num_tokens": 385044553.0, "step": 10090 }, { "epoch": 1.2836789212568376, "ewc_loss": 0.055754296481609344, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024504298926331103, "grad_norm": 6.4960713386535645, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8752814531326294, "num_tokens": 385076873.0, "step": 10091 }, { "epoch": 1.283806131535428, "ewc_loss": 0.05578143894672394, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024531438248232007, "grad_norm": 6.487340450286865, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8657139539718628, "num_tokens": 385113969.0, "step": 10092 }, { "epoch": 1.2839333418140186, "ewc_loss": 0.0557684600353241, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024518457939848304, "grad_norm": 6.498746395111084, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8731907606124878, "num_tokens": 385149460.0, "step": 10093 }, { "epoch": 1.2840605520926092, "ewc_loss": 0.05570933595299721, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024459336418658495, "grad_norm": 6.480391025543213, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8455816507339478, "num_tokens": 385187240.0, "step": 10094 }, { "epoch": 1.2841877623711997, "ewc_loss": 0.05579137057065964, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024541368475183845, "grad_norm": 6.52659273147583, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.855506420135498, "num_tokens": 385225532.0, "step": 10095 }, { "epoch": 1.2843149726497902, "ewc_loss": 0.05571049451828003, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024460491840727627, "grad_norm": 6.481226444244385, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8649929761886597, "num_tokens": 385267404.0, "step": 10096 }, { "epoch": 1.2844421829283807, "ewc_loss": 0.05570507049560547, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445506979711354, "grad_norm": 6.493666172027588, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8506675958633423, "num_tokens": 385314008.0, "step": 10097 }, { "epoch": 1.284569393206971, "ewc_loss": 0.05570441111922264, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024454412050545216, "grad_norm": 6.562282085418701, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8498967885971069, "num_tokens": 385352204.0, "step": 10098 }, { "epoch": 1.2846966034855616, "ewc_loss": 0.05560959875583649, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024359596136491746, "grad_norm": 6.433269500732422, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.855069637298584, "num_tokens": 385393307.0, "step": 10099 }, { "epoch": 1.2848238137641521, "ewc_loss": 0.0557202473282814, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024470247444696724, "grad_norm": 6.53519868850708, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8633818626403809, "num_tokens": 385426124.0, "step": 10100 }, { "epoch": 1.2849510240427426, "ewc_loss": 0.05563709884881973, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002438709925627336, "grad_norm": 6.434479713439941, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8678821921348572, "num_tokens": 385467471.0, "step": 10101 }, { "epoch": 1.2850782343213332, "ewc_loss": 0.05567210167646408, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024422103888355196, "grad_norm": 6.494338512420654, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8542535305023193, "num_tokens": 385508780.0, "step": 10102 }, { "epoch": 1.2852054445999237, "ewc_loss": 0.05571068823337555, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002446068683639169, "grad_norm": 6.429066181182861, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8681355118751526, "num_tokens": 385552646.0, "step": 10103 }, { "epoch": 1.2853326548785142, "ewc_loss": 0.055666327476501465, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024416326778009534, "grad_norm": 6.5450053215026855, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8498208522796631, "num_tokens": 385586676.0, "step": 10104 }, { "epoch": 1.2854598651571048, "ewc_loss": 0.055692557245492935, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002444255806040019, "grad_norm": 6.472829818725586, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8509925603866577, "num_tokens": 385623201.0, "step": 10105 }, { "epoch": 1.2855870754356953, "ewc_loss": 0.05574365705251694, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024493655655533075, "grad_norm": 6.472411155700684, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8737083673477173, "num_tokens": 385662716.0, "step": 10106 }, { "epoch": 1.2857142857142856, "ewc_loss": 0.05569825321435928, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024448250769637525, "grad_norm": 6.462808609008789, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8517528176307678, "num_tokens": 385706986.0, "step": 10107 }, { "epoch": 1.2858414959928761, "ewc_loss": 0.0557359978556633, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002448599843773991, "grad_norm": 6.530817985534668, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8623365163803101, "num_tokens": 385744240.0, "step": 10108 }, { "epoch": 1.2859687062714666, "ewc_loss": 0.05572587624192238, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024475876125507057, "grad_norm": 6.498241901397705, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8469587564468384, "num_tokens": 385778583.0, "step": 10109 }, { "epoch": 1.2860959165500572, "ewc_loss": 0.05571584776043892, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024465846945531666, "grad_norm": 6.466329097747803, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8626673221588135, "num_tokens": 385819910.0, "step": 10110 }, { "epoch": 1.2862231268286477, "ewc_loss": 0.05574261397123337, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024492613738402724, "grad_norm": 6.441378593444824, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8603512644767761, "num_tokens": 385860068.0, "step": 10111 }, { "epoch": 1.2863503371072382, "ewc_loss": 0.05581967532634735, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024569674860686064, "grad_norm": 6.567375659942627, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8533881306648254, "num_tokens": 385890460.0, "step": 10112 }, { "epoch": 1.2864775473858288, "ewc_loss": 0.055758483707904816, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002450848405715078, "grad_norm": 6.5132293701171875, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8588249683380127, "num_tokens": 385926174.0, "step": 10113 }, { "epoch": 1.2866047576644193, "ewc_loss": 0.055839989334344864, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024589989334344864, "grad_norm": 6.5459818840026855, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8724422454833984, "num_tokens": 385962928.0, "step": 10114 }, { "epoch": 1.2867319679430098, "ewc_loss": 0.05572345107793808, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024473448866046965, "grad_norm": 6.444277763366699, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8714452385902405, "num_tokens": 386002509.0, "step": 10115 }, { "epoch": 1.2868591782216003, "ewc_loss": 0.05576470494270325, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002451470645610243, "grad_norm": 6.521187782287598, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8615115284919739, "num_tokens": 386043304.0, "step": 10116 }, { "epoch": 1.2869863885001909, "ewc_loss": 0.05580615997314453, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024556161952205, "grad_norm": 6.495413303375244, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8702479600906372, "num_tokens": 386081607.0, "step": 10117 }, { "epoch": 1.2871135987787814, "ewc_loss": 0.055779412388801575, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002452940971124917, "grad_norm": 6.477161884307861, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.880657434463501, "num_tokens": 386117762.0, "step": 10118 }, { "epoch": 1.287240809057372, "ewc_loss": 0.05570457875728607, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024454580852761865, "grad_norm": 6.447337627410889, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8539644479751587, "num_tokens": 386155908.0, "step": 10119 }, { "epoch": 1.2873680193359625, "ewc_loss": 0.05579105392098427, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002454105415381491, "grad_norm": 6.486247539520264, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8603231906890869, "num_tokens": 386199385.0, "step": 10120 }, { "epoch": 1.287495229614553, "ewc_loss": 0.05572591349482536, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002447591396048665, "grad_norm": 6.559940814971924, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8574577569961548, "num_tokens": 386231892.0, "step": 10121 }, { "epoch": 1.2876224398931433, "ewc_loss": 0.055688001215457916, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024438000400550663, "grad_norm": 6.482738494873047, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8452827334403992, "num_tokens": 386269657.0, "step": 10122 }, { "epoch": 1.2877496501717338, "ewc_loss": 0.055745869874954224, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024495867546647787, "grad_norm": 6.486993312835693, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8670063614845276, "num_tokens": 386306936.0, "step": 10123 }, { "epoch": 1.2878768604503243, "ewc_loss": 0.055707015097141266, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445701393298805, "grad_norm": 6.470596790313721, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8622681498527527, "num_tokens": 386342260.0, "step": 10124 }, { "epoch": 1.2880040707289149, "ewc_loss": 0.055715784430503845, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024465785827487707, "grad_norm": 6.458417892456055, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8585225343704224, "num_tokens": 386381784.0, "step": 10125 }, { "epoch": 1.2881312810075054, "ewc_loss": 0.05571821331977844, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024468215997330844, "grad_norm": 6.444296836853027, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.860493540763855, "num_tokens": 386422350.0, "step": 10126 }, { "epoch": 1.288258491286096, "ewc_loss": 0.0557156503200531, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024465651949867606, "grad_norm": 6.489735126495361, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.858160138130188, "num_tokens": 386467951.0, "step": 10127 }, { "epoch": 1.2883857015646865, "ewc_loss": 0.05574313551187515, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000244931346969679, "grad_norm": 6.442999362945557, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8705657720565796, "num_tokens": 386511896.0, "step": 10128 }, { "epoch": 1.288512911843277, "ewc_loss": 0.05578345060348511, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002453344932291657, "grad_norm": 6.4900054931640625, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8495166301727295, "num_tokens": 386550704.0, "step": 10129 }, { "epoch": 1.2886401221218675, "ewc_loss": 0.055783383548259735, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024533382384106517, "grad_norm": 6.479588508605957, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8458329439163208, "num_tokens": 386589295.0, "step": 10130 }, { "epoch": 1.2887673324004578, "ewc_loss": 0.055755749344825745, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002450574829708785, "grad_norm": 6.531075477600098, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.862274169921875, "num_tokens": 386623462.0, "step": 10131 }, { "epoch": 1.2888945426790483, "ewc_loss": 0.055742740631103516, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002449273888487369, "grad_norm": 6.465503215789795, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8653916716575623, "num_tokens": 386663036.0, "step": 10132 }, { "epoch": 1.2890217529576389, "ewc_loss": 0.0558038055896759, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024553804541938007, "grad_norm": 6.494087219238281, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8630881905555725, "num_tokens": 386702530.0, "step": 10133 }, { "epoch": 1.2891489632362294, "ewc_loss": 0.05575857684016228, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002450857718940824, "grad_norm": 6.4894633293151855, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8771304488182068, "num_tokens": 386739758.0, "step": 10134 }, { "epoch": 1.28927617351482, "ewc_loss": 0.05578877031803131, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002453876950312406, "grad_norm": 6.5247602462768555, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8720450401306152, "num_tokens": 386778287.0, "step": 10135 }, { "epoch": 1.2894033837934105, "ewc_loss": 0.055688127875328064, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024438128457404673, "grad_norm": 6.473886013031006, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8730233907699585, "num_tokens": 386812348.0, "step": 10136 }, { "epoch": 1.289530594072001, "ewc_loss": 0.055808767676353455, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024558769655413926, "grad_norm": 6.502532005310059, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8696081042289734, "num_tokens": 386855540.0, "step": 10137 }, { "epoch": 1.2896578043505915, "ewc_loss": 0.05579260736703873, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000245426082983613, "grad_norm": 6.505581378936768, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8582394123077393, "num_tokens": 386893775.0, "step": 10138 }, { "epoch": 1.289785014629182, "ewc_loss": 0.055800825357437134, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002455082430969924, "grad_norm": 6.586339950561523, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8626372814178467, "num_tokens": 386929553.0, "step": 10139 }, { "epoch": 1.2899122249077726, "ewc_loss": 0.055742084980010986, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002449208404868841, "grad_norm": 6.477992057800293, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8569688200950623, "num_tokens": 386968033.0, "step": 10140 }, { "epoch": 1.290039435186363, "ewc_loss": 0.05578390508890152, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002453390334267169, "grad_norm": 6.560256004333496, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8613395094871521, "num_tokens": 387004967.0, "step": 10141 }, { "epoch": 1.2901666454649536, "ewc_loss": 0.05569186061620712, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002444186247885227, "grad_norm": 6.508006572723389, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8505151271820068, "num_tokens": 387038086.0, "step": 10142 }, { "epoch": 1.2902938557435442, "ewc_loss": 0.055835552513599396, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002458555391058326, "grad_norm": 6.487920761108398, "learning_rate": 1e-06, "loss": 0.5418, "mean_token_accuracy": 0.8379883766174316, "num_tokens": 387077112.0, "step": 10143 }, { "epoch": 1.2904210660221347, "ewc_loss": 0.055770616978406906, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024520617444068193, "grad_norm": 6.5386881828308105, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8454840183258057, "num_tokens": 387118800.0, "step": 10144 }, { "epoch": 1.2905482763007252, "ewc_loss": 0.05573657155036926, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024486571783199906, "grad_norm": 6.503247261047363, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8578705787658691, "num_tokens": 387160321.0, "step": 10145 }, { "epoch": 1.2906754865793157, "ewc_loss": 0.05584681034088135, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024596808361820877, "grad_norm": 6.586531639099121, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8602948188781738, "num_tokens": 387199387.0, "step": 10146 }, { "epoch": 1.290802696857906, "ewc_loss": 0.05565953627228737, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024409535399172455, "grad_norm": 6.413235664367676, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8686974048614502, "num_tokens": 387238859.0, "step": 10147 }, { "epoch": 1.2909299071364966, "ewc_loss": 0.05590422824025154, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024654227308928967, "grad_norm": 6.581053256988525, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.868141233921051, "num_tokens": 387272149.0, "step": 10148 }, { "epoch": 1.291057117415087, "ewc_loss": 0.05571543425321579, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024465436581522226, "grad_norm": 6.4371018409729, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.860325276851654, "num_tokens": 387311617.0, "step": 10149 }, { "epoch": 1.2911843276936776, "ewc_loss": 0.055944688618183136, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024694690364412963, "grad_norm": 6.592707633972168, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8484704494476318, "num_tokens": 387344281.0, "step": 10150 }, { "epoch": 1.2913115379722682, "ewc_loss": 0.055684126913547516, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002443412959109992, "grad_norm": 6.421494960784912, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.851034939289093, "num_tokens": 387385959.0, "step": 10151 }, { "epoch": 1.2914387482508587, "ewc_loss": 0.05600985139608383, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024759850930422544, "grad_norm": 6.5931077003479, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.857579231262207, "num_tokens": 387426197.0, "step": 10152 }, { "epoch": 1.2915659585294492, "ewc_loss": 0.05574001744389534, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024490017676725984, "grad_norm": 6.460256099700928, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.86794114112854, "num_tokens": 387462250.0, "step": 10153 }, { "epoch": 1.2916931688080397, "ewc_loss": 0.0559048056602478, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024654806475155056, "grad_norm": 6.595869541168213, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8610655069351196, "num_tokens": 387499903.0, "step": 10154 }, { "epoch": 1.2918203790866303, "ewc_loss": 0.05579063296318054, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024540635058656335, "grad_norm": 6.545650482177734, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8701318502426147, "num_tokens": 387535361.0, "step": 10155 }, { "epoch": 1.2919475893652206, "ewc_loss": 0.05571330711245537, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024463306181132793, "grad_norm": 6.537367343902588, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8597392439842224, "num_tokens": 387574300.0, "step": 10156 }, { "epoch": 1.2920747996438111, "ewc_loss": 0.05577301234006882, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002452301268931478, "grad_norm": 6.57840633392334, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.863504946231842, "num_tokens": 387606860.0, "step": 10157 }, { "epoch": 1.2922020099224016, "ewc_loss": 0.055647920817136765, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024397920060437173, "grad_norm": 6.461075305938721, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.851709246635437, "num_tokens": 387646519.0, "step": 10158 }, { "epoch": 1.2923292202009922, "ewc_loss": 0.05581538379192352, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002456538495607674, "grad_norm": 6.555303573608398, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8744548559188843, "num_tokens": 387681100.0, "step": 10159 }, { "epoch": 1.2924564304795827, "ewc_loss": 0.055724941194057465, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024474941892549396, "grad_norm": 6.525171756744385, "learning_rate": 1e-06, "loss": 0.3961, "mean_token_accuracy": 0.8835492134094238, "num_tokens": 387715962.0, "step": 10160 }, { "epoch": 1.2925836407581732, "ewc_loss": 0.05581706017255783, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024567058426328003, "grad_norm": 6.560923099517822, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8603501319885254, "num_tokens": 387755856.0, "step": 10161 }, { "epoch": 1.2927108510367638, "ewc_loss": 0.05572091415524483, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024470913922414184, "grad_norm": 6.535760879516602, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8703877925872803, "num_tokens": 387793256.0, "step": 10162 }, { "epoch": 1.2928380613153543, "ewc_loss": 0.055685028433799744, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024435025989077985, "grad_norm": 6.517016887664795, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8568276762962341, "num_tokens": 387834696.0, "step": 10163 }, { "epoch": 1.2929652715939448, "ewc_loss": 0.05569464713335037, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002444464771542698, "grad_norm": 6.584770679473877, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8500147461891174, "num_tokens": 387874973.0, "step": 10164 }, { "epoch": 1.2930924818725353, "ewc_loss": 0.05567074567079544, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024420744739472866, "grad_norm": 6.571450710296631, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8506001234054565, "num_tokens": 387904887.0, "step": 10165 }, { "epoch": 1.2932196921511259, "ewc_loss": 0.05573072284460068, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002448072482366115, "grad_norm": 6.515186786651611, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8688238859176636, "num_tokens": 387944676.0, "step": 10166 }, { "epoch": 1.2933469024297164, "ewc_loss": 0.05580832436680794, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002455832436680794, "grad_norm": 6.582279205322266, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.863085150718689, "num_tokens": 387982234.0, "step": 10167 }, { "epoch": 1.293474112708307, "ewc_loss": 0.05569213628768921, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024442136054858565, "grad_norm": 6.486466407775879, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8622812032699585, "num_tokens": 388018001.0, "step": 10168 }, { "epoch": 1.2936013229868975, "ewc_loss": 0.05582219734787941, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002457219816278666, "grad_norm": 6.553099632263184, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8571580052375793, "num_tokens": 388057476.0, "step": 10169 }, { "epoch": 1.293728533265488, "ewc_loss": 0.055704474449157715, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024454473168589175, "grad_norm": 6.475871562957764, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8805878758430481, "num_tokens": 388095351.0, "step": 10170 }, { "epoch": 1.2938557435440783, "ewc_loss": 0.05575007200241089, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024500073050148785, "grad_norm": 6.6057024002075195, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8630408048629761, "num_tokens": 388135366.0, "step": 10171 }, { "epoch": 1.2939829538226688, "ewc_loss": 0.055648721754550934, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024398721870966256, "grad_norm": 6.5322089195251465, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8510819673538208, "num_tokens": 388175003.0, "step": 10172 }, { "epoch": 1.2941101641012593, "ewc_loss": 0.05565136671066284, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002440136595396325, "grad_norm": 6.591362476348877, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8436415791511536, "num_tokens": 388207083.0, "step": 10173 }, { "epoch": 1.2942373743798499, "ewc_loss": 0.055670082569122314, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002442008117213845, "grad_norm": 6.464899063110352, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.877108097076416, "num_tokens": 388245038.0, "step": 10174 }, { "epoch": 1.2943645846584404, "ewc_loss": 0.0556991770863533, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002444917627144605, "grad_norm": 6.5343146324157715, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8587143421173096, "num_tokens": 388285035.0, "step": 10175 }, { "epoch": 1.294491794937031, "ewc_loss": 0.055720821022987366, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002447082079015672, "grad_norm": 6.494299411773682, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8688392639160156, "num_tokens": 388323971.0, "step": 10176 }, { "epoch": 1.2946190052156215, "ewc_loss": 0.05573011562228203, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024480116553604603, "grad_norm": 6.519228935241699, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8582190275192261, "num_tokens": 388362256.0, "step": 10177 }, { "epoch": 1.294746215494212, "ewc_loss": 0.05573911964893341, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024489121278747916, "grad_norm": 6.499637603759766, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8826620578765869, "num_tokens": 388401120.0, "step": 10178 }, { "epoch": 1.2948734257728025, "ewc_loss": 0.055729012936353683, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002447901351843029, "grad_norm": 6.501040935516357, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8655614852905273, "num_tokens": 388446833.0, "step": 10179 }, { "epoch": 1.2950006360513928, "ewc_loss": 0.055775485932826996, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024525486514903605, "grad_norm": 6.552745819091797, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8747440576553345, "num_tokens": 388482266.0, "step": 10180 }, { "epoch": 1.2951278463299833, "ewc_loss": 0.05570908635854721, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024459086125716567, "grad_norm": 6.503705978393555, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8638206124305725, "num_tokens": 388518024.0, "step": 10181 }, { "epoch": 1.2952550566085739, "ewc_loss": 0.05573398619890213, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002448398736305535, "grad_norm": 6.5164079666137695, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8649382591247559, "num_tokens": 388564246.0, "step": 10182 }, { "epoch": 1.2953822668871644, "ewc_loss": 0.05570089817047119, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445089921820909, "grad_norm": 6.585742473602295, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8660435080528259, "num_tokens": 388603556.0, "step": 10183 }, { "epoch": 1.295509477165755, "ewc_loss": 0.05573529377579689, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024485294125042856, "grad_norm": 6.537811756134033, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8713988065719604, "num_tokens": 388643615.0, "step": 10184 }, { "epoch": 1.2956366874443455, "ewc_loss": 0.05567033216357231, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002442033146508038, "grad_norm": 6.525707244873047, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8660840392112732, "num_tokens": 388682917.0, "step": 10185 }, { "epoch": 1.295763897722936, "ewc_loss": 0.05570664629340172, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024456647224724293, "grad_norm": 6.572463035583496, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8648810386657715, "num_tokens": 388720269.0, "step": 10186 }, { "epoch": 1.2958911080015265, "ewc_loss": 0.05563310533761978, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024383104755543172, "grad_norm": 6.521873474121094, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8556988835334778, "num_tokens": 388759245.0, "step": 10187 }, { "epoch": 1.296018318280117, "ewc_loss": 0.05568493530154228, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002443493576720357, "grad_norm": 6.545714378356934, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8579072952270508, "num_tokens": 388795673.0, "step": 10188 }, { "epoch": 1.2961455285587076, "ewc_loss": 0.05567479506134987, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024424795992672443, "grad_norm": 6.554144859313965, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8675013780593872, "num_tokens": 388832926.0, "step": 10189 }, { "epoch": 1.296272738837298, "ewc_loss": 0.05559298396110535, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024342983670067042, "grad_norm": 6.533188343048096, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8672497272491455, "num_tokens": 388869767.0, "step": 10190 }, { "epoch": 1.2963999491158886, "ewc_loss": 0.05565200001001358, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002440199750708416, "grad_norm": 6.488095283508301, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.86655592918396, "num_tokens": 388913347.0, "step": 10191 }, { "epoch": 1.2965271593944792, "ewc_loss": 0.055705517530441284, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445551799610257, "grad_norm": 6.564075469970703, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8748593330383301, "num_tokens": 388948211.0, "step": 10192 }, { "epoch": 1.2966543696730697, "ewc_loss": 0.055734358727931976, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024484359892085195, "grad_norm": 6.522501468658447, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8686018586158752, "num_tokens": 388989198.0, "step": 10193 }, { "epoch": 1.2967815799516602, "ewc_loss": 0.05576621741056442, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024516216944903135, "grad_norm": 6.532909870147705, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8548377752304077, "num_tokens": 389029076.0, "step": 10194 }, { "epoch": 1.2969087902302507, "ewc_loss": 0.05574969947338104, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024499697610735893, "grad_norm": 6.630764007568359, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8641753792762756, "num_tokens": 389066751.0, "step": 10195 }, { "epoch": 1.297036000508841, "ewc_loss": 0.05570968985557556, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024459691485390067, "grad_norm": 6.520145416259766, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8609281182289124, "num_tokens": 389104207.0, "step": 10196 }, { "epoch": 1.2971632107874316, "ewc_loss": 0.05569228529930115, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002444228739477694, "grad_norm": 6.535524368286133, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8661129474639893, "num_tokens": 389141941.0, "step": 10197 }, { "epoch": 1.297290421066022, "ewc_loss": 0.0557158961892128, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024465896422043443, "grad_norm": 6.464229583740234, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8740243315696716, "num_tokens": 389185326.0, "step": 10198 }, { "epoch": 1.2974176313446126, "ewc_loss": 0.05578196048736572, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002453195920679718, "grad_norm": 6.570193290710449, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8725864887237549, "num_tokens": 389221512.0, "step": 10199 }, { "epoch": 1.2975448416232032, "ewc_loss": 0.05567842721939087, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000244284252403304, "grad_norm": 6.487923622131348, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8710569143295288, "num_tokens": 389259388.0, "step": 10200 }, { "epoch": 1.2976720519017937, "ewc_loss": 0.05576615035533905, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002451615291647613, "grad_norm": 6.556158542633057, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.850783109664917, "num_tokens": 389298773.0, "step": 10201 }, { "epoch": 1.2977992621803842, "ewc_loss": 0.055715467780828476, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002446546859573573, "grad_norm": 6.50861120223999, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8638864755630493, "num_tokens": 389339680.0, "step": 10202 }, { "epoch": 1.2979264724589747, "ewc_loss": 0.05573265254497528, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002448265440762043, "grad_norm": 6.5338239669799805, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8664730787277222, "num_tokens": 389380755.0, "step": 10203 }, { "epoch": 1.2980536827375653, "ewc_loss": 0.055786989629268646, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002453699125908315, "grad_norm": 6.572680473327637, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8669657707214355, "num_tokens": 389412031.0, "step": 10204 }, { "epoch": 1.2981808930161556, "ewc_loss": 0.055707141757011414, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002445713907945901, "grad_norm": 6.565279483795166, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8746827840805054, "num_tokens": 389448299.0, "step": 10205 }, { "epoch": 1.298308103294746, "ewc_loss": 0.05576573312282562, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024515733821317554, "grad_norm": 6.613249778747559, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8584135174751282, "num_tokens": 389483962.0, "step": 10206 }, { "epoch": 1.2984353135733366, "ewc_loss": 0.05568676441907883, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002443676348775625, "grad_norm": 6.52603816986084, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8554287552833557, "num_tokens": 389521989.0, "step": 10207 }, { "epoch": 1.2985625238519272, "ewc_loss": 0.05572386085987091, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002447386214043945, "grad_norm": 6.569697856903076, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8591487407684326, "num_tokens": 389558228.0, "step": 10208 }, { "epoch": 1.2986897341305177, "ewc_loss": 0.055792056024074554, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024542055325582623, "grad_norm": 6.58882474899292, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8616083860397339, "num_tokens": 389597164.0, "step": 10209 }, { "epoch": 1.2988169444091082, "ewc_loss": 0.055769093334674835, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002451909240335226, "grad_norm": 6.506682395935059, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8639746308326721, "num_tokens": 389636435.0, "step": 10210 }, { "epoch": 1.2989441546876987, "ewc_loss": 0.055744051933288574, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002449405437801033, "grad_norm": 6.631468772888184, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8553221821784973, "num_tokens": 389670498.0, "step": 10211 }, { "epoch": 1.2990713649662893, "ewc_loss": 0.05569823831319809, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002444823912810534, "grad_norm": 6.501029968261719, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8652708530426025, "num_tokens": 389707433.0, "step": 10212 }, { "epoch": 1.2991985752448798, "ewc_loss": 0.05577726662158966, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024527267669327557, "grad_norm": 6.586633682250977, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8641833662986755, "num_tokens": 389747300.0, "step": 10213 }, { "epoch": 1.2993257855234703, "ewc_loss": 0.0556701123714447, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002442011027596891, "grad_norm": 6.473180294036865, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8575258255004883, "num_tokens": 389788381.0, "step": 10214 }, { "epoch": 1.2994529958020609, "ewc_loss": 0.0558503158390522, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024600315373390913, "grad_norm": 6.6324143409729, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8655346632003784, "num_tokens": 389821792.0, "step": 10215 }, { "epoch": 1.2995802060806514, "ewc_loss": 0.05569958686828613, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024449589545838535, "grad_norm": 6.491332054138184, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8742511868476868, "num_tokens": 389859370.0, "step": 10216 }, { "epoch": 1.299707416359242, "ewc_loss": 0.05576661229133606, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000245166098466143, "grad_norm": 6.523529052734375, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8540581464767456, "num_tokens": 389897020.0, "step": 10217 }, { "epoch": 1.2998346266378324, "ewc_loss": 0.05579809844493866, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024548100191168487, "grad_norm": 6.690288543701172, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8375168442726135, "num_tokens": 389927107.0, "step": 10218 }, { "epoch": 1.299961836916423, "ewc_loss": 0.05576532334089279, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024515323457308114, "grad_norm": 6.533023834228516, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.860142707824707, "num_tokens": 389966323.0, "step": 10219 }, { "epoch": 1.3000890471950133, "ewc_loss": 0.05579088628292084, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024540885351598263, "grad_norm": 6.541375160217285, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8617491722106934, "num_tokens": 390001416.0, "step": 10220 }, { "epoch": 1.3002162574736038, "ewc_loss": 0.05576768144965172, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002451768086757511, "grad_norm": 6.550125598907471, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8656321167945862, "num_tokens": 390042326.0, "step": 10221 }, { "epoch": 1.3003434677521943, "ewc_loss": 0.05574566125869751, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024495660909451544, "grad_norm": 6.503586769104004, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8541243076324463, "num_tokens": 390084443.0, "step": 10222 }, { "epoch": 1.3004706780307849, "ewc_loss": 0.05580057203769684, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024550571106374264, "grad_norm": 6.5317158699035645, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8567038774490356, "num_tokens": 390126309.0, "step": 10223 }, { "epoch": 1.3005978883093754, "ewc_loss": 0.05578254163265228, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024532544193789363, "grad_norm": 6.547310829162598, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8666231632232666, "num_tokens": 390161463.0, "step": 10224 }, { "epoch": 1.300725098587966, "ewc_loss": 0.05575396120548248, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024503961321897805, "grad_norm": 6.519681453704834, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8733459711074829, "num_tokens": 390203221.0, "step": 10225 }, { "epoch": 1.3008523088665565, "ewc_loss": 0.05579693242907524, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002454693312756717, "grad_norm": 6.561127662658691, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8800241947174072, "num_tokens": 390243269.0, "step": 10226 }, { "epoch": 1.300979519145147, "ewc_loss": 0.055856890976428986, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024606892839074135, "grad_norm": 6.516732692718506, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8642295002937317, "num_tokens": 390282488.0, "step": 10227 }, { "epoch": 1.3011067294237375, "ewc_loss": 0.05584361031651497, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002459360985085368, "grad_norm": 6.532488822937012, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.858587920665741, "num_tokens": 390320268.0, "step": 10228 }, { "epoch": 1.3012339397023278, "ewc_loss": 0.05579950660467148, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002454950590617955, "grad_norm": 6.609436511993408, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8529770374298096, "num_tokens": 390354733.0, "step": 10229 }, { "epoch": 1.3013611499809183, "ewc_loss": 0.055828168988227844, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002457816735841334, "grad_norm": 6.537716865539551, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8619168996810913, "num_tokens": 390388161.0, "step": 10230 }, { "epoch": 1.3014883602595089, "ewc_loss": 0.05592380464076996, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002467380545567721, "grad_norm": 6.579763412475586, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8734517097473145, "num_tokens": 390424609.0, "step": 10231 }, { "epoch": 1.3016155705380994, "ewc_loss": 0.05580522119998932, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002455522189848125, "grad_norm": 6.529665946960449, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.879773736000061, "num_tokens": 390460809.0, "step": 10232 }, { "epoch": 1.30174278081669, "ewc_loss": 0.05593705177307129, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002468705060891807, "grad_norm": 6.58396577835083, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8536105155944824, "num_tokens": 390500198.0, "step": 10233 }, { "epoch": 1.3018699910952805, "ewc_loss": 0.055775705724954605, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002452570479363203, "grad_norm": 6.504892826080322, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8570060729980469, "num_tokens": 390545588.0, "step": 10234 }, { "epoch": 1.301997201373871, "ewc_loss": 0.05591346696019173, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002466346777509898, "grad_norm": 6.6167521476745605, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8532898426055908, "num_tokens": 390581681.0, "step": 10235 }, { "epoch": 1.3021244116524615, "ewc_loss": 0.05576251819729805, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024512517848052084, "grad_norm": 6.467627048492432, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8815784454345703, "num_tokens": 390619122.0, "step": 10236 }, { "epoch": 1.302251621931052, "ewc_loss": 0.05592755973339081, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002467755984980613, "grad_norm": 6.574206352233887, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8505234718322754, "num_tokens": 390667974.0, "step": 10237 }, { "epoch": 1.3023788322096426, "ewc_loss": 0.055767424404621124, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024517421843484044, "grad_norm": 6.535573959350586, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8655610084533691, "num_tokens": 390705188.0, "step": 10238 }, { "epoch": 1.302506042488233, "ewc_loss": 0.05597637593746185, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002472637570463121, "grad_norm": 6.5749382972717285, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8731282949447632, "num_tokens": 390746892.0, "step": 10239 }, { "epoch": 1.3026332527668236, "ewc_loss": 0.05587473884224892, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024624739307910204, "grad_norm": 6.541340351104736, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8778458833694458, "num_tokens": 390787727.0, "step": 10240 }, { "epoch": 1.3027604630454142, "ewc_loss": 0.05584287270903587, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024592873523943126, "grad_norm": 6.599945068359375, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8550575375556946, "num_tokens": 390823702.0, "step": 10241 }, { "epoch": 1.3028876733240047, "ewc_loss": 0.05587950721383095, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024629506515339017, "grad_norm": 6.542276859283447, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8637794852256775, "num_tokens": 390860256.0, "step": 10242 }, { "epoch": 1.3030148836025952, "ewc_loss": 0.05589011311531067, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024640114861540496, "grad_norm": 6.49686861038208, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8753689527511597, "num_tokens": 390901676.0, "step": 10243 }, { "epoch": 1.3031420938811857, "ewc_loss": 0.05591633915901184, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002466634032316506, "grad_norm": 6.570082187652588, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8458825945854187, "num_tokens": 390936911.0, "step": 10244 }, { "epoch": 1.303269304159776, "ewc_loss": 0.055956628173589706, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024706628755666316, "grad_norm": 6.6215949058532715, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.851228654384613, "num_tokens": 390976185.0, "step": 10245 }, { "epoch": 1.3033965144383666, "ewc_loss": 0.05589119344949722, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024641191703267395, "grad_norm": 6.493093013763428, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8628096580505371, "num_tokens": 391014661.0, "step": 10246 }, { "epoch": 1.303523724716957, "ewc_loss": 0.055969059467315674, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024719059001654387, "grad_norm": 6.561287879943848, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8551315069198608, "num_tokens": 391056205.0, "step": 10247 }, { "epoch": 1.3036509349955476, "ewc_loss": 0.05584338307380676, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002459338284097612, "grad_norm": 6.513680458068848, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8645156025886536, "num_tokens": 391092187.0, "step": 10248 }, { "epoch": 1.3037781452741382, "ewc_loss": 0.055938996374607086, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024688997655175626, "grad_norm": 6.565780162811279, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8781359195709229, "num_tokens": 391130890.0, "step": 10249 }, { "epoch": 1.3039053555527287, "ewc_loss": 0.05597486346960068, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002472486230544746, "grad_norm": 6.539200305938721, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8722512722015381, "num_tokens": 391172783.0, "step": 10250 }, { "epoch": 1.3040325658313192, "ewc_loss": 0.05591630935668945, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024666308308951557, "grad_norm": 6.502967834472656, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8455759286880493, "num_tokens": 391215031.0, "step": 10251 }, { "epoch": 1.3041597761099097, "ewc_loss": 0.056008480489254, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002475848305039108, "grad_norm": 6.589995861053467, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.866665244102478, "num_tokens": 391251400.0, "step": 10252 }, { "epoch": 1.3042869863885003, "ewc_loss": 0.05591369792819023, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002466369769535959, "grad_norm": 6.576437473297119, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8537213802337646, "num_tokens": 391285895.0, "step": 10253 }, { "epoch": 1.3044141966670906, "ewc_loss": 0.056002143770456314, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000247521442361176, "grad_norm": 6.5379719734191895, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.864769697189331, "num_tokens": 391328361.0, "step": 10254 }, { "epoch": 1.304541406945681, "ewc_loss": 0.05598406866192818, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024734067847020924, "grad_norm": 6.652101516723633, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8614770770072937, "num_tokens": 391365019.0, "step": 10255 }, { "epoch": 1.3046686172242716, "ewc_loss": 0.055897947400808334, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024647946702316403, "grad_norm": 6.545125484466553, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.847661554813385, "num_tokens": 391400407.0, "step": 10256 }, { "epoch": 1.3047958275028622, "ewc_loss": 0.05600268393754959, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002475268265698105, "grad_norm": 6.585899829864502, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8469299674034119, "num_tokens": 391440660.0, "step": 10257 }, { "epoch": 1.3049230377814527, "ewc_loss": 0.05586186796426773, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002461186668369919, "grad_norm": 6.548556804656982, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8652838468551636, "num_tokens": 391481096.0, "step": 10258 }, { "epoch": 1.3050502480600432, "ewc_loss": 0.055968984961509705, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000247189833316952, "grad_norm": 6.573846340179443, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8629723787307739, "num_tokens": 391516356.0, "step": 10259 }, { "epoch": 1.3051774583386337, "ewc_loss": 0.05591392144560814, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024663921794854105, "grad_norm": 6.59807014465332, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8515483140945435, "num_tokens": 391554691.0, "step": 10260 }, { "epoch": 1.3053046686172243, "ewc_loss": 0.05588795989751816, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002463795826770365, "grad_norm": 6.537429332733154, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.849552571773529, "num_tokens": 391592473.0, "step": 10261 }, { "epoch": 1.3054318788958148, "ewc_loss": 0.05595103278756142, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024701032089069486, "grad_norm": 6.565746784210205, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8689820170402527, "num_tokens": 391632091.0, "step": 10262 }, { "epoch": 1.3055590891744053, "ewc_loss": 0.05588013678789139, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002463013806845993, "grad_norm": 6.540292263031006, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8570843935012817, "num_tokens": 391676121.0, "step": 10263 }, { "epoch": 1.3056862994529959, "ewc_loss": 0.055916957557201385, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002466696023475379, "grad_norm": 6.58666467666626, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8602670431137085, "num_tokens": 391709664.0, "step": 10264 }, { "epoch": 1.3058135097315864, "ewc_loss": 0.055896662175655365, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002464666322339326, "grad_norm": 6.53706169128418, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8507610559463501, "num_tokens": 391755828.0, "step": 10265 }, { "epoch": 1.305940720010177, "ewc_loss": 0.05593935400247574, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024689355632290244, "grad_norm": 6.5315470695495605, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8785802125930786, "num_tokens": 391800232.0, "step": 10266 }, { "epoch": 1.3060679302887674, "ewc_loss": 0.0559237003326416, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024673700681887567, "grad_norm": 6.552474021911621, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8653371930122375, "num_tokens": 391837612.0, "step": 10267 }, { "epoch": 1.306195140567358, "ewc_loss": 0.05598090961575508, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002473091008141637, "grad_norm": 6.63522481918335, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8736473917961121, "num_tokens": 391871193.0, "step": 10268 }, { "epoch": 1.3063223508459483, "ewc_loss": 0.055923037230968475, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024673037114553154, "grad_norm": 6.59705924987793, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8546296954154968, "num_tokens": 391908203.0, "step": 10269 }, { "epoch": 1.3064495611245388, "ewc_loss": 0.05598939210176468, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024739393847994506, "grad_norm": 6.648988246917725, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8690587282180786, "num_tokens": 391938744.0, "step": 10270 }, { "epoch": 1.3065767714031293, "ewc_loss": 0.055836599320173264, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024586598738096654, "grad_norm": 6.527370452880859, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8654680252075195, "num_tokens": 391976020.0, "step": 10271 }, { "epoch": 1.3067039816817199, "ewc_loss": 0.05606399476528168, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024813992786221206, "grad_norm": 6.6256794929504395, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8537272810935974, "num_tokens": 392018006.0, "step": 10272 }, { "epoch": 1.3068311919603104, "ewc_loss": 0.05590832978487015, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024658330949023366, "grad_norm": 6.5481276512146, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8607615232467651, "num_tokens": 392055358.0, "step": 10273 }, { "epoch": 1.306958402238901, "ewc_loss": 0.05597393214702606, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002472393389325589, "grad_norm": 6.541545391082764, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8609687089920044, "num_tokens": 392097727.0, "step": 10274 }, { "epoch": 1.3070856125174914, "ewc_loss": 0.055970579385757446, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002472057822160423, "grad_norm": 6.610871315002441, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.868283748626709, "num_tokens": 392144641.0, "step": 10275 }, { "epoch": 1.307212822796082, "ewc_loss": 0.055980242788791656, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024730240693315864, "grad_norm": 6.6305251121521, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8709883689880371, "num_tokens": 392178810.0, "step": 10276 }, { "epoch": 1.3073400330746725, "ewc_loss": 0.05594278872013092, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024692786973901093, "grad_norm": 6.606296539306641, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8579160571098328, "num_tokens": 392213571.0, "step": 10277 }, { "epoch": 1.3074672433532628, "ewc_loss": 0.05592840909957886, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002467840677127242, "grad_norm": 6.556232929229736, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8592507839202881, "num_tokens": 392254628.0, "step": 10278 }, { "epoch": 1.3075944536318533, "ewc_loss": 0.05590633675456047, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002465633733663708, "grad_norm": 6.575648784637451, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8614349365234375, "num_tokens": 392297243.0, "step": 10279 }, { "epoch": 1.3077216639104439, "ewc_loss": 0.05592650547623634, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024676506291143596, "grad_norm": 6.641745090484619, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.865277886390686, "num_tokens": 392329116.0, "step": 10280 }, { "epoch": 1.3078488741890344, "ewc_loss": 0.055925555527210236, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002467555459588766, "grad_norm": 6.552308559417725, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8591373562812805, "num_tokens": 392370542.0, "step": 10281 }, { "epoch": 1.307976084467625, "ewc_loss": 0.055972158908843994, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024722161469981074, "grad_norm": 6.581557273864746, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8685842752456665, "num_tokens": 392411155.0, "step": 10282 }, { "epoch": 1.3081032947462155, "ewc_loss": 0.05590756982564926, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024657571339048445, "grad_norm": 6.5358991622924805, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8638570308685303, "num_tokens": 392450440.0, "step": 10283 }, { "epoch": 1.308230505024806, "ewc_loss": 0.05605313181877136, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024803131236694753, "grad_norm": 6.57808256149292, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8732089996337891, "num_tokens": 392488545.0, "step": 10284 }, { "epoch": 1.3083577153033965, "ewc_loss": 0.055907174944877625, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024657175526954234, "grad_norm": 6.5417985916137695, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.861009955406189, "num_tokens": 392527138.0, "step": 10285 }, { "epoch": 1.308484925581987, "ewc_loss": 0.0560324490070343, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002478244714438915, "grad_norm": 6.595443248748779, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8625554442405701, "num_tokens": 392568181.0, "step": 10286 }, { "epoch": 1.3086121358605776, "ewc_loss": 0.05595310777425766, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002470310719218105, "grad_norm": 6.505584716796875, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.876392126083374, "num_tokens": 392610900.0, "step": 10287 }, { "epoch": 1.308739346139168, "ewc_loss": 0.0560077503323555, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002475774963386357, "grad_norm": 6.667556285858154, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8540056347846985, "num_tokens": 392646624.0, "step": 10288 }, { "epoch": 1.3088665564177586, "ewc_loss": 0.05591675639152527, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002466675650794059, "grad_norm": 6.540416240692139, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8623597025871277, "num_tokens": 392683103.0, "step": 10289 }, { "epoch": 1.3089937666963491, "ewc_loss": 0.055923059582710266, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002467306039761752, "grad_norm": 6.521727085113525, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8644605875015259, "num_tokens": 392723169.0, "step": 10290 }, { "epoch": 1.3091209769749397, "ewc_loss": 0.055995095521211624, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002474509528838098, "grad_norm": 6.571972370147705, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8783515095710754, "num_tokens": 392762323.0, "step": 10291 }, { "epoch": 1.3092481872535302, "ewc_loss": 0.055947717279195786, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024697717162780464, "grad_norm": 6.572665691375732, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8808786869049072, "num_tokens": 392797555.0, "step": 10292 }, { "epoch": 1.3093753975321207, "ewc_loss": 0.055980537086725235, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002473053755238652, "grad_norm": 6.534412384033203, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8736475110054016, "num_tokens": 392840001.0, "step": 10293 }, { "epoch": 1.309502607810711, "ewc_loss": 0.05606392025947571, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024813920026645064, "grad_norm": 6.612438678741455, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8626381754875183, "num_tokens": 392880871.0, "step": 10294 }, { "epoch": 1.3096298180893016, "ewc_loss": 0.05590229481458664, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002465229481458664, "grad_norm": 6.615163326263428, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8736985921859741, "num_tokens": 392917554.0, "step": 10295 }, { "epoch": 1.309757028367892, "ewc_loss": 0.05592954903841019, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024679547641426325, "grad_norm": 6.495134353637695, "learning_rate": 1e-06, "loss": 0.397, "mean_token_accuracy": 0.8812426328659058, "num_tokens": 392958601.0, "step": 10296 }, { "epoch": 1.3098842386464826, "ewc_loss": 0.05601873621344566, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024768736329860985, "grad_norm": 6.616601943969727, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8649604916572571, "num_tokens": 393000108.0, "step": 10297 }, { "epoch": 1.3100114489250732, "ewc_loss": 0.05591292679309845, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002466292935423553, "grad_norm": 6.589182376861572, "learning_rate": 1e-06, "loss": 0.3935, "mean_token_accuracy": 0.8837454915046692, "num_tokens": 393032321.0, "step": 10298 }, { "epoch": 1.3101386592036637, "ewc_loss": 0.05596115067601204, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002471115149091929, "grad_norm": 6.585209846496582, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8618263006210327, "num_tokens": 393068347.0, "step": 10299 }, { "epoch": 1.3102658694822542, "ewc_loss": 0.05598106235265732, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002473106433171779, "grad_norm": 6.652920246124268, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8800243735313416, "num_tokens": 393102509.0, "step": 10300 }, { "epoch": 1.3103930797608447, "ewc_loss": 0.05594620853662491, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024696209584362805, "grad_norm": 6.633204936981201, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8725298047065735, "num_tokens": 393140974.0, "step": 10301 }, { "epoch": 1.3105202900394353, "ewc_loss": 0.05591972544789314, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024669725098647177, "grad_norm": 6.620640754699707, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8734745979309082, "num_tokens": 393177214.0, "step": 10302 }, { "epoch": 1.3106475003180256, "ewc_loss": 0.05591465160250664, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002466465230099857, "grad_norm": 6.601965427398682, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8663630485534668, "num_tokens": 393215801.0, "step": 10303 }, { "epoch": 1.310774710596616, "ewc_loss": 0.05593327432870865, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024683272931724787, "grad_norm": 6.585343837738037, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8739609718322754, "num_tokens": 393256444.0, "step": 10304 }, { "epoch": 1.3109019208752066, "ewc_loss": 0.05585312470793724, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002460312389302999, "grad_norm": 6.475041389465332, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8601542711257935, "num_tokens": 393300238.0, "step": 10305 }, { "epoch": 1.3110291311537972, "ewc_loss": 0.056123875081539154, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002487387682776898, "grad_norm": 6.674372673034668, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8503893613815308, "num_tokens": 393344087.0, "step": 10306 }, { "epoch": 1.3111563414323877, "ewc_loss": 0.05598486587405205, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002473486529197544, "grad_norm": 6.555605411529541, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8688461780548096, "num_tokens": 393381678.0, "step": 10307 }, { "epoch": 1.3112835517109782, "ewc_loss": 0.05609385669231415, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024843859137035906, "grad_norm": 7.751188278198242, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8629262447357178, "num_tokens": 393415740.0, "step": 10308 }, { "epoch": 1.3114107619895687, "ewc_loss": 0.05636140704154968, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002511140482965857, "grad_norm": 6.465755462646484, "learning_rate": 1e-06, "loss": 0.5351, "mean_token_accuracy": 0.8391761183738708, "num_tokens": 393457692.0, "step": 10309 }, { "epoch": 1.3115379722681593, "ewc_loss": 0.05646918714046478, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025219187955372036, "grad_norm": 6.757272720336914, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8592177629470825, "num_tokens": 393499003.0, "step": 10310 }, { "epoch": 1.3116651825467498, "ewc_loss": 0.05591672658920288, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024666727404110134, "grad_norm": 6.559811115264893, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8524646759033203, "num_tokens": 393533842.0, "step": 10311 }, { "epoch": 1.3117923928253403, "ewc_loss": 0.05643129721283913, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025181297678500414, "grad_norm": 6.705611705780029, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8718789219856262, "num_tokens": 393573906.0, "step": 10312 }, { "epoch": 1.3119196031039309, "ewc_loss": 0.056006357073783875, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002475635556038469, "grad_norm": 6.57563591003418, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8613914251327515, "num_tokens": 393615298.0, "step": 10313 }, { "epoch": 1.3120468133825214, "ewc_loss": 0.056163206696510315, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002491320774424821, "grad_norm": 6.651361465454102, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8537314534187317, "num_tokens": 393654690.0, "step": 10314 }, { "epoch": 1.312174023661112, "ewc_loss": 0.05611538887023926, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000248653901508078, "grad_norm": 6.629081726074219, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8649253845214844, "num_tokens": 393696222.0, "step": 10315 }, { "epoch": 1.3123012339397024, "ewc_loss": 0.05609089136123657, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002484089054632932, "grad_norm": 6.625589847564697, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8607903718948364, "num_tokens": 393736748.0, "step": 10316 }, { "epoch": 1.312428444218293, "ewc_loss": 0.05605587363243103, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024805875727906823, "grad_norm": 6.604578018188477, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8550299406051636, "num_tokens": 393773077.0, "step": 10317 }, { "epoch": 1.3125556544968833, "ewc_loss": 0.056085050106048584, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024835049407556653, "grad_norm": 6.632282257080078, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8753805756568909, "num_tokens": 393806466.0, "step": 10318 }, { "epoch": 1.3126828647754738, "ewc_loss": 0.05605223402380943, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024802234838716686, "grad_norm": 6.682828426361084, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8599996566772461, "num_tokens": 393840922.0, "step": 10319 }, { "epoch": 1.3128100750540643, "ewc_loss": 0.05598100647330284, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024731006124056876, "grad_norm": 6.575197219848633, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8681725263595581, "num_tokens": 393874544.0, "step": 10320 }, { "epoch": 1.3129372853326549, "ewc_loss": 0.05602317675948143, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024773177574388683, "grad_norm": 6.601997375488281, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8598518371582031, "num_tokens": 393917773.0, "step": 10321 }, { "epoch": 1.3130644956112454, "ewc_loss": 0.05598375201225281, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002473375352565199, "grad_norm": 6.606886863708496, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8584500551223755, "num_tokens": 393951525.0, "step": 10322 }, { "epoch": 1.313191705889836, "ewc_loss": 0.05604031682014465, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024790316820144653, "grad_norm": 6.64102029800415, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8676453232765198, "num_tokens": 393989796.0, "step": 10323 }, { "epoch": 1.3133189161684264, "ewc_loss": 0.05600053071975708, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002475052897352725, "grad_norm": 6.587094783782959, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8597090244293213, "num_tokens": 394028746.0, "step": 10324 }, { "epoch": 1.313446126447017, "ewc_loss": 0.05599452182650566, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024744521942920983, "grad_norm": 6.596107482910156, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8712587356567383, "num_tokens": 394065241.0, "step": 10325 }, { "epoch": 1.3135733367256075, "ewc_loss": 0.056022875010967255, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024772874894551933, "grad_norm": 6.568192958831787, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8580905199050903, "num_tokens": 394110113.0, "step": 10326 }, { "epoch": 1.3137005470041978, "ewc_loss": 0.055969275534152985, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024719274369999766, "grad_norm": 6.566166400909424, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8457025289535522, "num_tokens": 394146577.0, "step": 10327 }, { "epoch": 1.3138277572827883, "ewc_loss": 0.056043609976768494, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024793611373752356, "grad_norm": 6.575090408325195, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8641988039016724, "num_tokens": 394185392.0, "step": 10328 }, { "epoch": 1.3139549675613789, "ewc_loss": 0.056042589247226715, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024792589829303324, "grad_norm": 6.560995101928711, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8544175028800964, "num_tokens": 394226301.0, "step": 10329 }, { "epoch": 1.3140821778399694, "ewc_loss": 0.056046534329652786, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002479653339833021, "grad_norm": 6.6285552978515625, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8548770546913147, "num_tokens": 394262654.0, "step": 10330 }, { "epoch": 1.31420938811856, "ewc_loss": 0.05601909011602402, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002476909139659256, "grad_norm": 6.5992817878723145, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8751435875892639, "num_tokens": 394299185.0, "step": 10331 }, { "epoch": 1.3143365983971504, "ewc_loss": 0.05603466555476189, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024784664856269956, "grad_norm": 6.580697059631348, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8638495802879333, "num_tokens": 394331364.0, "step": 10332 }, { "epoch": 1.314463808675741, "ewc_loss": 0.055998027324676514, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002474802895449102, "grad_norm": 6.609927177429199, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8513269424438477, "num_tokens": 394375479.0, "step": 10333 }, { "epoch": 1.3145910189543315, "ewc_loss": 0.05602804943919182, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002477804955560714, "grad_norm": 6.595495223999023, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8687647581100464, "num_tokens": 394414913.0, "step": 10334 }, { "epoch": 1.314718229232922, "ewc_loss": 0.05610395595431328, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024853955255821347, "grad_norm": 6.590665817260742, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8637232780456543, "num_tokens": 394450889.0, "step": 10335 }, { "epoch": 1.3148454395115126, "ewc_loss": 0.056006234139204025, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002475623332429677, "grad_norm": 6.571486473083496, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8474187850952148, "num_tokens": 394495280.0, "step": 10336 }, { "epoch": 1.314972649790103, "ewc_loss": 0.05611834675073624, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024868344189599156, "grad_norm": 6.658082485198975, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8725898265838623, "num_tokens": 394536811.0, "step": 10337 }, { "epoch": 1.3150998600686936, "ewc_loss": 0.055986665189266205, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024736663908697665, "grad_norm": 6.532495975494385, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8599454164505005, "num_tokens": 394573353.0, "step": 10338 }, { "epoch": 1.3152270703472841, "ewc_loss": 0.05609368532896042, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002484368742443621, "grad_norm": 6.703468322753906, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.855806827545166, "num_tokens": 394603517.0, "step": 10339 }, { "epoch": 1.3153542806258747, "ewc_loss": 0.05598406866192818, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024734067847020924, "grad_norm": 6.495811939239502, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8694568872451782, "num_tokens": 394650547.0, "step": 10340 }, { "epoch": 1.3154814909044652, "ewc_loss": 0.05611495301127434, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024864953593350947, "grad_norm": 6.666779518127441, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8676101565361023, "num_tokens": 394684460.0, "step": 10341 }, { "epoch": 1.3156087011830557, "ewc_loss": 0.05590152367949486, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024651523563079536, "grad_norm": 6.466965198516846, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.8793317675590515, "num_tokens": 394733520.0, "step": 10342 }, { "epoch": 1.315735911461646, "ewc_loss": 0.05618736892938614, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024937366833910346, "grad_norm": 6.668779373168945, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8575467467308044, "num_tokens": 394775498.0, "step": 10343 }, { "epoch": 1.3158631217402366, "ewc_loss": 0.05596637353301048, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002471637271810323, "grad_norm": 6.539675712585449, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8503141403198242, "num_tokens": 394815138.0, "step": 10344 }, { "epoch": 1.315990332018827, "ewc_loss": 0.05613928660750389, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002488928730599582, "grad_norm": 6.679751396179199, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8558181524276733, "num_tokens": 394850447.0, "step": 10345 }, { "epoch": 1.3161175422974176, "ewc_loss": 0.05600357800722122, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002475357614457607, "grad_norm": 6.553990364074707, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8567529916763306, "num_tokens": 394891800.0, "step": 10346 }, { "epoch": 1.3162447525760081, "ewc_loss": 0.056158192455768585, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002490819024387747, "grad_norm": 6.601832866668701, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8690981864929199, "num_tokens": 394930087.0, "step": 10347 }, { "epoch": 1.3163719628545987, "ewc_loss": 0.05610329657793045, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024853297509253025, "grad_norm": 6.599739074707031, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8728639483451843, "num_tokens": 394964497.0, "step": 10348 }, { "epoch": 1.3164991731331892, "ewc_loss": 0.05612298101186752, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002487298334017396, "grad_norm": 6.6099395751953125, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8453711867332458, "num_tokens": 395003127.0, "step": 10349 }, { "epoch": 1.3166263834117797, "ewc_loss": 0.05606229603290558, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002481229603290558, "grad_norm": 6.590786457061768, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8537206649780273, "num_tokens": 395040570.0, "step": 10350 }, { "epoch": 1.3167535936903703, "ewc_loss": 0.05617036670446396, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002492036728654057, "grad_norm": 6.6276774406433105, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8653274178504944, "num_tokens": 395075205.0, "step": 10351 }, { "epoch": 1.3168808039689606, "ewc_loss": 0.05607493966817856, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002482494164723903, "grad_norm": 6.580617427825928, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.862288236618042, "num_tokens": 395117492.0, "step": 10352 }, { "epoch": 1.317008014247551, "ewc_loss": 0.05609576404094696, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024845762527547777, "grad_norm": 6.5623016357421875, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8606852293014526, "num_tokens": 395162449.0, "step": 10353 }, { "epoch": 1.3171352245261416, "ewc_loss": 0.05609123036265373, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024841231061145663, "grad_norm": 6.593221664428711, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8721914887428284, "num_tokens": 395197310.0, "step": 10354 }, { "epoch": 1.3172624348047322, "ewc_loss": 0.05612028017640114, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002487027959432453, "grad_norm": 6.57676362991333, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8745834231376648, "num_tokens": 395232420.0, "step": 10355 }, { "epoch": 1.3173896450833227, "ewc_loss": 0.056145407259464264, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024895407841540873, "grad_norm": 6.605027675628662, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8413921594619751, "num_tokens": 395269392.0, "step": 10356 }, { "epoch": 1.3175168553619132, "ewc_loss": 0.056116558611392975, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024866557214409113, "grad_norm": 6.566540241241455, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8512734770774841, "num_tokens": 395305496.0, "step": 10357 }, { "epoch": 1.3176440656405037, "ewc_loss": 0.05617011338472366, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002492011117283255, "grad_norm": 6.6089653968811035, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8723829984664917, "num_tokens": 395347089.0, "step": 10358 }, { "epoch": 1.3177712759190943, "ewc_loss": 0.05612427741289139, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024874278460629284, "grad_norm": 6.567927837371826, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8754082322120667, "num_tokens": 395382885.0, "step": 10359 }, { "epoch": 1.3178984861976848, "ewc_loss": 0.05619840323925018, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024948405916802585, "grad_norm": 6.630986213684082, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8775123357772827, "num_tokens": 395416260.0, "step": 10360 }, { "epoch": 1.3180256964762753, "ewc_loss": 0.05616551637649536, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002491551567800343, "grad_norm": 6.620765686035156, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8548012375831604, "num_tokens": 395454179.0, "step": 10361 }, { "epoch": 1.3181529067548658, "ewc_loss": 0.05609184876084328, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024841848062351346, "grad_norm": 6.5884199142456055, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8528510928153992, "num_tokens": 395489173.0, "step": 10362 }, { "epoch": 1.3182801170334564, "ewc_loss": 0.056206587702035904, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024956587003543973, "grad_norm": 6.5867109298706055, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8562685251235962, "num_tokens": 395532057.0, "step": 10363 }, { "epoch": 1.318407327312047, "ewc_loss": 0.05618666484951973, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002493666543159634, "grad_norm": 6.593871593475342, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8699119687080383, "num_tokens": 395570321.0, "step": 10364 }, { "epoch": 1.3185345375906374, "ewc_loss": 0.0562402717769146, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024990271776914597, "grad_norm": 6.6038007736206055, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8836039304733276, "num_tokens": 395609041.0, "step": 10365 }, { "epoch": 1.318661747869228, "ewc_loss": 0.05620262026786804, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002495262015145272, "grad_norm": 6.648895740509033, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8400103449821472, "num_tokens": 395649961.0, "step": 10366 }, { "epoch": 1.3187889581478183, "ewc_loss": 0.0561252124607563, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024875212693586946, "grad_norm": 6.566006660461426, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8638923764228821, "num_tokens": 395691882.0, "step": 10367 }, { "epoch": 1.3189161684264088, "ewc_loss": 0.05622916668653488, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002497916866559535, "grad_norm": 6.67292594909668, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8501507043838501, "num_tokens": 395731496.0, "step": 10368 }, { "epoch": 1.3190433787049993, "ewc_loss": 0.05608277767896652, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002483277930878103, "grad_norm": 6.572150707244873, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8679229021072388, "num_tokens": 395772618.0, "step": 10369 }, { "epoch": 1.3191705889835899, "ewc_loss": 0.05618275701999664, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024932756787166, "grad_norm": 6.69646692276001, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8453608751296997, "num_tokens": 395811907.0, "step": 10370 }, { "epoch": 1.3192977992621804, "ewc_loss": 0.05606216564774513, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024812165065668523, "grad_norm": 6.581058502197266, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8567581176757812, "num_tokens": 395851186.0, "step": 10371 }, { "epoch": 1.319425009540771, "ewc_loss": 0.05616573616862297, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000249157368671149, "grad_norm": 6.703203201293945, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8636112213134766, "num_tokens": 395893868.0, "step": 10372 }, { "epoch": 1.3195522198193614, "ewc_loss": 0.056046806275844574, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024796806974336505, "grad_norm": 6.5760297775268555, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8733692765235901, "num_tokens": 395930519.0, "step": 10373 }, { "epoch": 1.319679430097952, "ewc_loss": 0.0562012754380703, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002495127555448562, "grad_norm": 6.744472980499268, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8783022165298462, "num_tokens": 395965692.0, "step": 10374 }, { "epoch": 1.3198066403765425, "ewc_loss": 0.05600931495428085, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024759312509559095, "grad_norm": 6.573897838592529, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8634948134422302, "num_tokens": 396002316.0, "step": 10375 }, { "epoch": 1.3199338506551328, "ewc_loss": 0.056150443851947784, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002490044280420989, "grad_norm": 6.7487473487854, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8689481019973755, "num_tokens": 396043392.0, "step": 10376 }, { "epoch": 1.3200610609337233, "ewc_loss": 0.05600123107433319, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002475123037584126, "grad_norm": 6.569840908050537, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8608304262161255, "num_tokens": 396078603.0, "step": 10377 }, { "epoch": 1.3201882712123139, "ewc_loss": 0.05610564351081848, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024855646188370883, "grad_norm": 6.656605243682861, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8546794652938843, "num_tokens": 396121132.0, "step": 10378 }, { "epoch": 1.3203154814909044, "ewc_loss": 0.05600389838218689, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002475389919709414, "grad_norm": 6.670967102050781, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8721033334732056, "num_tokens": 396155730.0, "step": 10379 }, { "epoch": 1.320442691769495, "ewc_loss": 0.05605079233646393, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002480079128872603, "grad_norm": 6.659524917602539, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8493328094482422, "num_tokens": 396190976.0, "step": 10380 }, { "epoch": 1.3205699020480854, "ewc_loss": 0.05608014762401581, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002483014832250774, "grad_norm": 6.665711402893066, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8629776239395142, "num_tokens": 396229571.0, "step": 10381 }, { "epoch": 1.320697112326676, "ewc_loss": 0.05603564903140068, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024785648565739393, "grad_norm": 6.613987922668457, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8632221817970276, "num_tokens": 396265586.0, "step": 10382 }, { "epoch": 1.3208243226052665, "ewc_loss": 0.05609007552266121, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002484007563907653, "grad_norm": 6.6196980476379395, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8687816858291626, "num_tokens": 396303810.0, "step": 10383 }, { "epoch": 1.320951532883857, "ewc_loss": 0.05601236969232559, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002476236841175705, "grad_norm": 6.592174053192139, "learning_rate": 1e-06, "loss": 0.3897, "mean_token_accuracy": 0.8826699256896973, "num_tokens": 396345671.0, "step": 10384 }, { "epoch": 1.3210787431624476, "ewc_loss": 0.05608135834336281, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002483135904185474, "grad_norm": 6.538773059844971, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8797541856765747, "num_tokens": 396388062.0, "step": 10385 }, { "epoch": 1.321205953441038, "ewc_loss": 0.056118059903383255, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002486805897206068, "grad_norm": 6.6394171714782715, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.876758337020874, "num_tokens": 396425189.0, "step": 10386 }, { "epoch": 1.3213331637196286, "ewc_loss": 0.056052494794130325, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002480249386280775, "grad_norm": 6.599130153656006, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8530961275100708, "num_tokens": 396461861.0, "step": 10387 }, { "epoch": 1.3214603739982191, "ewc_loss": 0.05605832114815712, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002480832044966519, "grad_norm": 6.554145336151123, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8736667037010193, "num_tokens": 396506190.0, "step": 10388 }, { "epoch": 1.3215875842768097, "ewc_loss": 0.056159719824790955, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002490971819497645, "grad_norm": 6.656214714050293, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8651411533355713, "num_tokens": 396538900.0, "step": 10389 }, { "epoch": 1.3217147945554002, "ewc_loss": 0.05611569061875343, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000248656899202615, "grad_norm": 6.552053928375244, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8718992471694946, "num_tokens": 396577029.0, "step": 10390 }, { "epoch": 1.3218420048339907, "ewc_loss": 0.05617199093103409, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002492198836989701, "grad_norm": 6.593378067016602, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.848872721195221, "num_tokens": 396616917.0, "step": 10391 }, { "epoch": 1.321969215112581, "ewc_loss": 0.05616431310772896, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024914313689805567, "grad_norm": 6.58165979385376, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8723716735839844, "num_tokens": 396651726.0, "step": 10392 }, { "epoch": 1.3220964253911716, "ewc_loss": 0.05621834471821785, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024968344951048493, "grad_norm": 6.593207836151123, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8515094518661499, "num_tokens": 396693796.0, "step": 10393 }, { "epoch": 1.322223635669762, "ewc_loss": 0.05619499832391739, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024945000768639147, "grad_norm": 6.640789985656738, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8527325391769409, "num_tokens": 396731947.0, "step": 10394 }, { "epoch": 1.3223508459483526, "ewc_loss": 0.056203510612249374, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024953510728664696, "grad_norm": 6.588993549346924, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8672040700912476, "num_tokens": 396767062.0, "step": 10395 }, { "epoch": 1.3224780562269431, "ewc_loss": 0.056225769221782684, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024975769338198006, "grad_norm": 6.658536911010742, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8666408061981201, "num_tokens": 396803918.0, "step": 10396 }, { "epoch": 1.3226052665055337, "ewc_loss": 0.05614820867776871, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002489820763003081, "grad_norm": 6.606101989746094, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.837178111076355, "num_tokens": 396850314.0, "step": 10397 }, { "epoch": 1.3227324767841242, "ewc_loss": 0.05617040395736694, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002492040512152016, "grad_norm": 6.6143717765808105, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.856232225894928, "num_tokens": 396887920.0, "step": 10398 }, { "epoch": 1.3228596870627147, "ewc_loss": 0.05616265535354614, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024912654771469533, "grad_norm": 6.660998821258545, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8527704477310181, "num_tokens": 396923369.0, "step": 10399 }, { "epoch": 1.3229868973413053, "ewc_loss": 0.056182991713285446, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000249329925281927, "grad_norm": 6.595564365386963, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8614106178283691, "num_tokens": 396962338.0, "step": 10400 }, { "epoch": 1.3231141076198956, "ewc_loss": 0.05610775575041771, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024857756216078997, "grad_norm": 6.5756449699401855, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8741786479949951, "num_tokens": 396998275.0, "step": 10401 }, { "epoch": 1.323241317898486, "ewc_loss": 0.05619565397500992, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024945655604824424, "grad_norm": 6.592413425445557, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.859644889831543, "num_tokens": 397040034.0, "step": 10402 }, { "epoch": 1.3233685281770766, "ewc_loss": 0.05618490278720856, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024934904649853706, "grad_norm": 6.609645366668701, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.883467435836792, "num_tokens": 397080549.0, "step": 10403 }, { "epoch": 1.3234957384556671, "ewc_loss": 0.056195780634880066, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002494578075129539, "grad_norm": 6.589501857757568, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8762049674987793, "num_tokens": 397120803.0, "step": 10404 }, { "epoch": 1.3236229487342577, "ewc_loss": 0.05622604861855507, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002497604873497039, "grad_norm": 6.631043434143066, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8683269023895264, "num_tokens": 397160463.0, "step": 10405 }, { "epoch": 1.3237501590128482, "ewc_loss": 0.056152187287807465, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002490218903403729, "grad_norm": 6.57476806640625, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8567677140235901, "num_tokens": 397205425.0, "step": 10406 }, { "epoch": 1.3238773692914387, "ewc_loss": 0.05620434507727623, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024954346008598804, "grad_norm": 6.644955635070801, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8580719828605652, "num_tokens": 397238907.0, "step": 10407 }, { "epoch": 1.3240045795700293, "ewc_loss": 0.05615587159991264, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002490587066859007, "grad_norm": 6.616510391235352, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8639816045761108, "num_tokens": 397281639.0, "step": 10408 }, { "epoch": 1.3241317898486198, "ewc_loss": 0.056198831647634506, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024948830832727253, "grad_norm": 6.618048667907715, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8523362874984741, "num_tokens": 397323350.0, "step": 10409 }, { "epoch": 1.3242590001272103, "ewc_loss": 0.056134387850761414, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024884389131329954, "grad_norm": 6.557782173156738, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8569475412368774, "num_tokens": 397364438.0, "step": 10410 }, { "epoch": 1.3243862104058008, "ewc_loss": 0.05625021085143089, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002500021073501557, "grad_norm": 6.662275314331055, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8504821062088013, "num_tokens": 397406881.0, "step": 10411 }, { "epoch": 1.3245134206843914, "ewc_loss": 0.05608352646231651, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024833527277223766, "grad_norm": 6.522565841674805, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8643612861633301, "num_tokens": 397451190.0, "step": 10412 }, { "epoch": 1.324640630962982, "ewc_loss": 0.05634523928165436, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000250952405622229, "grad_norm": 6.664007186889648, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8633440732955933, "num_tokens": 397491063.0, "step": 10413 }, { "epoch": 1.3247678412415724, "ewc_loss": 0.056130439043045044, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024880439741536975, "grad_norm": 6.588656425476074, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8635035753250122, "num_tokens": 397529961.0, "step": 10414 }, { "epoch": 1.324895051520163, "ewc_loss": 0.05625149607658386, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025001494213938713, "grad_norm": 6.653256416320801, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8748146891593933, "num_tokens": 397565616.0, "step": 10415 }, { "epoch": 1.3250222617987533, "ewc_loss": 0.056141532957553864, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024891531211324036, "grad_norm": 6.604786396026611, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8694237470626831, "num_tokens": 397602223.0, "step": 10416 }, { "epoch": 1.3251494720773438, "ewc_loss": 0.05625471472740173, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025004716007970273, "grad_norm": 6.699580192565918, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8426693677902222, "num_tokens": 397638910.0, "step": 10417 }, { "epoch": 1.3252766823559343, "ewc_loss": 0.05615769326686859, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002490769256837666, "grad_norm": 6.585533142089844, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8556171655654907, "num_tokens": 397681200.0, "step": 10418 }, { "epoch": 1.3254038926345248, "ewc_loss": 0.05628088116645813, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025030883261933923, "grad_norm": 6.630200386047363, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8654308319091797, "num_tokens": 397724832.0, "step": 10419 }, { "epoch": 1.3255311029131154, "ewc_loss": 0.05626460909843445, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025014608399942517, "grad_norm": 6.639220714569092, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8557013273239136, "num_tokens": 397765537.0, "step": 10420 }, { "epoch": 1.325658313191706, "ewc_loss": 0.05620599538087845, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024955993285402656, "grad_norm": 6.619616508483887, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8555771708488464, "num_tokens": 397800945.0, "step": 10421 }, { "epoch": 1.3257855234702964, "ewc_loss": 0.05626547336578369, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002501547569409013, "grad_norm": 6.650076389312744, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8683061003684998, "num_tokens": 397840786.0, "step": 10422 }, { "epoch": 1.325912733748887, "ewc_loss": 0.056192174553871155, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000249421747867018, "grad_norm": 6.629065990447998, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8683121800422668, "num_tokens": 397875607.0, "step": 10423 }, { "epoch": 1.3260399440274775, "ewc_loss": 0.056271087378263474, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025021086912602186, "grad_norm": 6.668142795562744, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8794426918029785, "num_tokens": 397909524.0, "step": 10424 }, { "epoch": 1.3261671543060678, "ewc_loss": 0.056198105216026306, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024948103236965835, "grad_norm": 6.626270294189453, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8475497364997864, "num_tokens": 397947862.0, "step": 10425 }, { "epoch": 1.3262943645846583, "ewc_loss": 0.05621028319001198, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002496028319001198, "grad_norm": 6.685824871063232, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8775690197944641, "num_tokens": 397980958.0, "step": 10426 }, { "epoch": 1.3264215748632489, "ewc_loss": 0.05616939067840576, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002491938939783722, "grad_norm": 6.587759971618652, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8601983785629272, "num_tokens": 398021231.0, "step": 10427 }, { "epoch": 1.3265487851418394, "ewc_loss": 0.056347765028476715, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002509776677470654, "grad_norm": 6.669130802154541, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8613526821136475, "num_tokens": 398059202.0, "step": 10428 }, { "epoch": 1.32667599542043, "ewc_loss": 0.05619170516729355, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024941706215031445, "grad_norm": 6.568771839141846, "learning_rate": 1e-06, "loss": 0.3844, "mean_token_accuracy": 0.8849916458129883, "num_tokens": 398097121.0, "step": 10429 }, { "epoch": 1.3268032056990204, "ewc_loss": 0.056392133235931396, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025142135564237833, "grad_norm": 6.697590351104736, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8626781702041626, "num_tokens": 398130413.0, "step": 10430 }, { "epoch": 1.326930415977611, "ewc_loss": 0.05619282275438309, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002494282089173794, "grad_norm": 6.622803688049316, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.861453115940094, "num_tokens": 398172535.0, "step": 10431 }, { "epoch": 1.3270576262562015, "ewc_loss": 0.05636826530098915, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002511826460249722, "grad_norm": 6.6831512451171875, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8638942241668701, "num_tokens": 398206681.0, "step": 10432 }, { "epoch": 1.327184836534792, "ewc_loss": 0.056151263415813446, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002490126353222877, "grad_norm": 6.636630058288574, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8663705587387085, "num_tokens": 398245206.0, "step": 10433 }, { "epoch": 1.3273120468133826, "ewc_loss": 0.056273914873600006, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002502391580492258, "grad_norm": 6.682236194610596, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8599027991294861, "num_tokens": 398278902.0, "step": 10434 }, { "epoch": 1.327439257091973, "ewc_loss": 0.05621516332030296, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002496516390237957, "grad_norm": 6.642877578735352, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8643356561660767, "num_tokens": 398315833.0, "step": 10435 }, { "epoch": 1.3275664673705636, "ewc_loss": 0.056191399693489075, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002494140062481165, "grad_norm": 6.64070987701416, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8466055393218994, "num_tokens": 398356496.0, "step": 10436 }, { "epoch": 1.3276936776491541, "ewc_loss": 0.05621698871254921, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002496698871254921, "grad_norm": 6.593308448791504, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8698345422744751, "num_tokens": 398393661.0, "step": 10437 }, { "epoch": 1.3278208879277447, "ewc_loss": 0.056166645139455795, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002491664490662515, "grad_norm": 6.60344934463501, "learning_rate": 1e-06, "loss": 0.393, "mean_token_accuracy": 0.8803064823150635, "num_tokens": 398430463.0, "step": 10438 }, { "epoch": 1.3279480982063352, "ewc_loss": 0.05624557286500931, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024995574494823813, "grad_norm": 6.674689292907715, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.862464427947998, "num_tokens": 398470382.0, "step": 10439 }, { "epoch": 1.3280753084849257, "ewc_loss": 0.056213922798633575, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024963924079202116, "grad_norm": 6.665046215057373, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8463121652603149, "num_tokens": 398504793.0, "step": 10440 }, { "epoch": 1.328202518763516, "ewc_loss": 0.05628306046128273, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025033060228452086, "grad_norm": 6.663033962249756, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8544094562530518, "num_tokens": 398541038.0, "step": 10441 }, { "epoch": 1.3283297290421066, "ewc_loss": 0.05618656426668167, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002493656356818974, "grad_norm": 6.6073455810546875, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8463588953018188, "num_tokens": 398578047.0, "step": 10442 }, { "epoch": 1.328456939320697, "ewc_loss": 0.05628912150859833, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025039122556336224, "grad_norm": 6.6467695236206055, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8512896299362183, "num_tokens": 398618946.0, "step": 10443 }, { "epoch": 1.3285841495992876, "ewc_loss": 0.0562506839632988, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025000685127452016, "grad_norm": 6.649231910705566, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8670455813407898, "num_tokens": 398656895.0, "step": 10444 }, { "epoch": 1.3287113598778781, "ewc_loss": 0.05628158897161484, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025031587574630976, "grad_norm": 6.687278747558594, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.863633394241333, "num_tokens": 398696521.0, "step": 10445 }, { "epoch": 1.3288385701564687, "ewc_loss": 0.05627603828907013, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002502604038454592, "grad_norm": 6.678901195526123, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8645755648612976, "num_tokens": 398735275.0, "step": 10446 }, { "epoch": 1.3289657804350592, "ewc_loss": 0.05622158199548721, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002497158420737833, "grad_norm": 6.638190746307373, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8475821018218994, "num_tokens": 398774753.0, "step": 10447 }, { "epoch": 1.3290929907136497, "ewc_loss": 0.0562058687210083, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002495587104931474, "grad_norm": 6.701256275177002, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8568547964096069, "num_tokens": 398810893.0, "step": 10448 }, { "epoch": 1.3292202009922403, "ewc_loss": 0.05622541159391403, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024975414271466434, "grad_norm": 6.629760265350342, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8578945398330688, "num_tokens": 398849741.0, "step": 10449 }, { "epoch": 1.3293474112708306, "ewc_loss": 0.056242238730192184, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002499223919585347, "grad_norm": 6.775600433349609, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8572577238082886, "num_tokens": 398881186.0, "step": 10450 }, { "epoch": 1.329474621549421, "ewc_loss": 0.0561840757727623, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024934078101068735, "grad_norm": 6.569684982299805, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8799833059310913, "num_tokens": 398918677.0, "step": 10451 }, { "epoch": 1.3296018318280116, "ewc_loss": 0.05637761205434799, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025127612752839923, "grad_norm": 6.771867275238037, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.861351490020752, "num_tokens": 398953435.0, "step": 10452 }, { "epoch": 1.3297290421066021, "ewc_loss": 0.05608337000012398, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000248333701165393, "grad_norm": 6.6068434715271, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8700991868972778, "num_tokens": 398991799.0, "step": 10453 }, { "epoch": 1.3298562523851927, "ewc_loss": 0.05625462532043457, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002500462578609586, "grad_norm": 6.730983734130859, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8664047122001648, "num_tokens": 399023211.0, "step": 10454 }, { "epoch": 1.3299834626637832, "ewc_loss": 0.05605645477771759, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002480645489413291, "grad_norm": 6.58274507522583, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8598241209983826, "num_tokens": 399061431.0, "step": 10455 }, { "epoch": 1.3301106729423737, "ewc_loss": 0.0562548004090786, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000250048004090786, "grad_norm": 6.707107067108154, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8649089932441711, "num_tokens": 399097384.0, "step": 10456 }, { "epoch": 1.3302378832209643, "ewc_loss": 0.05620594322681427, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024955940898507833, "grad_norm": 6.664744853973389, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8659166097640991, "num_tokens": 399133935.0, "step": 10457 }, { "epoch": 1.3303650934995548, "ewc_loss": 0.056191254407167435, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024941255105659366, "grad_norm": 6.680268287658691, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8683786392211914, "num_tokens": 399165709.0, "step": 10458 }, { "epoch": 1.3304923037781453, "ewc_loss": 0.05619543045759201, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002494542859494686, "grad_norm": 6.614526748657227, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8603912591934204, "num_tokens": 399203548.0, "step": 10459 }, { "epoch": 1.3306195140567358, "ewc_loss": 0.05622764304280281, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002497764362487942, "grad_norm": 6.620095729827881, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8606641888618469, "num_tokens": 399244341.0, "step": 10460 }, { "epoch": 1.3307467243353264, "ewc_loss": 0.05624502897262573, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024995027342811227, "grad_norm": 6.587667942047119, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.868228018283844, "num_tokens": 399282284.0, "step": 10461 }, { "epoch": 1.330873934613917, "ewc_loss": 0.05625314265489578, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025003141490742564, "grad_norm": 6.571630954742432, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8769772052764893, "num_tokens": 399320523.0, "step": 10462 }, { "epoch": 1.3310011448925074, "ewc_loss": 0.056240566074848175, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024990568635985255, "grad_norm": 6.653005599975586, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8636409044265747, "num_tokens": 399357489.0, "step": 10463 }, { "epoch": 1.331128355171098, "ewc_loss": 0.05630441755056381, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025054419529624283, "grad_norm": 6.660775184631348, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8509974479675293, "num_tokens": 399393561.0, "step": 10464 }, { "epoch": 1.3312555654496883, "ewc_loss": 0.05625256150960922, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025002562324516475, "grad_norm": 6.669917106628418, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8679708242416382, "num_tokens": 399424514.0, "step": 10465 }, { "epoch": 1.3313827757282788, "ewc_loss": 0.05627595633268356, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000250259559834376, "grad_norm": 6.668034553527832, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8685625791549683, "num_tokens": 399461372.0, "step": 10466 }, { "epoch": 1.3315099860068693, "ewc_loss": 0.056188661605119705, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002493866195436567, "grad_norm": 6.592861175537109, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8783150911331177, "num_tokens": 399502503.0, "step": 10467 }, { "epoch": 1.3316371962854598, "ewc_loss": 0.05618952214717865, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002493952342774719, "grad_norm": 6.7259697914123535, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8568980097770691, "num_tokens": 399537836.0, "step": 10468 }, { "epoch": 1.3317644065640504, "ewc_loss": 0.05619511753320694, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024945117183960974, "grad_norm": 6.65151309967041, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8586474061012268, "num_tokens": 399575189.0, "step": 10469 }, { "epoch": 1.331891616842641, "ewc_loss": 0.05616436153650284, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024914363166317344, "grad_norm": 6.633146286010742, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.844772219657898, "num_tokens": 399614406.0, "step": 10470 }, { "epoch": 1.3320188271212314, "ewc_loss": 0.05614398792386055, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024893987574614584, "grad_norm": 6.618039131164551, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8689761161804199, "num_tokens": 399652847.0, "step": 10471 }, { "epoch": 1.332146037399822, "ewc_loss": 0.05619243532419205, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024942433810792863, "grad_norm": 6.615236282348633, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8528828620910645, "num_tokens": 399691628.0, "step": 10472 }, { "epoch": 1.3322732476784125, "ewc_loss": 0.056462883949279785, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002496874367352575, "grad_norm": 6.644928932189941, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8749293088912964, "num_tokens": 399732875.0, "step": 10473 }, { "epoch": 1.3324004579570028, "ewc_loss": 0.05614149942994118, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024891499197110534, "grad_norm": 6.563982009887695, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8742592930793762, "num_tokens": 399769537.0, "step": 10474 }, { "epoch": 1.3325276682355933, "ewc_loss": 0.05623626336455345, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024986264179460704, "grad_norm": 6.632381916046143, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8474764823913574, "num_tokens": 399807757.0, "step": 10475 }, { "epoch": 1.3326548785141838, "ewc_loss": 0.056179240345954895, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002492923813406378, "grad_norm": 6.7670578956604, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8705953359603882, "num_tokens": 399837351.0, "step": 10476 }, { "epoch": 1.3327820887927744, "ewc_loss": 0.056117527186870575, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002486752637196332, "grad_norm": 6.630320072174072, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8814278841018677, "num_tokens": 399873678.0, "step": 10477 }, { "epoch": 1.332909299071365, "ewc_loss": 0.05614857375621796, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002489857142791152, "grad_norm": 6.594566345214844, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8648168444633484, "num_tokens": 399916206.0, "step": 10478 }, { "epoch": 1.3330365093499554, "ewc_loss": 0.0561705119907856, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002492051280569285, "grad_norm": 6.58903694152832, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8755180835723877, "num_tokens": 399954144.0, "step": 10479 }, { "epoch": 1.333163719628546, "ewc_loss": 0.05621683597564697, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024966837372630835, "grad_norm": 6.595288276672363, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8724876642227173, "num_tokens": 399995865.0, "step": 10480 }, { "epoch": 1.3332909299071365, "ewc_loss": 0.05617913603782654, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024929133360274136, "grad_norm": 6.595038890838623, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8642435073852539, "num_tokens": 400033234.0, "step": 10481 }, { "epoch": 1.333418140185727, "ewc_loss": 0.05621032416820526, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024960326845757663, "grad_norm": 6.609445571899414, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8712679147720337, "num_tokens": 400066842.0, "step": 10482 }, { "epoch": 1.3335453504643175, "ewc_loss": 0.0561852753162384, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002493527717888355, "grad_norm": 6.603418350219727, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8738964200019836, "num_tokens": 400099380.0, "step": 10483 }, { "epoch": 1.333672560742908, "ewc_loss": 0.05627083033323288, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025020830798894167, "grad_norm": 6.612818241119385, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8752412796020508, "num_tokens": 400137419.0, "step": 10484 }, { "epoch": 1.3337997710214986, "ewc_loss": 0.05627290531992912, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002502290590200573, "grad_norm": 6.646025657653809, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8647782802581787, "num_tokens": 400171086.0, "step": 10485 }, { "epoch": 1.3339269813000891, "ewc_loss": 0.056239448487758636, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002498944813851267, "grad_norm": 6.6327128410339355, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8628659248352051, "num_tokens": 400206763.0, "step": 10486 }, { "epoch": 1.3340541915786797, "ewc_loss": 0.056278422474861145, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025028421077877283, "grad_norm": 6.649412155151367, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8488606810569763, "num_tokens": 400242708.0, "step": 10487 }, { "epoch": 1.3341814018572702, "ewc_loss": 0.056194622069597244, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002494462241884321, "grad_norm": 6.6177592277526855, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8430349826812744, "num_tokens": 400277417.0, "step": 10488 }, { "epoch": 1.3343086121358605, "ewc_loss": 0.056233108043670654, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002498310641385615, "grad_norm": 6.583675384521484, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8621487021446228, "num_tokens": 400315811.0, "step": 10489 }, { "epoch": 1.334435822414451, "ewc_loss": 0.0562305748462677, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024980574380606413, "grad_norm": 6.5963454246521, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8447037935256958, "num_tokens": 400356321.0, "step": 10490 }, { "epoch": 1.3345630326930416, "ewc_loss": 0.056216537952423096, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024966540513560176, "grad_norm": 6.601673603057861, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8631488680839539, "num_tokens": 400392382.0, "step": 10491 }, { "epoch": 1.334690242971632, "ewc_loss": 0.056279201060533524, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025029201060533524, "grad_norm": 6.713613510131836, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.870983362197876, "num_tokens": 400434947.0, "step": 10492 }, { "epoch": 1.3348174532502226, "ewc_loss": 0.05620157718658447, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024951575323939323, "grad_norm": 6.556121826171875, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.8448175191879272, "num_tokens": 400472742.0, "step": 10493 }, { "epoch": 1.3349446635288131, "ewc_loss": 0.05634905397891998, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002509905316401273, "grad_norm": 6.632853031158447, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8767768740653992, "num_tokens": 400510029.0, "step": 10494 }, { "epoch": 1.3350718738074037, "ewc_loss": 0.05622190982103348, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024971910170279443, "grad_norm": 6.553890228271484, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8599892854690552, "num_tokens": 400547383.0, "step": 10495 }, { "epoch": 1.3351990840859942, "ewc_loss": 0.05636835843324661, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002511835773475468, "grad_norm": 6.597999095916748, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8748999834060669, "num_tokens": 400585242.0, "step": 10496 }, { "epoch": 1.3353262943645847, "ewc_loss": 0.056309252977371216, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000250592507654801, "grad_norm": 6.57243537902832, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8754467964172363, "num_tokens": 400623117.0, "step": 10497 }, { "epoch": 1.3354535046431752, "ewc_loss": 0.05632001906633377, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002507001918274909, "grad_norm": 6.560509204864502, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8528496026992798, "num_tokens": 400660170.0, "step": 10498 }, { "epoch": 1.3355807149217656, "ewc_loss": 0.05635043606162071, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025100435595959425, "grad_norm": 6.599695205688477, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8706634640693665, "num_tokens": 400700467.0, "step": 10499 }, { "epoch": 1.335707925200356, "ewc_loss": 0.056340865790843964, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002509086625650525, "grad_norm": 6.55922269821167, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8569564819335938, "num_tokens": 400740269.0, "step": 10500 }, { "epoch": 1.3358351354789466, "ewc_loss": 0.05643117055296898, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025181169621646404, "grad_norm": 6.653203010559082, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8733919858932495, "num_tokens": 400776465.0, "step": 10501 }, { "epoch": 1.3359623457575371, "ewc_loss": 0.056345924735069275, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025095927412621677, "grad_norm": 6.605978965759277, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8591299057006836, "num_tokens": 400811210.0, "step": 10502 }, { "epoch": 1.3360895560361277, "ewc_loss": 0.05642912536859512, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025179123622365296, "grad_norm": 6.631230354309082, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8562692999839783, "num_tokens": 400848820.0, "step": 10503 }, { "epoch": 1.3362167663147182, "ewc_loss": 0.056306127458810806, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025056127924472094, "grad_norm": 6.530487537384033, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.865565299987793, "num_tokens": 400888774.0, "step": 10504 }, { "epoch": 1.3363439765933087, "ewc_loss": 0.05647262558341026, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025222625117748976, "grad_norm": 6.5952839851379395, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8650438785552979, "num_tokens": 400935067.0, "step": 10505 }, { "epoch": 1.3364711868718993, "ewc_loss": 0.056321628391742706, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025071625714190304, "grad_norm": 6.588726997375488, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.873664915561676, "num_tokens": 400974122.0, "step": 10506 }, { "epoch": 1.3365983971504898, "ewc_loss": 0.05644645169377327, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025196452043019235, "grad_norm": 6.686740398406982, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8731247782707214, "num_tokens": 401008911.0, "step": 10507 }, { "epoch": 1.3367256074290803, "ewc_loss": 0.05634945631027222, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025099454796873033, "grad_norm": 6.592536449432373, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8697280883789062, "num_tokens": 401039453.0, "step": 10508 }, { "epoch": 1.3368528177076708, "ewc_loss": 0.056388020515441895, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025138023192994297, "grad_norm": 6.644444942474365, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8742178678512573, "num_tokens": 401075854.0, "step": 10509 }, { "epoch": 1.3369800279862614, "ewc_loss": 0.056344784796237946, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025094786542467773, "grad_norm": 6.5889363288879395, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8629983067512512, "num_tokens": 401114922.0, "step": 10510 }, { "epoch": 1.337107238264852, "ewc_loss": 0.056384116411209106, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025134114548563957, "grad_norm": 6.617712497711182, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8720732927322388, "num_tokens": 401155833.0, "step": 10511 }, { "epoch": 1.3372344485434424, "ewc_loss": 0.056356750428676605, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025106751127168536, "grad_norm": 6.609225273132324, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8699035048484802, "num_tokens": 401196971.0, "step": 10512 }, { "epoch": 1.337361658822033, "ewc_loss": 0.05634065717458725, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002509065961930901, "grad_norm": 6.643145561218262, "learning_rate": 1e-06, "loss": 0.5581, "mean_token_accuracy": 0.8319072127342224, "num_tokens": 401237312.0, "step": 10513 }, { "epoch": 1.3374888691006233, "ewc_loss": 0.05631076544523239, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002506076416466385, "grad_norm": 6.663180828094482, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8609909415245056, "num_tokens": 401274630.0, "step": 10514 }, { "epoch": 1.3376160793792138, "ewc_loss": 0.05632089450955391, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002507089520804584, "grad_norm": 6.660784721374512, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8682634830474854, "num_tokens": 401311780.0, "step": 10515 }, { "epoch": 1.3377432896578043, "ewc_loss": 0.056314438581466675, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025064439978450537, "grad_norm": 6.637881278991699, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8711303472518921, "num_tokens": 401352614.0, "step": 10516 }, { "epoch": 1.3378704999363948, "ewc_loss": 0.05626668781042099, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025016689323820174, "grad_norm": 6.623049259185791, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8672917485237122, "num_tokens": 401394495.0, "step": 10517 }, { "epoch": 1.3379977102149854, "ewc_loss": 0.056351833045482635, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025101835490204394, "grad_norm": 6.695661544799805, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.850037157535553, "num_tokens": 401427758.0, "step": 10518 }, { "epoch": 1.338124920493576, "ewc_loss": 0.05628693476319313, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002503693394828588, "grad_norm": 6.622737407684326, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8481676578521729, "num_tokens": 401473868.0, "step": 10519 }, { "epoch": 1.3382521307721664, "ewc_loss": 0.05636327341198921, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002511327329557389, "grad_norm": 6.6604084968566895, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8471370935440063, "num_tokens": 401510459.0, "step": 10520 }, { "epoch": 1.338379341050757, "ewc_loss": 0.05631731450557709, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025067312526516616, "grad_norm": 6.694839954376221, "learning_rate": 1e-06, "loss": 0.5536, "mean_token_accuracy": 0.8346339464187622, "num_tokens": 401545283.0, "step": 10521 }, { "epoch": 1.3385065513293475, "ewc_loss": 0.056352898478507996, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002510290069039911, "grad_norm": 6.694210052490234, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8652409315109253, "num_tokens": 401577166.0, "step": 10522 }, { "epoch": 1.3386337616079378, "ewc_loss": 0.056320078670978546, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025070077390410006, "grad_norm": 6.55562162399292, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8761580586433411, "num_tokens": 401619075.0, "step": 10523 }, { "epoch": 1.3387609718865283, "ewc_loss": 0.05638530105352402, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025135299074463546, "grad_norm": 6.645717620849609, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8578248023986816, "num_tokens": 401661286.0, "step": 10524 }, { "epoch": 1.3388881821651188, "ewc_loss": 0.0562736913561821, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002502368879504502, "grad_norm": 6.633636474609375, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8551957607269287, "num_tokens": 401699672.0, "step": 10525 }, { "epoch": 1.3390153924437094, "ewc_loss": 0.056568991392850876, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002507485041860491, "grad_norm": 6.686036586761475, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8572001457214355, "num_tokens": 401732676.0, "step": 10526 }, { "epoch": 1.3391426027223, "ewc_loss": 0.056383538991212845, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025133538292720914, "grad_norm": 6.605112552642822, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8613229393959045, "num_tokens": 401771593.0, "step": 10527 }, { "epoch": 1.3392698130008904, "ewc_loss": 0.056310199201107025, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025060196639969945, "grad_norm": 6.665372848510742, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.849288284778595, "num_tokens": 401809605.0, "step": 10528 }, { "epoch": 1.339397023279481, "ewc_loss": 0.05628107860684395, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025031078257597983, "grad_norm": 6.592163562774658, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8550302982330322, "num_tokens": 401845349.0, "step": 10529 }, { "epoch": 1.3395242335580715, "ewc_loss": 0.05634815990924835, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002509815967641771, "grad_norm": 6.59331750869751, "learning_rate": 1e-06, "loss": 0.3755, "mean_token_accuracy": 0.8862912058830261, "num_tokens": 401883186.0, "step": 10530 }, { "epoch": 1.339651443836662, "ewc_loss": 0.05635042488574982, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002510042686481029, "grad_norm": 6.698845386505127, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8484249114990234, "num_tokens": 401921149.0, "step": 10531 }, { "epoch": 1.3397786541152525, "ewc_loss": 0.05626687780022621, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002501687849871814, "grad_norm": 6.524162769317627, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8570237159729004, "num_tokens": 401961813.0, "step": 10532 }, { "epoch": 1.339905864393843, "ewc_loss": 0.0564802885055542, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025230288156308234, "grad_norm": 6.695069313049316, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8465797901153564, "num_tokens": 402001212.0, "step": 10533 }, { "epoch": 1.3400330746724336, "ewc_loss": 0.05649111419916153, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002499697438906878, "grad_norm": 6.5450592041015625, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8572298288345337, "num_tokens": 402044648.0, "step": 10534 }, { "epoch": 1.3401602849510241, "ewc_loss": 0.05650701746344566, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000252570171141997, "grad_norm": 6.658673286437988, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.865772008895874, "num_tokens": 402087391.0, "step": 10535 }, { "epoch": 1.3402874952296147, "ewc_loss": 0.056532055139541626, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002503791474737227, "grad_norm": 6.65734338760376, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.883390486240387, "num_tokens": 402119598.0, "step": 10536 }, { "epoch": 1.3404147055082052, "ewc_loss": 0.05636570602655411, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025115706375800073, "grad_norm": 6.651471138000488, "learning_rate": 1e-06, "loss": 0.5431, "mean_token_accuracy": 0.8387051224708557, "num_tokens": 402153628.0, "step": 10537 }, { "epoch": 1.3405419157867955, "ewc_loss": 0.05633033439517021, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002508033358026296, "grad_norm": 6.6044816970825195, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8794785737991333, "num_tokens": 402190876.0, "step": 10538 }, { "epoch": 1.340669126065386, "ewc_loss": 0.056358225643634796, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025108223780989647, "grad_norm": 6.603602886199951, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8721874952316284, "num_tokens": 402227168.0, "step": 10539 }, { "epoch": 1.3407963363439765, "ewc_loss": 0.056347496807575226, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025097496109083295, "grad_norm": 6.689455509185791, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8726403713226318, "num_tokens": 402262731.0, "step": 10540 }, { "epoch": 1.340923546622567, "ewc_loss": 0.056177929043769836, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002492792846169323, "grad_norm": 6.581379413604736, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8654706478118896, "num_tokens": 402299992.0, "step": 10541 }, { "epoch": 1.3410507569011576, "ewc_loss": 0.056302718818187714, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002505271986592561, "grad_norm": 6.6250433921813965, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8561134338378906, "num_tokens": 402339129.0, "step": 10542 }, { "epoch": 1.3411779671797481, "ewc_loss": 0.05622001737356186, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002497001551091671, "grad_norm": 6.593259811401367, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8464011549949646, "num_tokens": 402375380.0, "step": 10543 }, { "epoch": 1.3413051774583387, "ewc_loss": 0.056307241320610046, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025057242601178586, "grad_norm": 6.745287895202637, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8522052764892578, "num_tokens": 402412159.0, "step": 10544 }, { "epoch": 1.3414323877369292, "ewc_loss": 0.05622756853699684, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002497756795492023, "grad_norm": 6.606961250305176, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8598127365112305, "num_tokens": 402446676.0, "step": 10545 }, { "epoch": 1.3415595980155197, "ewc_loss": 0.05635366588830948, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025103663210757077, "grad_norm": 6.718263626098633, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8651509284973145, "num_tokens": 402490741.0, "step": 10546 }, { "epoch": 1.3416868082941102, "ewc_loss": 0.056442346423864365, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00024948205100372434, "grad_norm": 6.658936023712158, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8730058670043945, "num_tokens": 402527398.0, "step": 10547 }, { "epoch": 1.3418140185727006, "ewc_loss": 0.056507233530282974, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002501309209037572, "grad_norm": 6.7322306632995605, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8626028299331665, "num_tokens": 402565914.0, "step": 10548 }, { "epoch": 1.341941228851291, "ewc_loss": 0.05641369894146919, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002491955820005387, "grad_norm": 6.625495910644531, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8670362234115601, "num_tokens": 402600671.0, "step": 10549 }, { "epoch": 1.3420684391298816, "ewc_loss": 0.056453362107276917, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00024959223810583353, "grad_norm": 6.726413726806641, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8755213022232056, "num_tokens": 402637381.0, "step": 10550 }, { "epoch": 1.3421956494084721, "ewc_loss": 0.05629798024892807, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002480383845977485, "grad_norm": 6.5321855545043945, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8682343363761902, "num_tokens": 402671190.0, "step": 10551 }, { "epoch": 1.3423228596870627, "ewc_loss": 0.0565352626144886, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000250411219894886, "grad_norm": 6.892780303955078, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8638008832931519, "num_tokens": 402721518.0, "step": 10552 }, { "epoch": 1.3424500699656532, "ewc_loss": 0.05633227527141571, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002483813732396811, "grad_norm": 6.55886173248291, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8632240295410156, "num_tokens": 402758949.0, "step": 10553 }, { "epoch": 1.3425772802442437, "ewc_loss": 0.05655476450920105, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025060621555894613, "grad_norm": 7.0545196533203125, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8576610088348389, "num_tokens": 402794342.0, "step": 10554 }, { "epoch": 1.3427044905228342, "ewc_loss": 0.05627098307013512, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00024776841746643186, "grad_norm": 6.4432291984558105, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.875713586807251, "num_tokens": 402831031.0, "step": 10555 }, { "epoch": 1.3428317008014248, "ewc_loss": 0.05678047239780426, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002528633049223572, "grad_norm": 6.7384796142578125, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8744121789932251, "num_tokens": 402871486.0, "step": 10556 }, { "epoch": 1.3429589110800153, "ewc_loss": 0.05640221759676933, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002490807673893869, "grad_norm": 6.642594337463379, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8764125108718872, "num_tokens": 402916024.0, "step": 10557 }, { "epoch": 1.3430861213586058, "ewc_loss": 0.05660489574074745, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025110755814239383, "grad_norm": 6.758157730102539, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8461115956306458, "num_tokens": 402952116.0, "step": 10558 }, { "epoch": 1.3432133316371964, "ewc_loss": 0.056245822459459305, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024995821877382696, "grad_norm": 6.6774468421936035, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8624351024627686, "num_tokens": 402998064.0, "step": 10559 }, { "epoch": 1.343340541915787, "ewc_loss": 0.0562216192483902, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00024971619131974876, "grad_norm": 6.608438014984131, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8681509494781494, "num_tokens": 403041763.0, "step": 10560 }, { "epoch": 1.3434677521943774, "ewc_loss": 0.05633831024169922, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025088308029808104, "grad_norm": 6.667852401733398, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8750042915344238, "num_tokens": 403080956.0, "step": 10561 }, { "epoch": 1.343594962472968, "ewc_loss": 0.05618700385093689, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002493700594641268, "grad_norm": 6.666967391967773, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8722922801971436, "num_tokens": 403117200.0, "step": 10562 }, { "epoch": 1.3437221727515583, "ewc_loss": 0.05629962682723999, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002504962612874806, "grad_norm": 6.695687294006348, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8644031286239624, "num_tokens": 403154112.0, "step": 10563 }, { "epoch": 1.3438493830301488, "ewc_loss": 0.05615504831075668, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002490504994057119, "grad_norm": 6.591507911682129, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8620399236679077, "num_tokens": 403188265.0, "step": 10564 }, { "epoch": 1.3439765933087393, "ewc_loss": 0.05637757480144501, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002512757491786033, "grad_norm": 6.643260955810547, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.841189444065094, "num_tokens": 403231236.0, "step": 10565 }, { "epoch": 1.3441038035873298, "ewc_loss": 0.05634734034538269, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025097341858781874, "grad_norm": 6.680890083312988, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8655086755752563, "num_tokens": 403268642.0, "step": 10566 }, { "epoch": 1.3442310138659204, "ewc_loss": 0.05622916296124458, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002497916284482926, "grad_norm": 6.608443737030029, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8714447617530823, "num_tokens": 403307679.0, "step": 10567 }, { "epoch": 1.344358224144511, "ewc_loss": 0.05639054626226425, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025140546495094895, "grad_norm": 6.628603458404541, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8694579601287842, "num_tokens": 403346395.0, "step": 10568 }, { "epoch": 1.3444854344231014, "ewc_loss": 0.05628534406423569, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002503534487914294, "grad_norm": 6.626132488250732, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8719462752342224, "num_tokens": 403383001.0, "step": 10569 }, { "epoch": 1.344612644701692, "ewc_loss": 0.0563504621386528, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025100461789406836, "grad_norm": 6.627984046936035, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8555401563644409, "num_tokens": 403426090.0, "step": 10570 }, { "epoch": 1.3447398549802825, "ewc_loss": 0.0563320592045784, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025082056527026, "grad_norm": 6.600066184997559, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.857234001159668, "num_tokens": 403470231.0, "step": 10571 }, { "epoch": 1.3448670652588728, "ewc_loss": 0.05639813840389252, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025148139684461057, "grad_norm": 6.665057182312012, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8693766593933105, "num_tokens": 403508677.0, "step": 10572 }, { "epoch": 1.3449942755374633, "ewc_loss": 0.056374404579401016, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002512440551072359, "grad_norm": 6.647883415222168, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8442527055740356, "num_tokens": 403545175.0, "step": 10573 }, { "epoch": 1.3451214858160538, "ewc_loss": 0.05641043931245804, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002516044187359512, "grad_norm": 6.62776517868042, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.878047525882721, "num_tokens": 403583321.0, "step": 10574 }, { "epoch": 1.3452486960946444, "ewc_loss": 0.056506261229515076, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002525626332499087, "grad_norm": 6.676121711730957, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8571611046791077, "num_tokens": 403618547.0, "step": 10575 }, { "epoch": 1.345375906373235, "ewc_loss": 0.05641235411167145, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025162353995256126, "grad_norm": 6.62834358215332, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8666706085205078, "num_tokens": 403657552.0, "step": 10576 }, { "epoch": 1.3455031166518254, "ewc_loss": 0.056510500609874725, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025260497932322323, "grad_norm": 6.74829626083374, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8667117357254028, "num_tokens": 403697021.0, "step": 10577 }, { "epoch": 1.345630326930416, "ewc_loss": 0.05629077926278114, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002504077856428921, "grad_norm": 6.581644535064697, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.853245735168457, "num_tokens": 403736480.0, "step": 10578 }, { "epoch": 1.3457575372090065, "ewc_loss": 0.05640404671430588, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002515404485166073, "grad_norm": 6.694506645202637, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8716694116592407, "num_tokens": 403772056.0, "step": 10579 }, { "epoch": 1.345884747487597, "ewc_loss": 0.056266650557518005, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002501665148884058, "grad_norm": 6.5543413162231445, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8587152361869812, "num_tokens": 403813646.0, "step": 10580 }, { "epoch": 1.3460119577661875, "ewc_loss": 0.05650991201400757, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025259912945330143, "grad_norm": 6.627139568328857, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8667188882827759, "num_tokens": 403853904.0, "step": 10581 }, { "epoch": 1.346139168044778, "ewc_loss": 0.056356243789196014, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025106241810135543, "grad_norm": 6.602687835693359, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8473563194274902, "num_tokens": 403898528.0, "step": 10582 }, { "epoch": 1.3462663783233686, "ewc_loss": 0.05647614970803261, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025226149591617286, "grad_norm": 6.7301177978515625, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8491706848144531, "num_tokens": 403929518.0, "step": 10583 }, { "epoch": 1.3463935886019591, "ewc_loss": 0.056376613676548004, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025126614491455257, "grad_norm": 6.545681953430176, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.865276575088501, "num_tokens": 403970323.0, "step": 10584 }, { "epoch": 1.3465207988805497, "ewc_loss": 0.05652417987585068, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002527418255340308, "grad_norm": 6.651082515716553, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8676271438598633, "num_tokens": 404012499.0, "step": 10585 }, { "epoch": 1.3466480091591402, "ewc_loss": 0.056402064859867096, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002515206579118967, "grad_norm": 6.611199378967285, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8592976331710815, "num_tokens": 404055370.0, "step": 10586 }, { "epoch": 1.3467752194377305, "ewc_loss": 0.05645018070936203, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025200180243700743, "grad_norm": 6.603579044342041, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8655971884727478, "num_tokens": 404092498.0, "step": 10587 }, { "epoch": 1.346902429716321, "ewc_loss": 0.056548260152339935, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025298260152339935, "grad_norm": 6.657433032989502, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8694186210632324, "num_tokens": 404132466.0, "step": 10588 }, { "epoch": 1.3470296399949115, "ewc_loss": 0.05648021399974823, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025230212486349046, "grad_norm": 6.5718817710876465, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8639568090438843, "num_tokens": 404171913.0, "step": 10589 }, { "epoch": 1.347156850273502, "ewc_loss": 0.056569017469882965, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025319019914604723, "grad_norm": 6.676095962524414, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.877517580986023, "num_tokens": 404205817.0, "step": 10590 }, { "epoch": 1.3472840605520926, "ewc_loss": 0.05655248463153839, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025302486028522253, "grad_norm": 6.60433292388916, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8673306703567505, "num_tokens": 404243922.0, "step": 10591 }, { "epoch": 1.3474112708306831, "ewc_loss": 0.05666661635041237, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025416616699658334, "grad_norm": 6.666318416595459, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.866126298904419, "num_tokens": 404283392.0, "step": 10592 }, { "epoch": 1.3475384811092737, "ewc_loss": 0.05653082951903343, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000252808298682794, "grad_norm": 6.619906425476074, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.8383364081382751, "num_tokens": 404323044.0, "step": 10593 }, { "epoch": 1.3476656913878642, "ewc_loss": 0.05660521984100342, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025355222169309855, "grad_norm": 6.644974231719971, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8686263561248779, "num_tokens": 404359761.0, "step": 10594 }, { "epoch": 1.3477929016664547, "ewc_loss": 0.05658110976219177, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002533110964577645, "grad_norm": 6.685839653015137, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8502086400985718, "num_tokens": 404395354.0, "step": 10595 }, { "epoch": 1.3479201119450452, "ewc_loss": 0.056521374732255936, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025271374033764005, "grad_norm": 6.666471004486084, "learning_rate": 1e-06, "loss": 0.5273, "mean_token_accuracy": 0.847249448299408, "num_tokens": 404432052.0, "step": 10596 }, { "epoch": 1.3480473222236355, "ewc_loss": 0.05655759945511818, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000253075995715335, "grad_norm": 6.642724990844727, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.856204628944397, "num_tokens": 404472689.0, "step": 10597 }, { "epoch": 1.348174532502226, "ewc_loss": 0.056474894285202026, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025224892306141555, "grad_norm": 6.581236362457275, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8484300374984741, "num_tokens": 404515916.0, "step": 10598 }, { "epoch": 1.3483017427808166, "ewc_loss": 0.05654136836528778, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002529136836528778, "grad_norm": 6.666077613830566, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8621252775192261, "num_tokens": 404557399.0, "step": 10599 }, { "epoch": 1.3484289530594071, "ewc_loss": 0.05651620775461197, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002526620519347489, "grad_norm": 6.610340595245361, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8604850769042969, "num_tokens": 404592573.0, "step": 10600 }, { "epoch": 1.3485561633379977, "ewc_loss": 0.05657655745744705, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002532655489630997, "grad_norm": 6.582310199737549, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8606223464012146, "num_tokens": 404635266.0, "step": 10601 }, { "epoch": 1.3486833736165882, "ewc_loss": 0.056538112461566925, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002528811164665967, "grad_norm": 6.601280212402344, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8719741702079773, "num_tokens": 404673004.0, "step": 10602 }, { "epoch": 1.3488105838951787, "ewc_loss": 0.05662766844034195, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025377669953741133, "grad_norm": 6.747038841247559, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8476406335830688, "num_tokens": 404705153.0, "step": 10603 }, { "epoch": 1.3489377941737692, "ewc_loss": 0.05649673938751221, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002524673764128238, "grad_norm": 6.6368608474731445, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8538639545440674, "num_tokens": 404737068.0, "step": 10604 }, { "epoch": 1.3490650044523598, "ewc_loss": 0.05656326562166214, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002531326317694038, "grad_norm": 6.680700302124023, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8435646295547485, "num_tokens": 404773680.0, "step": 10605 }, { "epoch": 1.3491922147309503, "ewc_loss": 0.05670785903930664, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002521371643524617, "grad_norm": 6.865768909454346, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8688160181045532, "num_tokens": 404809937.0, "step": 10606 }, { "epoch": 1.3493194250095408, "ewc_loss": 0.05641278997063637, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025162790552712977, "grad_norm": 6.5597357749938965, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8551057577133179, "num_tokens": 404847146.0, "step": 10607 }, { "epoch": 1.3494466352881314, "ewc_loss": 0.05653679370880127, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002528679615352303, "grad_norm": 6.681027889251709, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.864240825176239, "num_tokens": 404882952.0, "step": 10608 }, { "epoch": 1.3495738455667219, "ewc_loss": 0.05640961974859238, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025159618235193193, "grad_norm": 6.564063549041748, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8733968734741211, "num_tokens": 404925347.0, "step": 10609 }, { "epoch": 1.3497010558453124, "ewc_loss": 0.05661862716078758, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025368627393618226, "grad_norm": 6.665761947631836, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.848553478717804, "num_tokens": 404963411.0, "step": 10610 }, { "epoch": 1.349828266123903, "ewc_loss": 0.05648369342088699, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002523369330447167, "grad_norm": 6.616670608520508, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8622781038284302, "num_tokens": 404997522.0, "step": 10611 }, { "epoch": 1.3499554764024932, "ewc_loss": 0.056524746119976044, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002527474716771394, "grad_norm": 6.6510009765625, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.860685408115387, "num_tokens": 405034049.0, "step": 10612 }, { "epoch": 1.3500826866810838, "ewc_loss": 0.05647765100002289, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025227651349268854, "grad_norm": 6.582167148590088, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8600975275039673, "num_tokens": 405074985.0, "step": 10613 }, { "epoch": 1.3502098969596743, "ewc_loss": 0.05659756436944008, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002534756495151669, "grad_norm": 6.600751876831055, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8792648315429688, "num_tokens": 405116133.0, "step": 10614 }, { "epoch": 1.3503371072382648, "ewc_loss": 0.05660417675971985, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002535417734179646, "grad_norm": 6.665533065795898, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8624399900436401, "num_tokens": 405155572.0, "step": 10615 }, { "epoch": 1.3504643175168554, "ewc_loss": 0.05645647644996643, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025206475402228534, "grad_norm": 6.591540813446045, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8662490248680115, "num_tokens": 405194700.0, "step": 10616 }, { "epoch": 1.350591527795446, "ewc_loss": 0.05665501952171326, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002540502173360437, "grad_norm": 6.691977024078369, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8586724996566772, "num_tokens": 405233582.0, "step": 10617 }, { "epoch": 1.3507187380740364, "ewc_loss": 0.056589242070913315, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025095100863836706, "grad_norm": 6.963736534118652, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8745757341384888, "num_tokens": 405274987.0, "step": 10618 }, { "epoch": 1.350845948352627, "ewc_loss": 0.05635494738817215, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025104949600063264, "grad_norm": 6.588677883148193, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8595733642578125, "num_tokens": 405308151.0, "step": 10619 }, { "epoch": 1.3509731586312175, "ewc_loss": 0.056467533111572266, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002521753194741905, "grad_norm": 6.690740585327148, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.861397385597229, "num_tokens": 405343939.0, "step": 10620 }, { "epoch": 1.3511003689098078, "ewc_loss": 0.056250229477882385, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002500023110769689, "grad_norm": 6.536391735076904, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8625062704086304, "num_tokens": 405384211.0, "step": 10621 }, { "epoch": 1.3512275791883983, "ewc_loss": 0.05660710483789444, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002535710227675736, "grad_norm": 6.705710411071777, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8699260950088501, "num_tokens": 405419971.0, "step": 10622 }, { "epoch": 1.3513547894669888, "ewc_loss": 0.05632220208644867, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002507220197003335, "grad_norm": 6.613848686218262, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8568822741508484, "num_tokens": 405451322.0, "step": 10623 }, { "epoch": 1.3514819997455794, "ewc_loss": 0.056569695472717285, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025319698033854365, "grad_norm": 6.65159797668457, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8548275232315063, "num_tokens": 405496101.0, "step": 10624 }, { "epoch": 1.35160921002417, "ewc_loss": 0.05642271041870117, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002517270913813263, "grad_norm": 6.641514301300049, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.847100019454956, "num_tokens": 405530034.0, "step": 10625 }, { "epoch": 1.3517364203027604, "ewc_loss": 0.05647970363497734, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025229703169316053, "grad_norm": 6.583078861236572, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8747109770774841, "num_tokens": 405570893.0, "step": 10626 }, { "epoch": 1.351863630581351, "ewc_loss": 0.05653981864452362, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025289817131124437, "grad_norm": 6.630950927734375, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8502809405326843, "num_tokens": 405610120.0, "step": 10627 }, { "epoch": 1.3519908408599415, "ewc_loss": 0.056446537375450134, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025196539354510605, "grad_norm": 6.583735466003418, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8619521856307983, "num_tokens": 405651878.0, "step": 10628 }, { "epoch": 1.352118051138532, "ewc_loss": 0.05651712417602539, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002526712487451732, "grad_norm": 6.6272149085998535, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8676055073738098, "num_tokens": 405694578.0, "step": 10629 }, { "epoch": 1.3522452614171225, "ewc_loss": 0.05642290413379669, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002517290413379669, "grad_norm": 6.566869258880615, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8688969612121582, "num_tokens": 405734128.0, "step": 10630 }, { "epoch": 1.352372471695713, "ewc_loss": 0.05652686208486557, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025276863016188145, "grad_norm": 6.67022180557251, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8687822818756104, "num_tokens": 405776588.0, "step": 10631 }, { "epoch": 1.3524996819743036, "ewc_loss": 0.05646467208862305, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002521467104088515, "grad_norm": 6.635969638824463, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8594380617141724, "num_tokens": 405810867.0, "step": 10632 }, { "epoch": 1.3526268922528941, "ewc_loss": 0.05643389746546745, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000251838966505602, "grad_norm": 6.610679626464844, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8805488348007202, "num_tokens": 405848884.0, "step": 10633 }, { "epoch": 1.3527541025314846, "ewc_loss": 0.056516382843256, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025266382726840675, "grad_norm": 6.609829902648926, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8812940120697021, "num_tokens": 405888759.0, "step": 10634 }, { "epoch": 1.3528813128100752, "ewc_loss": 0.05646497756242752, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002521497954148799, "grad_norm": 6.597130298614502, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8761528730392456, "num_tokens": 405925686.0, "step": 10635 }, { "epoch": 1.3530085230886655, "ewc_loss": 0.056524697691202164, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025274697691202164, "grad_norm": 6.665283203125, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.877764105796814, "num_tokens": 405960836.0, "step": 10636 }, { "epoch": 1.353135733367256, "ewc_loss": 0.056464917957782745, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025214915513060987, "grad_norm": 6.612380504608154, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.87447190284729, "num_tokens": 406000314.0, "step": 10637 }, { "epoch": 1.3532629436458465, "ewc_loss": 0.056599535048007965, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002534953528083861, "grad_norm": 6.633199691772461, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8735543489456177, "num_tokens": 406039468.0, "step": 10638 }, { "epoch": 1.353390153924437, "ewc_loss": 0.05653209239244461, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025282090064138174, "grad_norm": 6.640084743499756, "learning_rate": 1e-06, "loss": 0.3853, "mean_token_accuracy": 0.8871971964836121, "num_tokens": 406076828.0, "step": 10639 }, { "epoch": 1.3535173642030276, "ewc_loss": 0.0565834566950798, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025333455414511263, "grad_norm": 6.649706840515137, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8631815910339355, "num_tokens": 406116504.0, "step": 10640 }, { "epoch": 1.3536445744816181, "ewc_loss": 0.05652138590812683, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025271388585679233, "grad_norm": 6.648443222045898, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8622583150863647, "num_tokens": 406155184.0, "step": 10641 }, { "epoch": 1.3537717847602087, "ewc_loss": 0.05657033249735832, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002532033249735832, "grad_norm": 6.684802055358887, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.844744086265564, "num_tokens": 406195411.0, "step": 10642 }, { "epoch": 1.3538989950387992, "ewc_loss": 0.05652007460594177, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002527007309254259, "grad_norm": 6.706912040710449, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8539749383926392, "num_tokens": 406230159.0, "step": 10643 }, { "epoch": 1.3540262053173897, "ewc_loss": 0.0565347746014595, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025284773437306285, "grad_norm": 6.688376426696777, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8523612022399902, "num_tokens": 406264541.0, "step": 10644 }, { "epoch": 1.3541534155959802, "ewc_loss": 0.056534286588430405, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002528428740333766, "grad_norm": 6.6398468017578125, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8703839182853699, "num_tokens": 406301869.0, "step": 10645 }, { "epoch": 1.3542806258745705, "ewc_loss": 0.056512363255023956, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025262360577471554, "grad_norm": 6.705078601837158, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8651570081710815, "num_tokens": 406338819.0, "step": 10646 }, { "epoch": 1.354407836153161, "ewc_loss": 0.05653999373316765, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025289994664490223, "grad_norm": 6.654287338256836, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8640148639678955, "num_tokens": 406375914.0, "step": 10647 }, { "epoch": 1.3545350464317516, "ewc_loss": 0.056494321674108505, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002524432202335447, "grad_norm": 6.601534366607666, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8689326047897339, "num_tokens": 406418543.0, "step": 10648 }, { "epoch": 1.3546622567103421, "ewc_loss": 0.05655069649219513, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002530069905333221, "grad_norm": 6.679713249206543, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8613177537918091, "num_tokens": 406455731.0, "step": 10649 }, { "epoch": 1.3547894669889327, "ewc_loss": 0.05651219189167023, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025262191775254905, "grad_norm": 6.621801376342773, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8636617064476013, "num_tokens": 406496266.0, "step": 10650 }, { "epoch": 1.3549166772675232, "ewc_loss": 0.056504569947719574, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025254569482058287, "grad_norm": 6.635232448577881, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.858403742313385, "num_tokens": 406541676.0, "step": 10651 }, { "epoch": 1.3550438875461137, "ewc_loss": 0.056447066366672516, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002519706904422492, "grad_norm": 6.665065765380859, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8593028783798218, "num_tokens": 406578997.0, "step": 10652 }, { "epoch": 1.3551710978247042, "ewc_loss": 0.056489843875169754, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002523984294384718, "grad_norm": 6.660205364227295, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.86845862865448, "num_tokens": 406619404.0, "step": 10653 }, { "epoch": 1.3552983081032948, "ewc_loss": 0.05650275945663452, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025252762134186924, "grad_norm": 6.647287368774414, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8823227882385254, "num_tokens": 406656092.0, "step": 10654 }, { "epoch": 1.3554255183818853, "ewc_loss": 0.05643116682767868, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002518116671126336, "grad_norm": 6.6698174476623535, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8465423583984375, "num_tokens": 406701600.0, "step": 10655 }, { "epoch": 1.3555527286604758, "ewc_loss": 0.05646680295467377, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025216801441274583, "grad_norm": 6.622874736785889, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8634253144264221, "num_tokens": 406739649.0, "step": 10656 }, { "epoch": 1.3556799389390664, "ewc_loss": 0.05645359307527542, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002520359412301332, "grad_norm": 6.627302646636963, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8835455179214478, "num_tokens": 406774962.0, "step": 10657 }, { "epoch": 1.3558071492176569, "ewc_loss": 0.05648515373468399, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000252351543167606, "grad_norm": 6.655482769012451, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.859283447265625, "num_tokens": 406818047.0, "step": 10658 }, { "epoch": 1.3559343594962474, "ewc_loss": 0.05654837563633919, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002529837656766176, "grad_norm": 6.621968746185303, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8569135069847107, "num_tokens": 406859464.0, "step": 10659 }, { "epoch": 1.356061569774838, "ewc_loss": 0.05647699162364006, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025226990692317486, "grad_norm": 6.667266845703125, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8625480532646179, "num_tokens": 406896444.0, "step": 10660 }, { "epoch": 1.3561887800534282, "ewc_loss": 0.056497130542993546, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025247130542993546, "grad_norm": 6.648112773895264, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8576488494873047, "num_tokens": 406934623.0, "step": 10661 }, { "epoch": 1.3563159903320188, "ewc_loss": 0.05650453269481659, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025254531647078693, "grad_norm": 6.598668098449707, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8563635349273682, "num_tokens": 406974290.0, "step": 10662 }, { "epoch": 1.3564432006106093, "ewc_loss": 0.05649556219577789, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002524556184653193, "grad_norm": 6.627260684967041, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.881821870803833, "num_tokens": 407011279.0, "step": 10663 }, { "epoch": 1.3565704108891998, "ewc_loss": 0.0565398670732975, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025289866607636213, "grad_norm": 6.657557487487793, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.852460503578186, "num_tokens": 407054733.0, "step": 10664 }, { "epoch": 1.3566976211677904, "ewc_loss": 0.056573137640953064, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025323135196231306, "grad_norm": 6.644886493682861, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8643853664398193, "num_tokens": 407098410.0, "step": 10665 }, { "epoch": 1.3568248314463809, "ewc_loss": 0.05657034367322922, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002532034122850746, "grad_norm": 6.683246612548828, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8576600551605225, "num_tokens": 407134231.0, "step": 10666 }, { "epoch": 1.3569520417249714, "ewc_loss": 0.05651416629552841, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002526416501495987, "grad_norm": 6.6542229652404785, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8490774631500244, "num_tokens": 407167231.0, "step": 10667 }, { "epoch": 1.357079252003562, "ewc_loss": 0.05663567781448364, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002538567641749978, "grad_norm": 6.641578197479248, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8652234077453613, "num_tokens": 407206604.0, "step": 10668 }, { "epoch": 1.3572064622821525, "ewc_loss": 0.05658475309610367, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025334753445349634, "grad_norm": 6.608022689819336, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8660687804222107, "num_tokens": 407248247.0, "step": 10669 }, { "epoch": 1.3573336725607428, "ewc_loss": 0.05669957399368286, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002544957387726754, "grad_norm": 6.697615623474121, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8737841844558716, "num_tokens": 407282870.0, "step": 10670 }, { "epoch": 1.3574608828393333, "ewc_loss": 0.05655359849333763, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000253035977948457, "grad_norm": 6.61330509185791, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8700363636016846, "num_tokens": 407318704.0, "step": 10671 }, { "epoch": 1.3575880931179238, "ewc_loss": 0.05679071322083473, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000255407125223428, "grad_norm": 6.6710662841796875, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8659265041351318, "num_tokens": 407360003.0, "step": 10672 }, { "epoch": 1.3577153033965144, "ewc_loss": 0.05666610598564148, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025416104472242296, "grad_norm": 6.655314922332764, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8576549887657166, "num_tokens": 407399473.0, "step": 10673 }, { "epoch": 1.357842513675105, "ewc_loss": 0.05672551691532135, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002547551703173667, "grad_norm": 6.710583209991455, "learning_rate": 1e-06, "loss": 0.5523, "mean_token_accuracy": 0.833237886428833, "num_tokens": 407440576.0, "step": 10674 }, { "epoch": 1.3579697239536954, "ewc_loss": 0.056732818484306335, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002548281627241522, "grad_norm": 6.697754859924316, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8607649803161621, "num_tokens": 407481057.0, "step": 10675 }, { "epoch": 1.358096934232286, "ewc_loss": 0.056625038385391235, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000253750360570848, "grad_norm": 6.627636432647705, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8761464357376099, "num_tokens": 407517104.0, "step": 10676 }, { "epoch": 1.3582241445108765, "ewc_loss": 0.05662433058023453, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025374331744387746, "grad_norm": 6.679405212402344, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8711353540420532, "num_tokens": 407552604.0, "step": 10677 }, { "epoch": 1.358351354789467, "ewc_loss": 0.05669625848531723, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002544625895097852, "grad_norm": 6.725301265716553, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8775534629821777, "num_tokens": 407588282.0, "step": 10678 }, { "epoch": 1.3584785650680575, "ewc_loss": 0.056655801832675934, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002540580171626061, "grad_norm": 6.7472310066223145, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8591222763061523, "num_tokens": 407619837.0, "step": 10679 }, { "epoch": 1.358605775346648, "ewc_loss": 0.05655735731124878, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025307355099357665, "grad_norm": 6.7374796867370605, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8605786561965942, "num_tokens": 407656939.0, "step": 10680 }, { "epoch": 1.3587329856252386, "ewc_loss": 0.05654348433017731, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025293484213761985, "grad_norm": 6.644015789031982, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8548421859741211, "num_tokens": 407702756.0, "step": 10681 }, { "epoch": 1.3588601959038291, "ewc_loss": 0.05651581287384033, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002526581520214677, "grad_norm": 6.673159122467041, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8726626038551331, "num_tokens": 407736142.0, "step": 10682 }, { "epoch": 1.3589874061824196, "ewc_loss": 0.05650056153535843, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002525055897422135, "grad_norm": 6.657399654388428, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.860602617263794, "num_tokens": 407773184.0, "step": 10683 }, { "epoch": 1.3591146164610102, "ewc_loss": 0.056515321135520935, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025265320437029004, "grad_norm": 6.658920764923096, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8476952314376831, "num_tokens": 407806543.0, "step": 10684 }, { "epoch": 1.3592418267396005, "ewc_loss": 0.05661499500274658, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002536499232519418, "grad_norm": 6.662129878997803, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8611811399459839, "num_tokens": 407841371.0, "step": 10685 }, { "epoch": 1.359369037018191, "ewc_loss": 0.05655844137072563, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000253084406722337, "grad_norm": 6.693873405456543, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8673774003982544, "num_tokens": 407879862.0, "step": 10686 }, { "epoch": 1.3594962472967815, "ewc_loss": 0.05657828599214554, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002532828657422215, "grad_norm": 6.656799793243408, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8572198748588562, "num_tokens": 407917947.0, "step": 10687 }, { "epoch": 1.359623457575372, "ewc_loss": 0.056586362421512604, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025336359976790845, "grad_norm": 6.6815714836120605, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8702267408370972, "num_tokens": 407953987.0, "step": 10688 }, { "epoch": 1.3597506678539626, "ewc_loss": 0.05652065575122833, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025270655169151723, "grad_norm": 6.680413246154785, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8463115692138672, "num_tokens": 407993166.0, "step": 10689 }, { "epoch": 1.3598778781325531, "ewc_loss": 0.05656495690345764, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025314957019872963, "grad_norm": 6.6900835037231445, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8728793859481812, "num_tokens": 408037063.0, "step": 10690 }, { "epoch": 1.3600050884111436, "ewc_loss": 0.05653609335422516, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025286091840825975, "grad_norm": 6.612209320068359, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8659090995788574, "num_tokens": 408078526.0, "step": 10691 }, { "epoch": 1.3601322986897342, "ewc_loss": 0.056537628173828125, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025287625612691045, "grad_norm": 6.64414644241333, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8710680603981018, "num_tokens": 408116002.0, "step": 10692 }, { "epoch": 1.3602595089683247, "ewc_loss": 0.05658126622438431, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025331266806460917, "grad_norm": 6.701419830322266, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.859371542930603, "num_tokens": 408155885.0, "step": 10693 }, { "epoch": 1.3603867192469152, "ewc_loss": 0.05660538375377655, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025355382240377367, "grad_norm": 6.6844282150268555, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8684446811676025, "num_tokens": 408190085.0, "step": 10694 }, { "epoch": 1.3605139295255055, "ewc_loss": 0.056558847427368164, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002530884521547705, "grad_norm": 6.667368412017822, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8649659752845764, "num_tokens": 408227289.0, "step": 10695 }, { "epoch": 1.360641139804096, "ewc_loss": 0.05662999302148819, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002537999243941158, "grad_norm": 6.660299301147461, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8710548877716064, "num_tokens": 408271853.0, "step": 10696 }, { "epoch": 1.3607683500826866, "ewc_loss": 0.056590769439935684, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002534076920710504, "grad_norm": 6.7060394287109375, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8769620656967163, "num_tokens": 408311133.0, "step": 10697 }, { "epoch": 1.3608955603612771, "ewc_loss": 0.056514397263526917, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025264397845603526, "grad_norm": 6.690074443817139, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8645491003990173, "num_tokens": 408342775.0, "step": 10698 }, { "epoch": 1.3610227706398677, "ewc_loss": 0.05654985457658768, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025299855042248964, "grad_norm": 6.653026580810547, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8766995668411255, "num_tokens": 408379063.0, "step": 10699 }, { "epoch": 1.3611499809184582, "ewc_loss": 0.05656109005212784, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025311089120805264, "grad_norm": 6.697455406188965, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8500810861587524, "num_tokens": 408415856.0, "step": 10700 }, { "epoch": 1.3612771911970487, "ewc_loss": 0.0565117672085762, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002526176976971328, "grad_norm": 6.6456298828125, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8687604069709778, "num_tokens": 408451522.0, "step": 10701 }, { "epoch": 1.3614044014756392, "ewc_loss": 0.05657463148236275, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002532463113311678, "grad_norm": 6.631181716918945, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8687991499900818, "num_tokens": 408494605.0, "step": 10702 }, { "epoch": 1.3615316117542298, "ewc_loss": 0.05658832937479019, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002533832739572972, "grad_norm": 6.664079189300537, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8632009029388428, "num_tokens": 408536373.0, "step": 10703 }, { "epoch": 1.3616588220328203, "ewc_loss": 0.05655474215745926, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002530474157538265, "grad_norm": 7.450756072998047, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.851570725440979, "num_tokens": 408574724.0, "step": 10704 }, { "epoch": 1.3617860323114108, "ewc_loss": 0.056444935500621796, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025194932823069394, "grad_norm": 6.536823272705078, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8563207983970642, "num_tokens": 408615057.0, "step": 10705 }, { "epoch": 1.3619132425900013, "ewc_loss": 0.05671191215515137, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002546191099099815, "grad_norm": 6.735973834991455, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.851545512676239, "num_tokens": 408651324.0, "step": 10706 }, { "epoch": 1.3620404528685919, "ewc_loss": 0.05629425495862961, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002504425647202879, "grad_norm": 6.557041168212891, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8697420358657837, "num_tokens": 408690533.0, "step": 10707 }, { "epoch": 1.3621676631471824, "ewc_loss": 0.05677793174982071, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025527930120006204, "grad_norm": 6.724445819854736, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8569200038909912, "num_tokens": 408725970.0, "step": 10708 }, { "epoch": 1.362294873425773, "ewc_loss": 0.056504517793655396, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002525452000554651, "grad_norm": 6.630429267883301, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8711209297180176, "num_tokens": 408766550.0, "step": 10709 }, { "epoch": 1.3624220837043632, "ewc_loss": 0.056667935103178024, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025417935103178024, "grad_norm": 6.686718463897705, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8668574094772339, "num_tokens": 408804320.0, "step": 10710 }, { "epoch": 1.3625492939829538, "ewc_loss": 0.056630443781614304, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002538044354878366, "grad_norm": 6.652956962585449, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8672802448272705, "num_tokens": 408840902.0, "step": 10711 }, { "epoch": 1.3626765042615443, "ewc_loss": 0.05672178789973259, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025471788831055164, "grad_norm": 6.655475616455078, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8636020421981812, "num_tokens": 408879346.0, "step": 10712 }, { "epoch": 1.3628037145401348, "ewc_loss": 0.056673482060432434, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002542348229326308, "grad_norm": 6.640202045440674, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8764037489891052, "num_tokens": 408919336.0, "step": 10713 }, { "epoch": 1.3629309248187254, "ewc_loss": 0.056677404791116714, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025427405489608645, "grad_norm": 6.64768123626709, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8585700392723083, "num_tokens": 408961547.0, "step": 10714 }, { "epoch": 1.3630581350973159, "ewc_loss": 0.056702762842178345, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000254527636570856, "grad_norm": 6.6514506340026855, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8783177137374878, "num_tokens": 409003573.0, "step": 10715 }, { "epoch": 1.3631853453759064, "ewc_loss": 0.0566941574215889, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025444154744036496, "grad_norm": 7.440835475921631, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8571731448173523, "num_tokens": 409045560.0, "step": 10716 }, { "epoch": 1.363312555654497, "ewc_loss": 0.05661623179912567, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025366232148371637, "grad_norm": 6.516347408294678, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8666160106658936, "num_tokens": 409087245.0, "step": 10717 }, { "epoch": 1.3634397659330875, "ewc_loss": 0.05699635297060013, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025746351457200944, "grad_norm": 6.777498722076416, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.873299241065979, "num_tokens": 409134177.0, "step": 10718 }, { "epoch": 1.3635669762116778, "ewc_loss": 0.056461870670318604, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025211868342012167, "grad_norm": 6.598796844482422, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8501698970794678, "num_tokens": 409169495.0, "step": 10719 }, { "epoch": 1.3636941864902683, "ewc_loss": 0.0568840354681015, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002563403395470232, "grad_norm": 6.741262435913086, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8403337001800537, "num_tokens": 409207781.0, "step": 10720 }, { "epoch": 1.3638213967688588, "ewc_loss": 0.05662352219223976, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002537352265790105, "grad_norm": 6.655319690704346, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8634580373764038, "num_tokens": 409245411.0, "step": 10721 }, { "epoch": 1.3639486070474494, "ewc_loss": 0.05668492615222931, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002543492882978171, "grad_norm": 6.635969638824463, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8725005388259888, "num_tokens": 409281757.0, "step": 10722 }, { "epoch": 1.3640758173260399, "ewc_loss": 0.05674755573272705, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002549755445215851, "grad_norm": 6.660882949829102, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.851891040802002, "num_tokens": 409321317.0, "step": 10723 }, { "epoch": 1.3642030276046304, "ewc_loss": 0.05668731778860092, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002543731825426221, "grad_norm": 6.66990327835083, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8709503412246704, "num_tokens": 409358919.0, "step": 10724 }, { "epoch": 1.364330237883221, "ewc_loss": 0.05671434849500656, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002546434698160738, "grad_norm": 6.674887657165527, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8550763726234436, "num_tokens": 409394355.0, "step": 10725 }, { "epoch": 1.3644574481618115, "ewc_loss": 0.056700535118579865, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002545053430367261, "grad_norm": 6.643825054168701, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8736412525177002, "num_tokens": 409433781.0, "step": 10726 }, { "epoch": 1.364584658440402, "ewc_loss": 0.05663116276264191, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025381165323778987, "grad_norm": 6.704259872436523, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8538771867752075, "num_tokens": 409471728.0, "step": 10727 }, { "epoch": 1.3647118687189925, "ewc_loss": 0.05657597631216049, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025325975730083883, "grad_norm": 6.648648738861084, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8828519582748413, "num_tokens": 409502294.0, "step": 10728 }, { "epoch": 1.364839078997583, "ewc_loss": 0.05666351318359375, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000254135113209486, "grad_norm": 6.658968448638916, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8649957776069641, "num_tokens": 409545878.0, "step": 10729 }, { "epoch": 1.3649662892761736, "ewc_loss": 0.0566713772714138, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025421378086321056, "grad_norm": 6.646782875061035, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8656377196311951, "num_tokens": 409584753.0, "step": 10730 }, { "epoch": 1.3650934995547641, "ewc_loss": 0.05664458125829697, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025394579279236495, "grad_norm": 6.730539798736572, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8686113357543945, "num_tokens": 409613725.0, "step": 10731 }, { "epoch": 1.3652207098333546, "ewc_loss": 0.05664491653442383, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002539491397328675, "grad_norm": 6.6843461990356445, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.856387197971344, "num_tokens": 409652443.0, "step": 10732 }, { "epoch": 1.3653479201119452, "ewc_loss": 0.056608326733112335, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002535832463763654, "grad_norm": 6.636529922485352, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8697876930236816, "num_tokens": 409690683.0, "step": 10733 }, { "epoch": 1.3654751303905355, "ewc_loss": 0.056663382798433304, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002541338326409459, "grad_norm": 6.667557716369629, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8642721772193909, "num_tokens": 409727502.0, "step": 10734 }, { "epoch": 1.365602340669126, "ewc_loss": 0.05661643669009209, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025366435875184834, "grad_norm": 6.6234450340271, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8602297306060791, "num_tokens": 409764592.0, "step": 10735 }, { "epoch": 1.3657295509477165, "ewc_loss": 0.05666240304708481, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000254124024650082, "grad_norm": 6.661746978759766, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.842560887336731, "num_tokens": 409809645.0, "step": 10736 }, { "epoch": 1.365856761226307, "ewc_loss": 0.05671009421348572, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025460092001594603, "grad_norm": 6.698172569274902, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8631783127784729, "num_tokens": 409848048.0, "step": 10737 }, { "epoch": 1.3659839715048976, "ewc_loss": 0.05665808171033859, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002540808345656842, "grad_norm": 6.677152633666992, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8581869006156921, "num_tokens": 409884176.0, "step": 10738 }, { "epoch": 1.3661111817834881, "ewc_loss": 0.05668247491121292, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002543247537687421, "grad_norm": 6.678647994995117, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8660531044006348, "num_tokens": 409922630.0, "step": 10739 }, { "epoch": 1.3662383920620786, "ewc_loss": 0.05665501207113266, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002540501009207219, "grad_norm": 6.63834285736084, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.869652271270752, "num_tokens": 409961876.0, "step": 10740 }, { "epoch": 1.3663656023406692, "ewc_loss": 0.056717999279499054, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000254679995123297, "grad_norm": 6.710803031921387, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8719984292984009, "num_tokens": 409995432.0, "step": 10741 }, { "epoch": 1.3664928126192597, "ewc_loss": 0.056644685566425323, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025394686963409185, "grad_norm": 6.68078088760376, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8622841835021973, "num_tokens": 410034876.0, "step": 10742 }, { "epoch": 1.3666200228978502, "ewc_loss": 0.05665723234415054, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002540723071433604, "grad_norm": 6.642387390136719, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8673399686813354, "num_tokens": 410079190.0, "step": 10743 }, { "epoch": 1.3667472331764405, "ewc_loss": 0.05664948374032974, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000253994861850515, "grad_norm": 6.693533897399902, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8679007291793823, "num_tokens": 410112475.0, "step": 10744 }, { "epoch": 1.366874443455031, "ewc_loss": 0.05662401020526886, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002537401160225272, "grad_norm": 6.639256000518799, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8789606094360352, "num_tokens": 410154583.0, "step": 10745 }, { "epoch": 1.3670016537336216, "ewc_loss": 0.05671350657939911, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002546350588090718, "grad_norm": 6.770756721496582, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8635760545730591, "num_tokens": 410191631.0, "step": 10746 }, { "epoch": 1.3671288640122121, "ewc_loss": 0.05653739720582962, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002528739860281348, "grad_norm": 6.6240620613098145, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8419758081436157, "num_tokens": 410232435.0, "step": 10747 }, { "epoch": 1.3672560742908026, "ewc_loss": 0.056755565106868744, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025505563826300204, "grad_norm": 6.680422782897949, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8685898184776306, "num_tokens": 410272940.0, "step": 10748 }, { "epoch": 1.3673832845693932, "ewc_loss": 0.05660630762577057, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002535630774218589, "grad_norm": 6.66006326675415, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8593025207519531, "num_tokens": 410312807.0, "step": 10749 }, { "epoch": 1.3675104948479837, "ewc_loss": 0.05674484372138977, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002549484488554299, "grad_norm": 6.647538185119629, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8695122599601746, "num_tokens": 410352433.0, "step": 10750 }, { "epoch": 1.3676377051265742, "ewc_loss": 0.05676702409982681, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002551702200435102, "grad_norm": 6.670218467712402, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8639810681343079, "num_tokens": 410394843.0, "step": 10751 }, { "epoch": 1.3677649154051648, "ewc_loss": 0.05670279264450073, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025452792760916054, "grad_norm": 6.641395092010498, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8658643960952759, "num_tokens": 410439353.0, "step": 10752 }, { "epoch": 1.3678921256837553, "ewc_loss": 0.05679595470428467, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025545957032591105, "grad_norm": 6.665931701660156, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8422242403030396, "num_tokens": 410485789.0, "step": 10753 }, { "epoch": 1.3680193359623458, "ewc_loss": 0.05669210106134415, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025442102923989296, "grad_norm": 6.646693229675293, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8476545810699463, "num_tokens": 410523385.0, "step": 10754 }, { "epoch": 1.3681465462409363, "ewc_loss": 0.05673748254776001, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002548748452682048, "grad_norm": 6.765096187591553, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8579918742179871, "num_tokens": 410553340.0, "step": 10755 }, { "epoch": 1.3682737565195269, "ewc_loss": 0.05675218254327774, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025502184871584177, "grad_norm": 6.668552875518799, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.872809648513794, "num_tokens": 410590041.0, "step": 10756 }, { "epoch": 1.3684009667981174, "ewc_loss": 0.05675773322582245, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002550773206166923, "grad_norm": 6.69008207321167, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8654367923736572, "num_tokens": 410627703.0, "step": 10757 }, { "epoch": 1.368528177076708, "ewc_loss": 0.056613482534885406, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002536348474677652, "grad_norm": 6.638866901397705, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8562042713165283, "num_tokens": 410662555.0, "step": 10758 }, { "epoch": 1.3686553873552982, "ewc_loss": 0.05674091354012489, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025490912958048284, "grad_norm": 6.682534217834473, "learning_rate": 1e-06, "loss": 0.5175, "mean_token_accuracy": 0.8466935157775879, "num_tokens": 410706026.0, "step": 10759 }, { "epoch": 1.3687825976338888, "ewc_loss": 0.056730348616838455, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002548034826759249, "grad_norm": 6.725188255310059, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8601916432380676, "num_tokens": 410743275.0, "step": 10760 }, { "epoch": 1.3689098079124793, "ewc_loss": 0.05664900317788124, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002539900306146592, "grad_norm": 6.623944282531738, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.872954249382019, "num_tokens": 410781220.0, "step": 10761 }, { "epoch": 1.3690370181910698, "ewc_loss": 0.05681431293487549, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025564312818460166, "grad_norm": 6.733353614807129, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8711110353469849, "num_tokens": 410820014.0, "step": 10762 }, { "epoch": 1.3691642284696603, "ewc_loss": 0.05664893984794617, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025398939033038914, "grad_norm": 6.615960121154785, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8587176203727722, "num_tokens": 410858721.0, "step": 10763 }, { "epoch": 1.3692914387482509, "ewc_loss": 0.056815337389707565, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025565337273292243, "grad_norm": 6.687347412109375, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8695268630981445, "num_tokens": 410898527.0, "step": 10764 }, { "epoch": 1.3694186490268414, "ewc_loss": 0.05670495331287384, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025454952265135944, "grad_norm": 6.67056131362915, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8642835021018982, "num_tokens": 410938426.0, "step": 10765 }, { "epoch": 1.369545859305432, "ewc_loss": 0.056725382804870605, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002547538315411657, "grad_norm": 6.73707389831543, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8688297867774963, "num_tokens": 410969940.0, "step": 10766 }, { "epoch": 1.3696730695840225, "ewc_loss": 0.056720070540905, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002547006879467517, "grad_norm": 6.672658920288086, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8476473689079285, "num_tokens": 411008859.0, "step": 10767 }, { "epoch": 1.3698002798626128, "ewc_loss": 0.05678548663854599, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002553548838477582, "grad_norm": 6.690371036529541, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8775407075881958, "num_tokens": 411043516.0, "step": 10768 }, { "epoch": 1.3699274901412033, "ewc_loss": 0.056704238057136536, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002545423631090671, "grad_norm": 6.69429349899292, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8567541241645813, "num_tokens": 411079092.0, "step": 10769 }, { "epoch": 1.3700547004197938, "ewc_loss": 0.05671996995806694, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002546996984165162, "grad_norm": 6.683640956878662, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8677265644073486, "num_tokens": 411117360.0, "step": 10770 }, { "epoch": 1.3701819106983844, "ewc_loss": 0.056723058223724365, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002547305775806308, "grad_norm": 6.71370792388916, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8530465364456177, "num_tokens": 411153615.0, "step": 10771 }, { "epoch": 1.3703091209769749, "ewc_loss": 0.056744709610939026, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002549470809753984, "grad_norm": 6.67354154586792, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.865429162979126, "num_tokens": 411193571.0, "step": 10772 }, { "epoch": 1.3704363312555654, "ewc_loss": 0.0567186176776886, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002546861651353538, "grad_norm": 6.7076334953308105, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8445460200309753, "num_tokens": 411230427.0, "step": 10773 }, { "epoch": 1.370563541534156, "ewc_loss": 0.056725915521383286, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002547591575421393, "grad_norm": 6.681363105773926, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8623875379562378, "num_tokens": 411271251.0, "step": 10774 }, { "epoch": 1.3706907518127465, "ewc_loss": 0.056653253734111786, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000254032522207126, "grad_norm": 6.722403526306152, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8657175898551941, "num_tokens": 411304906.0, "step": 10775 }, { "epoch": 1.370817962091337, "ewc_loss": 0.056599974632263184, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025349974748678505, "grad_norm": 6.634517192840576, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8807567358016968, "num_tokens": 411341417.0, "step": 10776 }, { "epoch": 1.3709451723699275, "ewc_loss": 0.05669928342103958, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002544928574934602, "grad_norm": 6.7106523513793945, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8655072450637817, "num_tokens": 411382118.0, "step": 10777 }, { "epoch": 1.371072382648518, "ewc_loss": 0.056543782353401184, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025293781072832644, "grad_norm": 6.644819736480713, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8526853919029236, "num_tokens": 411426702.0, "step": 10778 }, { "epoch": 1.3711995929271086, "ewc_loss": 0.05664942413568497, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025399422156624496, "grad_norm": 6.731356620788574, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8628373146057129, "num_tokens": 411461689.0, "step": 10779 }, { "epoch": 1.371326803205699, "ewc_loss": 0.0566263347864151, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002537633408792317, "grad_norm": 6.717177391052246, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.864793062210083, "num_tokens": 411499613.0, "step": 10780 }, { "epoch": 1.3714540134842896, "ewc_loss": 0.05658147111535072, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025331470533274114, "grad_norm": 6.688656330108643, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8621536493301392, "num_tokens": 411538422.0, "step": 10781 }, { "epoch": 1.3715812237628802, "ewc_loss": 0.05662979185581207, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002537979162298143, "grad_norm": 6.723025798797607, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8768926858901978, "num_tokens": 411572401.0, "step": 10782 }, { "epoch": 1.3717084340414705, "ewc_loss": 0.056610435247421265, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025360434665344656, "grad_norm": 6.71796989440918, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8706381320953369, "num_tokens": 411611268.0, "step": 10783 }, { "epoch": 1.371835644320061, "ewc_loss": 0.056635256856679916, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025385257322341204, "grad_norm": 6.7211127281188965, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8624945878982544, "num_tokens": 411648837.0, "step": 10784 }, { "epoch": 1.3719628545986515, "ewc_loss": 0.05656500905752182, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002531500649638474, "grad_norm": 6.707444667816162, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8675395250320435, "num_tokens": 411690327.0, "step": 10785 }, { "epoch": 1.372090064877242, "ewc_loss": 0.05664362013339996, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025393618852831423, "grad_norm": 6.698179721832275, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8626586198806763, "num_tokens": 411729665.0, "step": 10786 }, { "epoch": 1.3722172751558326, "ewc_loss": 0.056563496589660645, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025313496007584035, "grad_norm": 6.678417682647705, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8768334984779358, "num_tokens": 411764716.0, "step": 10787 }, { "epoch": 1.3723444854344231, "ewc_loss": 0.05662853270769119, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000253785343375057, "grad_norm": 6.737976551055908, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8649146556854248, "num_tokens": 411805560.0, "step": 10788 }, { "epoch": 1.3724716957130136, "ewc_loss": 0.05653228610754013, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025282285059802234, "grad_norm": 6.676061630249023, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8672211170196533, "num_tokens": 411841151.0, "step": 10789 }, { "epoch": 1.3725989059916042, "ewc_loss": 0.056666865944862366, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002541686699260026, "grad_norm": 6.762875556945801, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8706668615341187, "num_tokens": 411875088.0, "step": 10790 }, { "epoch": 1.3727261162701947, "ewc_loss": 0.05665472149848938, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002540471905376762, "grad_norm": 6.6874470710754395, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8634443283081055, "num_tokens": 411914554.0, "step": 10791 }, { "epoch": 1.3728533265487852, "ewc_loss": 0.05657213181257248, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002532213111408055, "grad_norm": 6.668838024139404, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8652443885803223, "num_tokens": 411952237.0, "step": 10792 }, { "epoch": 1.3729805368273755, "ewc_loss": 0.05667731910943985, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025427318178117275, "grad_norm": 6.745711326599121, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8578013181686401, "num_tokens": 411989676.0, "step": 10793 }, { "epoch": 1.373107747105966, "ewc_loss": 0.05756482854485512, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002533826627768576, "grad_norm": 36.11766052246094, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8768439292907715, "num_tokens": 412025804.0, "step": 10794 }, { "epoch": 1.3732349573845566, "ewc_loss": 0.08269187808036804, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0005144188180565834, "grad_norm": 10.21194076538086, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8790720701217651, "num_tokens": 412060085.0, "step": 10795 }, { "epoch": 1.3733621676631471, "ewc_loss": 0.056534722447395325, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025284721050411463, "grad_norm": 5.7114763259887695, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8705532550811768, "num_tokens": 412100750.0, "step": 10796 }, { "epoch": 1.3734893779417376, "ewc_loss": 0.07003699243068695, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00038786992081440985, "grad_norm": 9.384028434753418, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.866115391254425, "num_tokens": 412145829.0, "step": 10797 }, { "epoch": 1.3736165882203282, "ewc_loss": 0.07437817752361298, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000431281776400283, "grad_norm": 9.05540657043457, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.869575023651123, "num_tokens": 412187607.0, "step": 10798 }, { "epoch": 1.3737437984989187, "ewc_loss": 0.0619964525103569, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000307464535580948, "grad_norm": 6.9251389503479, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8512070775032043, "num_tokens": 412227214.0, "step": 10799 }, { "epoch": 1.3738710087775092, "ewc_loss": 0.0635443925857544, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0003229438734706491, "grad_norm": 8.185372352600098, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8665629625320435, "num_tokens": 412261929.0, "step": 10800 }, { "epoch": 1.3739982190560998, "ewc_loss": 0.06515797972679138, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00033907979377545416, "grad_norm": 7.682467937469482, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.85960453748703, "num_tokens": 412299441.0, "step": 10801 }, { "epoch": 1.3741254293346903, "ewc_loss": 0.06055622920393944, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002930623013526201, "grad_norm": 7.1982197761535645, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8624846339225769, "num_tokens": 412339248.0, "step": 10802 }, { "epoch": 1.3742526396132808, "ewc_loss": 0.06114036589860916, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00029890367295593023, "grad_norm": 7.4870686531066895, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.863015353679657, "num_tokens": 412377224.0, "step": 10803 }, { "epoch": 1.3743798498918713, "ewc_loss": 0.06039249524474144, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00029142494895495474, "grad_norm": 7.178256988525391, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8424157500267029, "num_tokens": 412415331.0, "step": 10804 }, { "epoch": 1.3745070601704619, "ewc_loss": 0.05934785678982735, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00028097856556996703, "grad_norm": 7.158178806304932, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8632615804672241, "num_tokens": 412455604.0, "step": 10805 }, { "epoch": 1.3746342704490524, "ewc_loss": 0.059144213795661926, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002789421414490789, "grad_norm": 7.0672173500061035, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8706433773040771, "num_tokens": 412489295.0, "step": 10806 }, { "epoch": 1.374761480727643, "ewc_loss": 0.05872557684779167, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027475575916469097, "grad_norm": 7.069812297821045, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.860548198223114, "num_tokens": 412525750.0, "step": 10807 }, { "epoch": 1.3748886910062332, "ewc_loss": 0.05820249021053314, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026952489861287177, "grad_norm": 6.880599498748779, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8732490539550781, "num_tokens": 412567163.0, "step": 10808 }, { "epoch": 1.3750159012848238, "ewc_loss": 0.058139242231845856, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026889241416938603, "grad_norm": 7.001680850982666, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8709512948989868, "num_tokens": 412598789.0, "step": 10809 }, { "epoch": 1.3751431115634143, "ewc_loss": 0.0577983520925045, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026548351161181927, "grad_norm": 6.870827674865723, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8680403828620911, "num_tokens": 412632588.0, "step": 10810 }, { "epoch": 1.3752703218420048, "ewc_loss": 0.05773851275444031, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002648851368576288, "grad_norm": 6.8700175285339355, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.86338210105896, "num_tokens": 412676369.0, "step": 10811 }, { "epoch": 1.3753975321205953, "ewc_loss": 0.05740761011838913, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002615760895423591, "grad_norm": 6.8188605308532715, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8785924315452576, "num_tokens": 412712674.0, "step": 10812 }, { "epoch": 1.3755247423991859, "ewc_loss": 0.05737868696451187, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002612868556752801, "grad_norm": 6.778476238250732, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8585741519927979, "num_tokens": 412753531.0, "step": 10813 }, { "epoch": 1.3756519526777764, "ewc_loss": 0.05719402804970741, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025944027584046125, "grad_norm": 6.7551445960998535, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.861303985118866, "num_tokens": 412799450.0, "step": 10814 }, { "epoch": 1.375779162956367, "ewc_loss": 0.05718974769115448, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025939749320968986, "grad_norm": 6.792702674865723, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8871417045593262, "num_tokens": 412836858.0, "step": 10815 }, { "epoch": 1.3759063732349575, "ewc_loss": 0.057032544165849686, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002578254498075694, "grad_norm": 6.793054580688477, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8671176433563232, "num_tokens": 412871169.0, "step": 10816 }, { "epoch": 1.3760335835135478, "ewc_loss": 0.057013098150491714, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002576309780124575, "grad_norm": 6.764614582061768, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8697335124015808, "num_tokens": 412909281.0, "step": 10817 }, { "epoch": 1.3761607937921383, "ewc_loss": 0.05695068836212158, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025700690457597375, "grad_norm": 9.894747734069824, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8639587759971619, "num_tokens": 412950126.0, "step": 10818 }, { "epoch": 1.3762880040707288, "ewc_loss": 0.060181815177202225, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00028931815177202225, "grad_norm": 7.092039585113525, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8519670963287354, "num_tokens": 412988644.0, "step": 10819 }, { "epoch": 1.3764152143493193, "ewc_loss": 0.056680139154195786, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025430138339288533, "grad_norm": 6.848148345947266, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8498568534851074, "num_tokens": 413028403.0, "step": 10820 }, { "epoch": 1.3765424246279099, "ewc_loss": 0.05724509060382843, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002599509316496551, "grad_norm": 6.840575218200684, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8555883169174194, "num_tokens": 413067429.0, "step": 10821 }, { "epoch": 1.3766696349065004, "ewc_loss": 0.05735546350479126, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002610546362120658, "grad_norm": 6.850881576538086, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8613406419754028, "num_tokens": 413101630.0, "step": 10822 }, { "epoch": 1.376796845185091, "ewc_loss": 0.057026226073503494, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002577622653916478, "grad_norm": 6.7896223068237305, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8576272130012512, "num_tokens": 413147757.0, "step": 10823 }, { "epoch": 1.3769240554636815, "ewc_loss": 0.05704789608716965, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025797897251322865, "grad_norm": 6.734219551086426, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8667194843292236, "num_tokens": 413186803.0, "step": 10824 }, { "epoch": 1.377051265742272, "ewc_loss": 0.05705498903989792, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025804986944422126, "grad_norm": 6.825116157531738, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8582015037536621, "num_tokens": 413227201.0, "step": 10825 }, { "epoch": 1.3771784760208625, "ewc_loss": 0.05692141875624657, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002567141782492399, "grad_norm": 6.724488258361816, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8632140755653381, "num_tokens": 413270594.0, "step": 10826 }, { "epoch": 1.377305686299453, "ewc_loss": 0.05698239058256149, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025732393260113895, "grad_norm": 6.795636177062988, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8459649085998535, "num_tokens": 413307345.0, "step": 10827 }, { "epoch": 1.3774328965780436, "ewc_loss": 0.05698562413454056, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002573562669567764, "grad_norm": 6.75069522857666, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8698920011520386, "num_tokens": 413344310.0, "step": 10828 }, { "epoch": 1.377560106856634, "ewc_loss": 0.056988462805747986, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002573846431914717, "grad_norm": 6.7608489990234375, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8813157081604004, "num_tokens": 413382047.0, "step": 10829 }, { "epoch": 1.3776873171352246, "ewc_loss": 0.056994203478097916, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002574420359451324, "grad_norm": 6.763413429260254, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8620275259017944, "num_tokens": 413424658.0, "step": 10830 }, { "epoch": 1.3778145274138152, "ewc_loss": 0.05690331757068634, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025653315242379904, "grad_norm": 6.801652431488037, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8726590871810913, "num_tokens": 413458386.0, "step": 10831 }, { "epoch": 1.3779417376924055, "ewc_loss": 0.05692562460899353, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002567562332842499, "grad_norm": 6.836818218231201, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8505635857582092, "num_tokens": 413492530.0, "step": 10832 }, { "epoch": 1.378068947970996, "ewc_loss": 0.056901611387729645, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002565160975791514, "grad_norm": 6.756499290466309, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8752306699752808, "num_tokens": 413528894.0, "step": 10833 }, { "epoch": 1.3781961582495865, "ewc_loss": 0.05688367784023285, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000256336759775877, "grad_norm": 9.901838302612305, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8599016070365906, "num_tokens": 413564765.0, "step": 10834 }, { "epoch": 1.378323368528177, "ewc_loss": 0.06003311276435852, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00028783114976249635, "grad_norm": 7.131440162658691, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8382328748703003, "num_tokens": 413598122.0, "step": 10835 }, { "epoch": 1.3784505788067676, "ewc_loss": 0.056616976857185364, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025366974296048284, "grad_norm": 6.7855544090271, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.84760582447052, "num_tokens": 413634519.0, "step": 10836 }, { "epoch": 1.378577789085358, "ewc_loss": 0.05719441920518875, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002594442048575729, "grad_norm": 6.82203483581543, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8564395904541016, "num_tokens": 413674021.0, "step": 10837 }, { "epoch": 1.3787049993639486, "ewc_loss": 0.0572979599237442, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026047960272990167, "grad_norm": 6.84433650970459, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8694698810577393, "num_tokens": 413713804.0, "step": 10838 }, { "epoch": 1.3788322096425392, "ewc_loss": 0.05701489374041557, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002576489350758493, "grad_norm": 6.720489025115967, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.870662271976471, "num_tokens": 413755665.0, "step": 10839 }, { "epoch": 1.3789594199211297, "ewc_loss": 0.05711394175887108, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002586394257377833, "grad_norm": 6.80285120010376, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8627325892448425, "num_tokens": 413790872.0, "step": 10840 }, { "epoch": 1.3790866301997202, "ewc_loss": 0.057080432772636414, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002583043242339045, "grad_norm": 6.857377052307129, "learning_rate": 1e-06, "loss": 0.3919, "mean_token_accuracy": 0.8829972743988037, "num_tokens": 413830798.0, "step": 10841 }, { "epoch": 1.3792138404783105, "ewc_loss": 0.05701758712530136, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002576758561190218, "grad_norm": 6.749073505401611, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8685413599014282, "num_tokens": 413865960.0, "step": 10842 }, { "epoch": 1.379341050756901, "ewc_loss": 0.057046905159950256, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002579690481070429, "grad_norm": 6.839776039123535, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8739109039306641, "num_tokens": 413898222.0, "step": 10843 }, { "epoch": 1.3794682610354916, "ewc_loss": 0.05694685876369476, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025696857483126223, "grad_norm": 6.765989303588867, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8582463264465332, "num_tokens": 413931839.0, "step": 10844 }, { "epoch": 1.3795954713140821, "ewc_loss": 0.05696587264537811, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025715871015563607, "grad_norm": 6.825126647949219, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8542634844779968, "num_tokens": 413963815.0, "step": 10845 }, { "epoch": 1.3797226815926726, "ewc_loss": 0.0568310022354126, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025581003865227103, "grad_norm": 6.721168041229248, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.871706485748291, "num_tokens": 414002476.0, "step": 10846 }, { "epoch": 1.3798498918712632, "ewc_loss": 0.05701817572116852, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025768173509277403, "grad_norm": 6.765625, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8591617345809937, "num_tokens": 414041121.0, "step": 10847 }, { "epoch": 1.3799771021498537, "ewc_loss": 0.056938961148262024, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002568896161392331, "grad_norm": 6.757851600646973, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8515480756759644, "num_tokens": 414080447.0, "step": 10848 }, { "epoch": 1.3801043124284442, "ewc_loss": 0.05692647397518158, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002567647607065737, "grad_norm": 6.834129333496094, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8525428771972656, "num_tokens": 414108925.0, "step": 10849 }, { "epoch": 1.3802315227070348, "ewc_loss": 0.056953370571136475, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002570337092038244, "grad_norm": 6.647087574005127, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8683288097381592, "num_tokens": 414154296.0, "step": 10850 }, { "epoch": 1.3803587329856253, "ewc_loss": 0.05701446533203125, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025764465681277215, "grad_norm": 6.705842971801758, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8524750471115112, "num_tokens": 414194612.0, "step": 10851 }, { "epoch": 1.3804859432642158, "ewc_loss": 0.05695980042219162, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002570979995653033, "grad_norm": 6.728573322296143, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8677830100059509, "num_tokens": 414229939.0, "step": 10852 }, { "epoch": 1.3806131535428063, "ewc_loss": 0.05703424662351608, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002578424755483866, "grad_norm": 6.741703033447266, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.860170304775238, "num_tokens": 414265193.0, "step": 10853 }, { "epoch": 1.3807403638213969, "ewc_loss": 0.056960996240377426, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025710996123962104, "grad_norm": 6.728214740753174, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8603396415710449, "num_tokens": 414301471.0, "step": 10854 }, { "epoch": 1.3808675740999874, "ewc_loss": 0.057062968611717224, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002581297012511641, "grad_norm": 6.784478664398193, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.8361164331436157, "num_tokens": 414338755.0, "step": 10855 }, { "epoch": 1.380994784378578, "ewc_loss": 0.056849345564842224, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002559934800956398, "grad_norm": 6.655878067016602, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8659496307373047, "num_tokens": 414379996.0, "step": 10856 }, { "epoch": 1.3811219946571682, "ewc_loss": 0.05711880326271057, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025868802913464606, "grad_norm": 6.783249855041504, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8656428456306458, "num_tokens": 414421905.0, "step": 10857 }, { "epoch": 1.3812492049357588, "ewc_loss": 0.05693419277667999, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002568419149611145, "grad_norm": 6.649101734161377, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8691272735595703, "num_tokens": 414465149.0, "step": 10858 }, { "epoch": 1.3813764152143493, "ewc_loss": 0.0571005716919899, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002585056936368346, "grad_norm": 6.726316452026367, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8737947344779968, "num_tokens": 414504086.0, "step": 10859 }, { "epoch": 1.3815036254929398, "ewc_loss": 0.05693141371011734, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025681412080302835, "grad_norm": 6.716907978057861, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8477901220321655, "num_tokens": 414545392.0, "step": 10860 }, { "epoch": 1.3816308357715303, "ewc_loss": 0.05712134391069412, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002587134367786348, "grad_norm": 6.768453121185303, "learning_rate": 1e-06, "loss": 0.563, "mean_token_accuracy": 0.8293896913528442, "num_tokens": 414586445.0, "step": 10861 }, { "epoch": 1.3817580460501209, "ewc_loss": 0.05701926350593567, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002576926490291953, "grad_norm": 6.768768310546875, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8731731176376343, "num_tokens": 414618467.0, "step": 10862 }, { "epoch": 1.3818852563287114, "ewc_loss": 0.056946080178022385, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002569608041085303, "grad_norm": 6.698092937469482, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8736851215362549, "num_tokens": 414651253.0, "step": 10863 }, { "epoch": 1.382012466607302, "ewc_loss": 0.05701747164130211, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025767472106963396, "grad_norm": 6.752564907073975, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8636322021484375, "num_tokens": 414692016.0, "step": 10864 }, { "epoch": 1.3821396768858925, "ewc_loss": 0.057020243257284164, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025770242791622877, "grad_norm": 6.735581398010254, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8693227767944336, "num_tokens": 414731840.0, "step": 10865 }, { "epoch": 1.3822668871644828, "ewc_loss": 0.056944943964481354, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025694945361465216, "grad_norm": 6.680553913116455, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8658316135406494, "num_tokens": 414767110.0, "step": 10866 }, { "epoch": 1.3823940974430733, "ewc_loss": 0.057020410895347595, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025770411593839526, "grad_norm": 6.777692794799805, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8682702779769897, "num_tokens": 414799340.0, "step": 10867 }, { "epoch": 1.3825213077216638, "ewc_loss": 0.05686246231198311, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025612462195567787, "grad_norm": 6.663641929626465, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8676362037658691, "num_tokens": 414840915.0, "step": 10868 }, { "epoch": 1.3826485180002543, "ewc_loss": 0.05697335675358772, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002572335652075708, "grad_norm": 6.7388153076171875, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8517165184020996, "num_tokens": 414877298.0, "step": 10869 }, { "epoch": 1.3827757282788449, "ewc_loss": 0.056980691850185394, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025730690686032176, "grad_norm": 6.71030330657959, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8705421090126038, "num_tokens": 414916431.0, "step": 10870 }, { "epoch": 1.3829029385574354, "ewc_loss": 0.056963637471199036, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002571363584138453, "grad_norm": 6.734269142150879, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8617200255393982, "num_tokens": 414958148.0, "step": 10871 }, { "epoch": 1.383030148836026, "ewc_loss": 0.0569872222840786, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025737221585586667, "grad_norm": 6.65791654586792, "learning_rate": 1e-06, "loss": 0.3787, "mean_token_accuracy": 0.8897445201873779, "num_tokens": 414997008.0, "step": 10872 }, { "epoch": 1.3831573591146165, "ewc_loss": 0.05700862780213356, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025758627452887595, "grad_norm": 6.75504207611084, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8512860536575317, "num_tokens": 415042893.0, "step": 10873 }, { "epoch": 1.383284569393207, "ewc_loss": 0.05699533596634865, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025745335733518004, "grad_norm": 6.721284866333008, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.862043559551239, "num_tokens": 415077476.0, "step": 10874 }, { "epoch": 1.3834117796717975, "ewc_loss": 0.05698268860578537, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025732690119184554, "grad_norm": 6.683860778808594, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8597230911254883, "num_tokens": 415124329.0, "step": 10875 }, { "epoch": 1.383538989950388, "ewc_loss": 0.05705436319112778, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025804361212067306, "grad_norm": 6.789044380187988, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8760703206062317, "num_tokens": 415164101.0, "step": 10876 }, { "epoch": 1.3836662002289786, "ewc_loss": 0.05687219649553299, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025622197426855564, "grad_norm": 6.672024726867676, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8652389049530029, "num_tokens": 415203792.0, "step": 10877 }, { "epoch": 1.383793410507569, "ewc_loss": 0.057062745094299316, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002581274602562189, "grad_norm": 6.8295769691467285, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8606119751930237, "num_tokens": 415233754.0, "step": 10878 }, { "epoch": 1.3839206207861596, "ewc_loss": 0.056864723563194275, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025614723563194275, "grad_norm": 6.8028645515441895, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8725422620773315, "num_tokens": 415269567.0, "step": 10879 }, { "epoch": 1.3840478310647502, "ewc_loss": 0.05694270506501198, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025692704366520047, "grad_norm": 6.667473793029785, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8668228387832642, "num_tokens": 415312660.0, "step": 10880 }, { "epoch": 1.3841750413433405, "ewc_loss": 0.05686208978295326, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002561208966653794, "grad_norm": 6.681077003479004, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8749831318855286, "num_tokens": 415355377.0, "step": 10881 }, { "epoch": 1.384302251621931, "ewc_loss": 0.0569545179605484, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025704517611302435, "grad_norm": 6.706966876983643, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8758262395858765, "num_tokens": 415393109.0, "step": 10882 }, { "epoch": 1.3844294619005215, "ewc_loss": 0.05689378082752228, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002564378082752228, "grad_norm": 6.6457037925720215, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8763267993927002, "num_tokens": 415428650.0, "step": 10883 }, { "epoch": 1.384556672179112, "ewc_loss": 0.05699682608246803, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002574682584963739, "grad_norm": 6.737455368041992, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.861372709274292, "num_tokens": 415467166.0, "step": 10884 }, { "epoch": 1.3846838824577026, "ewc_loss": 0.05692267417907715, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002567267220001668, "grad_norm": 6.6427435874938965, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8772286772727966, "num_tokens": 415505633.0, "step": 10885 }, { "epoch": 1.384811092736293, "ewc_loss": 0.05712080001831055, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025870802346616983, "grad_norm": 6.792166233062744, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8640853762626648, "num_tokens": 415546254.0, "step": 10886 }, { "epoch": 1.3849383030148836, "ewc_loss": 0.05697672441601753, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025726723833940923, "grad_norm": 6.7042436599731445, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8630843162536621, "num_tokens": 415584999.0, "step": 10887 }, { "epoch": 1.3850655132934742, "ewc_loss": 0.05704726278781891, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002579726278781891, "grad_norm": 6.7233428955078125, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.865784764289856, "num_tokens": 415624608.0, "step": 10888 }, { "epoch": 1.3851927235720647, "ewc_loss": 0.057001665234565735, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025751665816642344, "grad_norm": 6.7666168212890625, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8501724004745483, "num_tokens": 415663405.0, "step": 10889 }, { "epoch": 1.385319933850655, "ewc_loss": 0.056970469653606415, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025720472331158817, "grad_norm": 6.740506172180176, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8789862990379333, "num_tokens": 415698934.0, "step": 10890 }, { "epoch": 1.3854471441292455, "ewc_loss": 0.05698651820421219, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002573652018327266, "grad_norm": 6.752469062805176, "learning_rate": 1e-06, "loss": 0.5442, "mean_token_accuracy": 0.835553765296936, "num_tokens": 415736973.0, "step": 10891 }, { "epoch": 1.385574354407836, "ewc_loss": 0.05698900297284126, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002573900274001062, "grad_norm": 6.755588054656982, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8672835826873779, "num_tokens": 415778364.0, "step": 10892 }, { "epoch": 1.3857015646864266, "ewc_loss": 0.05698364973068237, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002573364763520658, "grad_norm": 6.694911479949951, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8805522918701172, "num_tokens": 415817502.0, "step": 10893 }, { "epoch": 1.385828774965017, "ewc_loss": 0.05709366500377655, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025843665935099125, "grad_norm": 6.7082600593566895, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8597137928009033, "num_tokens": 415860364.0, "step": 10894 }, { "epoch": 1.3859559852436076, "ewc_loss": 0.056980181485414505, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025730181368999183, "grad_norm": 6.776200294494629, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8671649694442749, "num_tokens": 415896816.0, "step": 10895 }, { "epoch": 1.3860831955221982, "ewc_loss": 0.057023536413908005, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002577353734523058, "grad_norm": 6.766324996948242, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.846472978591919, "num_tokens": 415935173.0, "step": 10896 }, { "epoch": 1.3862104058007887, "ewc_loss": 0.057045236229896545, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025795234250836074, "grad_norm": 6.7357401847839355, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8740373253822327, "num_tokens": 415977043.0, "step": 10897 }, { "epoch": 1.3863376160793792, "ewc_loss": 0.05700182914733887, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000257518287980929, "grad_norm": 6.761070728302002, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8734976649284363, "num_tokens": 416014376.0, "step": 10898 }, { "epoch": 1.3864648263579697, "ewc_loss": 0.0569952130317688, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002574521058704704, "grad_norm": 6.761516094207764, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8523924946784973, "num_tokens": 416055648.0, "step": 10899 }, { "epoch": 1.3865920366365603, "ewc_loss": 0.0570160336792469, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025766034377738833, "grad_norm": 6.819185256958008, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.857090175151825, "num_tokens": 416087062.0, "step": 10900 }, { "epoch": 1.3867192469151508, "ewc_loss": 0.05695410072803497, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002570409851614386, "grad_norm": 6.7084527015686035, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8707271814346313, "num_tokens": 416122559.0, "step": 10901 }, { "epoch": 1.3868464571937413, "ewc_loss": 0.05708625167608261, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002583625027909875, "grad_norm": 6.7987494468688965, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8564764261245728, "num_tokens": 416155228.0, "step": 10902 }, { "epoch": 1.3869736674723319, "ewc_loss": 0.05695775896310806, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025707759778015316, "grad_norm": 6.751319408416748, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.850514829158783, "num_tokens": 416191190.0, "step": 10903 }, { "epoch": 1.3871008777509224, "ewc_loss": 0.05708988010883331, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025839879526756704, "grad_norm": 6.773794651031494, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8681623935699463, "num_tokens": 416228009.0, "step": 10904 }, { "epoch": 1.387228088029513, "ewc_loss": 0.05690620094537735, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002565620234236121, "grad_norm": 6.716133117675781, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8606129884719849, "num_tokens": 416263657.0, "step": 10905 }, { "epoch": 1.3873552983081032, "ewc_loss": 0.05699598789215088, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025745987659320235, "grad_norm": 6.768362045288086, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8574090600013733, "num_tokens": 416305187.0, "step": 10906 }, { "epoch": 1.3874825085866938, "ewc_loss": 0.05697367340326309, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002572367375250906, "grad_norm": 6.729429244995117, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8612769842147827, "num_tokens": 416342578.0, "step": 10907 }, { "epoch": 1.3876097188652843, "ewc_loss": 0.056914083659648895, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025664083659648895, "grad_norm": 6.762996196746826, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8603991270065308, "num_tokens": 416378969.0, "step": 10908 }, { "epoch": 1.3877369291438748, "ewc_loss": 0.057043321430683136, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025793322129175067, "grad_norm": 6.722183704376221, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8627476692199707, "num_tokens": 416412958.0, "step": 10909 }, { "epoch": 1.3878641394224653, "ewc_loss": 0.056969158351421356, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002571915974840522, "grad_norm": 6.746914863586426, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8451732397079468, "num_tokens": 416445643.0, "step": 10910 }, { "epoch": 1.3879913497010559, "ewc_loss": 0.056971289217472076, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002572129014879465, "grad_norm": 6.718028545379639, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8502330183982849, "num_tokens": 416484891.0, "step": 10911 }, { "epoch": 1.3881185599796464, "ewc_loss": 0.05703945457935333, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025789457140490413, "grad_norm": 6.79715633392334, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8509079813957214, "num_tokens": 416520504.0, "step": 10912 }, { "epoch": 1.388245770258237, "ewc_loss": 0.05696277320384979, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025712771457619965, "grad_norm": 6.617680072784424, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8636760711669922, "num_tokens": 416561877.0, "step": 10913 }, { "epoch": 1.3883729805368275, "ewc_loss": 0.05715110898017883, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025901111075654626, "grad_norm": 6.746621608734131, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8645337820053101, "num_tokens": 416597676.0, "step": 10914 }, { "epoch": 1.3885001908154178, "ewc_loss": 0.05698677897453308, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002573677629698068, "grad_norm": 6.698078155517578, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8704111576080322, "num_tokens": 416630557.0, "step": 10915 }, { "epoch": 1.3886274010940083, "ewc_loss": 0.05710986256599426, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025859862216748297, "grad_norm": 6.7406086921691895, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8656685948371887, "num_tokens": 416668604.0, "step": 10916 }, { "epoch": 1.3887546113725988, "ewc_loss": 0.057060785591602325, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025810787337832153, "grad_norm": 6.675608158111572, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8441035747528076, "num_tokens": 416710246.0, "step": 10917 }, { "epoch": 1.3888818216511893, "ewc_loss": 0.05711642652750015, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025866428040899336, "grad_norm": 6.732478618621826, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8642749190330505, "num_tokens": 416744962.0, "step": 10918 }, { "epoch": 1.3890090319297799, "ewc_loss": 0.05714087933301926, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002589087816886604, "grad_norm": 6.736875057220459, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8650227785110474, "num_tokens": 416779483.0, "step": 10919 }, { "epoch": 1.3891362422083704, "ewc_loss": 0.05714457109570503, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025894571444951, "grad_norm": 6.701417446136475, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8764936923980713, "num_tokens": 416822135.0, "step": 10920 }, { "epoch": 1.389263452486961, "ewc_loss": 0.05711016431450844, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025860164896585047, "grad_norm": 6.730199337005615, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.861920177936554, "num_tokens": 416860383.0, "step": 10921 }, { "epoch": 1.3893906627655515, "ewc_loss": 0.057120904326438904, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002587090421002358, "grad_norm": 6.7251973152160645, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8592550158500671, "num_tokens": 416896960.0, "step": 10922 }, { "epoch": 1.389517873044142, "ewc_loss": 0.057066626846790314, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002581662847660482, "grad_norm": 6.70233678817749, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8754696846008301, "num_tokens": 416931981.0, "step": 10923 }, { "epoch": 1.3896450833227325, "ewc_loss": 0.05714765563607216, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002589765645097941, "grad_norm": 6.765347480773926, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8724531531333923, "num_tokens": 416968179.0, "step": 10924 }, { "epoch": 1.389772293601323, "ewc_loss": 0.057043563574552536, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002579356369096786, "grad_norm": 6.650557041168213, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8684549927711487, "num_tokens": 417008879.0, "step": 10925 }, { "epoch": 1.3898995038799136, "ewc_loss": 0.057154104113578796, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002590410294942558, "grad_norm": 6.753917217254639, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8743458986282349, "num_tokens": 417042497.0, "step": 10926 }, { "epoch": 1.390026714158504, "ewc_loss": 0.057025521993637085, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002577552222646773, "grad_norm": 6.678665637969971, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8639310002326965, "num_tokens": 417084396.0, "step": 10927 }, { "epoch": 1.3901539244370946, "ewc_loss": 0.05713241547346115, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025882417685352266, "grad_norm": 6.727513313293457, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8764889240264893, "num_tokens": 417121014.0, "step": 10928 }, { "epoch": 1.3902811347156852, "ewc_loss": 0.05710095167160034, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025850950623862445, "grad_norm": 6.710875988006592, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8615763187408447, "num_tokens": 417160169.0, "step": 10929 }, { "epoch": 1.3904083449942755, "ewc_loss": 0.05714043974876404, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002589044161140919, "grad_norm": 6.761986255645752, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.848366916179657, "num_tokens": 417195024.0, "step": 10930 }, { "epoch": 1.390535555272866, "ewc_loss": 0.05700370669364929, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025753708905540407, "grad_norm": 6.70574426651001, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8700305819511414, "num_tokens": 417231344.0, "step": 10931 }, { "epoch": 1.3906627655514565, "ewc_loss": 0.05709230154752731, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002584230387583375, "grad_norm": 6.720763206481934, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8545958995819092, "num_tokens": 417270266.0, "step": 10932 }, { "epoch": 1.390789975830047, "ewc_loss": 0.05702962353825569, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002577962295617908, "grad_norm": 6.697272777557373, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8618542551994324, "num_tokens": 417308292.0, "step": 10933 }, { "epoch": 1.3909171861086376, "ewc_loss": 0.05708961933851242, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002583962050266564, "grad_norm": 6.7190423011779785, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8607169389724731, "num_tokens": 417346358.0, "step": 10934 }, { "epoch": 1.391044396387228, "ewc_loss": 0.05715854465961456, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002590854710433632, "grad_norm": 6.738525867462158, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8639997243881226, "num_tokens": 417381924.0, "step": 10935 }, { "epoch": 1.3911716066658186, "ewc_loss": 0.05705876648426056, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025808767531998456, "grad_norm": 6.7438812255859375, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8377352952957153, "num_tokens": 417417779.0, "step": 10936 }, { "epoch": 1.3912988169444092, "ewc_loss": 0.057154301553964615, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025904300855472684, "grad_norm": 6.766218185424805, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8559457063674927, "num_tokens": 417459076.0, "step": 10937 }, { "epoch": 1.3914260272229997, "ewc_loss": 0.057074032723903656, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002582403540145606, "grad_norm": 6.659294128417969, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8634434938430786, "num_tokens": 417499422.0, "step": 10938 }, { "epoch": 1.39155323750159, "ewc_loss": 0.05712645500898361, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002587645431049168, "grad_norm": 6.790746688842773, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8640689253807068, "num_tokens": 417541776.0, "step": 10939 }, { "epoch": 1.3916804477801805, "ewc_loss": 0.05705779790878296, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025807798374444246, "grad_norm": 6.782304763793945, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8518142104148865, "num_tokens": 417576469.0, "step": 10940 }, { "epoch": 1.391807658058771, "ewc_loss": 0.0570579394698143, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002580793807283044, "grad_norm": 6.722193717956543, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8646427392959595, "num_tokens": 417611726.0, "step": 10941 }, { "epoch": 1.3919348683373616, "ewc_loss": 0.05696021765470505, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025710216141305864, "grad_norm": 6.786970615386963, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8564454317092896, "num_tokens": 417650135.0, "step": 10942 }, { "epoch": 1.392062078615952, "ewc_loss": 0.05701211094856262, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002576210827101022, "grad_norm": 6.825869560241699, "learning_rate": 1e-06, "loss": 0.5092, "mean_token_accuracy": 0.8449680805206299, "num_tokens": 417690981.0, "step": 10943 }, { "epoch": 1.3921892888945426, "ewc_loss": 0.05690942704677582, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002565942704677582, "grad_norm": 6.722434043884277, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.850174069404602, "num_tokens": 417732152.0, "step": 10944 }, { "epoch": 1.3923164991731332, "ewc_loss": 0.05690601468086243, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025656013167463243, "grad_norm": 6.848282814025879, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8535625338554382, "num_tokens": 417771814.0, "step": 10945 }, { "epoch": 1.3924437094517237, "ewc_loss": 0.0568871945142746, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002563719463068992, "grad_norm": 6.754504203796387, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.852515697479248, "num_tokens": 417810663.0, "step": 10946 }, { "epoch": 1.3925709197303142, "ewc_loss": 0.05689995363354683, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002564995374996215, "grad_norm": 6.803997993469238, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8677308559417725, "num_tokens": 417845664.0, "step": 10947 }, { "epoch": 1.3926981300089047, "ewc_loss": 0.056790247559547424, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025540246861055493, "grad_norm": 6.747182369232178, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8553338646888733, "num_tokens": 417889617.0, "step": 10948 }, { "epoch": 1.3928253402874953, "ewc_loss": 0.05688992515206337, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002563992456998676, "grad_norm": 6.769833564758301, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8603266477584839, "num_tokens": 417930820.0, "step": 10949 }, { "epoch": 1.3929525505660858, "ewc_loss": 0.056755661964416504, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025505662779323757, "grad_norm": 6.7055864334106445, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8637464046478271, "num_tokens": 417966799.0, "step": 10950 }, { "epoch": 1.3930797608446763, "ewc_loss": 0.05692543089389801, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002567542833276093, "grad_norm": 6.785916805267334, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8548209071159363, "num_tokens": 418009328.0, "step": 10951 }, { "epoch": 1.3932069711232669, "ewc_loss": 0.05678582936525345, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025535831809975207, "grad_norm": 6.696474552154541, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8656711578369141, "num_tokens": 418048548.0, "step": 10952 }, { "epoch": 1.3933341814018574, "ewc_loss": 0.05690629035234451, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002565628965385258, "grad_norm": 6.773906707763672, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8606474995613098, "num_tokens": 418094947.0, "step": 10953 }, { "epoch": 1.393461391680448, "ewc_loss": 0.05688078701496124, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025630785967223346, "grad_norm": 6.745763778686523, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8531197309494019, "num_tokens": 418137144.0, "step": 10954 }, { "epoch": 1.3935886019590382, "ewc_loss": 0.056923504918813705, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002567350456956774, "grad_norm": 6.751590728759766, "learning_rate": 1e-06, "loss": 0.3887, "mean_token_accuracy": 0.8844822645187378, "num_tokens": 418171449.0, "step": 10955 }, { "epoch": 1.3937158122376287, "ewc_loss": 0.056797660887241364, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002554766251705587, "grad_norm": 6.691012382507324, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8652975559234619, "num_tokens": 418209854.0, "step": 10956 }, { "epoch": 1.3938430225162193, "ewc_loss": 0.05693754553794861, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025687547167763114, "grad_norm": 6.819189071655273, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.870611846446991, "num_tokens": 418245121.0, "step": 10957 }, { "epoch": 1.3939702327948098, "ewc_loss": 0.05680181831121445, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002555181854404509, "grad_norm": 6.651286602020264, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8672935962677002, "num_tokens": 418284158.0, "step": 10958 }, { "epoch": 1.3940974430734003, "ewc_loss": 0.05700118839740753, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025751188513822854, "grad_norm": 6.798932075500488, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8699555397033691, "num_tokens": 418324091.0, "step": 10959 }, { "epoch": 1.3942246533519909, "ewc_loss": 0.05676911771297455, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002551912039052695, "grad_norm": 6.7142534255981445, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8631642460823059, "num_tokens": 418364764.0, "step": 10960 }, { "epoch": 1.3943518636305814, "ewc_loss": 0.05693330615758896, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002568330382928252, "grad_norm": 6.739999771118164, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8606833219528198, "num_tokens": 418403974.0, "step": 10961 }, { "epoch": 1.394479073909172, "ewc_loss": 0.056857794523239136, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025607793941162527, "grad_norm": 6.70184850692749, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8523037433624268, "num_tokens": 418446341.0, "step": 10962 }, { "epoch": 1.3946062841877624, "ewc_loss": 0.05699058622121811, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002574058889877051, "grad_norm": 6.727563858032227, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8629872798919678, "num_tokens": 418490989.0, "step": 10963 }, { "epoch": 1.3947334944663528, "ewc_loss": 0.056897517293691635, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002564751775935292, "grad_norm": 6.77721643447876, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8577711582183838, "num_tokens": 418524633.0, "step": 10964 }, { "epoch": 1.3948607047449433, "ewc_loss": 0.05681449547410011, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025564496172592044, "grad_norm": 6.682612895965576, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8628016710281372, "num_tokens": 418562576.0, "step": 10965 }, { "epoch": 1.3949879150235338, "ewc_loss": 0.05703570693731308, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025785708567127585, "grad_norm": 6.815159797668457, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8614375591278076, "num_tokens": 418596537.0, "step": 10966 }, { "epoch": 1.3951151253021243, "ewc_loss": 0.05686113238334656, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002561113506089896, "grad_norm": 6.7524333000183105, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8605304956436157, "num_tokens": 418625800.0, "step": 10967 }, { "epoch": 1.3952423355807149, "ewc_loss": 0.05703829601407051, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002578829589765519, "grad_norm": 6.731837749481201, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8487217426300049, "num_tokens": 418663818.0, "step": 10968 }, { "epoch": 1.3953695458593054, "ewc_loss": 0.05692823976278305, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002567823976278305, "grad_norm": 6.752289295196533, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8679331541061401, "num_tokens": 418700607.0, "step": 10969 }, { "epoch": 1.395496756137896, "ewc_loss": 0.05699463188648224, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025744634331203997, "grad_norm": 6.748095989227295, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8571212887763977, "num_tokens": 418739357.0, "step": 10970 }, { "epoch": 1.3956239664164865, "ewc_loss": 0.05692417174577713, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000256741710472852, "grad_norm": 6.758486270904541, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8688661456108093, "num_tokens": 418769752.0, "step": 10971 }, { "epoch": 1.395751176695077, "ewc_loss": 0.05699218064546585, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025742180878296494, "grad_norm": 6.777825355529785, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.843574047088623, "num_tokens": 418808196.0, "step": 10972 }, { "epoch": 1.3958783869736675, "ewc_loss": 0.056968554854393005, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025718557299114764, "grad_norm": 6.7378644943237305, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.878316342830658, "num_tokens": 418843335.0, "step": 10973 }, { "epoch": 1.396005597252258, "ewc_loss": 0.05695698410272598, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002570698270574212, "grad_norm": 6.723730564117432, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.871371865272522, "num_tokens": 418876591.0, "step": 10974 }, { "epoch": 1.3961328075308486, "ewc_loss": 0.056991659104824066, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025741657009348273, "grad_norm": 6.761897563934326, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8511451482772827, "num_tokens": 418910482.0, "step": 10975 }, { "epoch": 1.396260017809439, "ewc_loss": 0.05698380991816521, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002573381061665714, "grad_norm": 6.7121052742004395, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8541429042816162, "num_tokens": 418942248.0, "step": 10976 }, { "epoch": 1.3963872280880296, "ewc_loss": 0.057009968906641006, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002575996913947165, "grad_norm": 6.740033149719238, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8678117394447327, "num_tokens": 418977770.0, "step": 10977 }, { "epoch": 1.3965144383666201, "ewc_loss": 0.05697224289178848, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002572224475443363, "grad_norm": 6.780721187591553, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8554784059524536, "num_tokens": 419016464.0, "step": 10978 }, { "epoch": 1.3966416486452105, "ewc_loss": 0.05697663873434067, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000257266394328326, "grad_norm": 6.690496921539307, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8547035455703735, "num_tokens": 419053254.0, "step": 10979 }, { "epoch": 1.396768858923801, "ewc_loss": 0.05728216469287872, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025788022321648896, "grad_norm": 6.795274257659912, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8556379675865173, "num_tokens": 419088940.0, "step": 10980 }, { "epoch": 1.3968960692023915, "ewc_loss": 0.05717006325721741, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002567592018749565, "grad_norm": 6.716139316558838, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8614711165428162, "num_tokens": 419127897.0, "step": 10981 }, { "epoch": 1.397023279480982, "ewc_loss": 0.05728530138731003, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002579116262495518, "grad_norm": 6.726679801940918, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8650169372558594, "num_tokens": 419170921.0, "step": 10982 }, { "epoch": 1.3971504897595726, "ewc_loss": 0.057183653116226196, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025689511676318944, "grad_norm": 6.731647968292236, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8695313930511475, "num_tokens": 419206032.0, "step": 10983 }, { "epoch": 1.397277700038163, "ewc_loss": 0.05725519359111786, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002576105180196464, "grad_norm": 6.7448577880859375, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8781027793884277, "num_tokens": 419244240.0, "step": 10984 }, { "epoch": 1.3974049103167536, "ewc_loss": 0.05722472816705704, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002573058591224253, "grad_norm": 6.788112163543701, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8717746734619141, "num_tokens": 419274902.0, "step": 10985 }, { "epoch": 1.3975321205953442, "ewc_loss": 0.057168327271938324, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002567418559920043, "grad_norm": 6.742206573486328, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8753626942634583, "num_tokens": 419312295.0, "step": 10986 }, { "epoch": 1.3976593308739347, "ewc_loss": 0.05722644180059433, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025732300127856433, "grad_norm": 6.835829257965088, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8462944626808167, "num_tokens": 419350999.0, "step": 10987 }, { "epoch": 1.397786541152525, "ewc_loss": 0.057151615619659424, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002565747417975217, "grad_norm": 6.706068515777588, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8636049032211304, "num_tokens": 419386739.0, "step": 10988 }, { "epoch": 1.3979137514311155, "ewc_loss": 0.05728936567902565, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002579522551968694, "grad_norm": 6.802549839019775, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8696197867393494, "num_tokens": 419430353.0, "step": 10989 }, { "epoch": 1.398040961709706, "ewc_loss": 0.05714643746614456, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002565229660831392, "grad_norm": 6.8047966957092285, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8587615489959717, "num_tokens": 419462910.0, "step": 10990 }, { "epoch": 1.3981681719882966, "ewc_loss": 0.05714210867881775, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025647968868725, "grad_norm": 6.6970109939575195, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8726869225502014, "num_tokens": 419498727.0, "step": 10991 }, { "epoch": 1.398295382266887, "ewc_loss": 0.05720680207014084, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025712663773447275, "grad_norm": 6.867554187774658, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.855809211730957, "num_tokens": 419534496.0, "step": 10992 }, { "epoch": 1.3984225925454776, "ewc_loss": 0.05704818293452263, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025554042076691985, "grad_norm": 6.6695756912231445, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8589786291122437, "num_tokens": 419575760.0, "step": 10993 }, { "epoch": 1.3985498028240682, "ewc_loss": 0.05724004656076431, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002574590325821191, "grad_norm": 6.759187698364258, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8710798025131226, "num_tokens": 419616822.0, "step": 10994 }, { "epoch": 1.3986770131026587, "ewc_loss": 0.057129934430122375, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025635791826061904, "grad_norm": 6.760594844818115, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.870506763458252, "num_tokens": 419650425.0, "step": 10995 }, { "epoch": 1.3988042233812492, "ewc_loss": 0.057122960686683655, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025628821458667517, "grad_norm": 6.702907085418701, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8694413900375366, "num_tokens": 419685501.0, "step": 10996 }, { "epoch": 1.3989314336598397, "ewc_loss": 0.05711011216044426, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002561597211752087, "grad_norm": 6.7161760330200195, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8667749166488647, "num_tokens": 419724471.0, "step": 10997 }, { "epoch": 1.3990586439384303, "ewc_loss": 0.057151660323143005, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000256575207458809, "grad_norm": 6.838714122772217, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8703884482383728, "num_tokens": 419763683.0, "step": 10998 }, { "epoch": 1.3991858542170208, "ewc_loss": 0.057072773575782776, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025578634813427925, "grad_norm": 6.734740734100342, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8669103384017944, "num_tokens": 419801238.0, "step": 10999 }, { "epoch": 1.3993130644956113, "ewc_loss": 0.0568748377263546, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002562483714427799, "grad_norm": 6.775193214416504, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.859796941280365, "num_tokens": 419832787.0, "step": 11000 }, { "epoch": 1.3994402747742019, "ewc_loss": 0.056887708604335785, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002563770685810596, "grad_norm": 6.820968151092529, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.852972686290741, "num_tokens": 419864294.0, "step": 11001 }, { "epoch": 1.3995674850527924, "ewc_loss": 0.05686311051249504, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002561311121098697, "grad_norm": 6.723269462585449, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8664631843566895, "num_tokens": 419900493.0, "step": 11002 }, { "epoch": 1.399694695331383, "ewc_loss": 0.057163987308740616, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002566984621807933, "grad_norm": 6.730415344238281, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8807591199874878, "num_tokens": 419938917.0, "step": 11003 }, { "epoch": 1.3998219056099732, "ewc_loss": 0.056963447481393814, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002571344666648656, "grad_norm": 6.756814479827881, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8784421682357788, "num_tokens": 419978771.0, "step": 11004 }, { "epoch": 1.3999491158885637, "ewc_loss": 0.05715346336364746, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025659322272986174, "grad_norm": 6.771109580993652, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8614563345909119, "num_tokens": 420011836.0, "step": 11005 }, { "epoch": 1.4000763261671543, "ewc_loss": 0.05717252939939499, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025678391102701426, "grad_norm": 6.757670879364014, "learning_rate": 1e-06, "loss": 0.5323, "mean_token_accuracy": 0.8426069021224976, "num_tokens": 420051618.0, "step": 11006 }, { "epoch": 1.4002035364457448, "ewc_loss": 0.0569954514503479, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002574545214883983, "grad_norm": 6.879067420959473, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8499981164932251, "num_tokens": 420089119.0, "step": 11007 }, { "epoch": 1.4003307467243353, "ewc_loss": 0.0568719357252121, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025621935492381454, "grad_norm": 6.699027061462402, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8742727041244507, "num_tokens": 420123706.0, "step": 11008 }, { "epoch": 1.4004579570029259, "ewc_loss": 0.057010795921087265, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002576079568825662, "grad_norm": 6.84171724319458, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.860867977142334, "num_tokens": 420156723.0, "step": 11009 }, { "epoch": 1.4005851672815164, "ewc_loss": 0.05715179815888405, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002565765753388405, "grad_norm": 6.722238540649414, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8626822233200073, "num_tokens": 420198519.0, "step": 11010 }, { "epoch": 1.400712377560107, "ewc_loss": 0.05721765756607056, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002572351659182459, "grad_norm": 6.771810054779053, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8764920234680176, "num_tokens": 420231722.0, "step": 11011 }, { "epoch": 1.4008395878386974, "ewc_loss": 0.05691859871149063, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025668597663752735, "grad_norm": 6.7182512283325195, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8659154772758484, "num_tokens": 420272059.0, "step": 11012 }, { "epoch": 1.4009667981172877, "ewc_loss": 0.057027846574783325, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025777844712138176, "grad_norm": 6.772871971130371, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8710582852363586, "num_tokens": 420307695.0, "step": 11013 }, { "epoch": 1.4010940083958783, "ewc_loss": 0.05694273114204407, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025692733470350504, "grad_norm": 6.806042671203613, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.873059868812561, "num_tokens": 420345989.0, "step": 11014 }, { "epoch": 1.4012212186744688, "ewc_loss": 0.05697329342365265, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002572329540271312, "grad_norm": 6.702915191650391, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8698300123214722, "num_tokens": 420387194.0, "step": 11015 }, { "epoch": 1.4013484289530593, "ewc_loss": 0.057236574590206146, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002574243408162147, "grad_norm": 6.747039318084717, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8789284825325012, "num_tokens": 420427608.0, "step": 11016 }, { "epoch": 1.4014756392316499, "ewc_loss": 0.057237252593040466, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002574311220087111, "grad_norm": 6.747132301330566, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.869840145111084, "num_tokens": 420461363.0, "step": 11017 }, { "epoch": 1.4016028495102404, "ewc_loss": 0.05695779249072075, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002570779179222882, "grad_norm": 6.738361835479736, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8465173244476318, "num_tokens": 420503577.0, "step": 11018 }, { "epoch": 1.401730059788831, "ewc_loss": 0.05701938271522522, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002576938131824136, "grad_norm": 6.796050071716309, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8748100996017456, "num_tokens": 420538852.0, "step": 11019 }, { "epoch": 1.4018572700674214, "ewc_loss": 0.05696066468954086, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025710667250677943, "grad_norm": 6.748268127441406, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8615213632583618, "num_tokens": 420577495.0, "step": 11020 }, { "epoch": 1.401984480346012, "ewc_loss": 0.05697683244943619, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002572683442849666, "grad_norm": 6.698263168334961, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8637652397155762, "num_tokens": 420616123.0, "step": 11021 }, { "epoch": 1.4021116906246025, "ewc_loss": 0.05695926770567894, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025709267356432974, "grad_norm": 6.764138698577881, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8745971918106079, "num_tokens": 420653904.0, "step": 11022 }, { "epoch": 1.402238900903193, "ewc_loss": 0.05697001516819, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000257200124906376, "grad_norm": 6.699187755584717, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8888201713562012, "num_tokens": 420693539.0, "step": 11023 }, { "epoch": 1.4023661111817836, "ewc_loss": 0.0570201575756073, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002577015839051455, "grad_norm": 9.902734756469727, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8678145408630371, "num_tokens": 420729046.0, "step": 11024 }, { "epoch": 1.402493321460374, "ewc_loss": 0.060321126133203506, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002907112648244947, "grad_norm": 7.109673500061035, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8676754236221313, "num_tokens": 420766143.0, "step": 11025 }, { "epoch": 1.4026205317389646, "ewc_loss": 0.05688333883881569, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000256333383731544, "grad_norm": 6.840523719787598, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8603145480155945, "num_tokens": 420806367.0, "step": 11026 }, { "epoch": 1.4027477420175551, "ewc_loss": 0.0573081374168396, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002605813497211784, "grad_norm": 6.852875709533691, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8711757659912109, "num_tokens": 420845081.0, "step": 11027 }, { "epoch": 1.4028749522961454, "ewc_loss": 0.05738363415002823, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026133633218705654, "grad_norm": 6.8252058029174805, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8697724342346191, "num_tokens": 420885833.0, "step": 11028 }, { "epoch": 1.403002162574736, "ewc_loss": 0.05712603032588959, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002587603230495006, "grad_norm": 6.909233570098877, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8691422939300537, "num_tokens": 420916313.0, "step": 11029 }, { "epoch": 1.4031293728533265, "ewc_loss": 0.05699557811021805, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002574558020569384, "grad_norm": 6.760457515716553, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.846744179725647, "num_tokens": 420957011.0, "step": 11030 }, { "epoch": 1.403256583131917, "ewc_loss": 0.057138293981552124, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025888296659104526, "grad_norm": 6.828083038330078, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8679876327514648, "num_tokens": 420994815.0, "step": 11031 }, { "epoch": 1.4033837934105076, "ewc_loss": 0.057033367455005646, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002578336570877582, "grad_norm": 6.760961532592773, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8721293210983276, "num_tokens": 421033977.0, "step": 11032 }, { "epoch": 1.403511003689098, "ewc_loss": 0.05699719488620758, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025747192557901144, "grad_norm": 6.79874849319458, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8676043152809143, "num_tokens": 421068871.0, "step": 11033 }, { "epoch": 1.4036382139676886, "ewc_loss": 0.056966736912727356, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002571673539932817, "grad_norm": 6.759363651275635, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8648897409439087, "num_tokens": 421105680.0, "step": 11034 }, { "epoch": 1.4037654242462791, "ewc_loss": 0.05705799162387848, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025807993370108306, "grad_norm": 6.7723236083984375, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8765583038330078, "num_tokens": 421149519.0, "step": 11035 }, { "epoch": 1.4038926345248697, "ewc_loss": 0.05698912590742111, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002573912497609854, "grad_norm": 6.796003818511963, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8654571175575256, "num_tokens": 421190195.0, "step": 11036 }, { "epoch": 1.40401984480346, "ewc_loss": 0.0570015124976635, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025751511566340923, "grad_norm": 6.830587863922119, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8599562048912048, "num_tokens": 421218637.0, "step": 11037 }, { "epoch": 1.4041470550820505, "ewc_loss": 0.05697307363152504, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002572307421360165, "grad_norm": 6.784595489501953, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.869143009185791, "num_tokens": 421261300.0, "step": 11038 }, { "epoch": 1.404274265360641, "ewc_loss": 0.057028450071811676, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025778450071811676, "grad_norm": 6.765562534332275, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8609459400177002, "num_tokens": 421297250.0, "step": 11039 }, { "epoch": 1.4044014756392316, "ewc_loss": 0.05705083906650543, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002580083964858204, "grad_norm": 6.812836170196533, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8567049503326416, "num_tokens": 421334213.0, "step": 11040 }, { "epoch": 1.404528685917822, "ewc_loss": 0.0570194348692894, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002576943370513618, "grad_norm": 6.775456428527832, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8724108338356018, "num_tokens": 421368341.0, "step": 11041 }, { "epoch": 1.4046558961964126, "ewc_loss": 0.057010747492313385, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025760746211744845, "grad_norm": 6.791459083557129, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8521311283111572, "num_tokens": 421408278.0, "step": 11042 }, { "epoch": 1.4047831064750032, "ewc_loss": 0.05706787109375, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002581787121016532, "grad_norm": 6.792928218841553, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8554333448410034, "num_tokens": 421442559.0, "step": 11043 }, { "epoch": 1.4049103167535937, "ewc_loss": 0.057000890374183655, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002575088874436915, "grad_norm": 6.7696309089660645, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8617526292800903, "num_tokens": 421474014.0, "step": 11044 }, { "epoch": 1.4050375270321842, "ewc_loss": 0.05702110379934311, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002577110135462135, "grad_norm": 6.84014368057251, "learning_rate": 1e-06, "loss": 0.53, "mean_token_accuracy": 0.8394626379013062, "num_tokens": 421511193.0, "step": 11045 }, { "epoch": 1.4051647373107747, "ewc_loss": 0.057006314396858215, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002575631660874933, "grad_norm": 6.731829643249512, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.865805983543396, "num_tokens": 421547895.0, "step": 11046 }, { "epoch": 1.4052919475893653, "ewc_loss": 0.0570962056517601, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002584620378911495, "grad_norm": 6.84453010559082, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8630671501159668, "num_tokens": 421582883.0, "step": 11047 }, { "epoch": 1.4054191578679558, "ewc_loss": 0.05691712349653244, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002566712209954858, "grad_norm": 6.742465972900391, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8672115802764893, "num_tokens": 421619838.0, "step": 11048 }, { "epoch": 1.4055463681465463, "ewc_loss": 0.05714167281985283, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025891672703437507, "grad_norm": 6.811578750610352, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8745765686035156, "num_tokens": 421659451.0, "step": 11049 }, { "epoch": 1.4056735784251368, "ewc_loss": 0.056981660425662994, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025731659843586385, "grad_norm": 6.73998498916626, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8599169254302979, "num_tokens": 421704826.0, "step": 11050 }, { "epoch": 1.4058007887037274, "ewc_loss": 0.05712074786424637, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002587074995972216, "grad_norm": 6.820291996002197, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8594895601272583, "num_tokens": 421748953.0, "step": 11051 }, { "epoch": 1.405927998982318, "ewc_loss": 0.057053450495004654, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002580345026217401, "grad_norm": 6.772246360778809, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8462228775024414, "num_tokens": 421786614.0, "step": 11052 }, { "epoch": 1.4060552092609082, "ewc_loss": 0.057011500000953674, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025761500000953674, "grad_norm": 6.818757057189941, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8589885830879211, "num_tokens": 421817291.0, "step": 11053 }, { "epoch": 1.4061824195394987, "ewc_loss": 0.05703860521316528, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025788607308641076, "grad_norm": 6.803898334503174, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8620758056640625, "num_tokens": 421858200.0, "step": 11054 }, { "epoch": 1.4063096298180893, "ewc_loss": 0.05699589475989342, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025745894527062774, "grad_norm": 6.694034576416016, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8633796572685242, "num_tokens": 421903343.0, "step": 11055 }, { "epoch": 1.4064368400966798, "ewc_loss": 0.05710607022047043, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025856069987639785, "grad_norm": 6.7970452308654785, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8807034492492676, "num_tokens": 421941426.0, "step": 11056 }, { "epoch": 1.4065640503752703, "ewc_loss": 0.0570378415286541, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025787841877900064, "grad_norm": 6.708775043487549, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8612816333770752, "num_tokens": 421983992.0, "step": 11057 }, { "epoch": 1.4066912606538609, "ewc_loss": 0.057081546634435654, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002583154710009694, "grad_norm": 6.785489082336426, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8727773427963257, "num_tokens": 422015066.0, "step": 11058 }, { "epoch": 1.4068184709324514, "ewc_loss": 0.057084064930677414, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002583406458143145, "grad_norm": 6.759959697723389, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8671804666519165, "num_tokens": 422052392.0, "step": 11059 }, { "epoch": 1.406945681211042, "ewc_loss": 0.05713307857513428, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002588308125268668, "grad_norm": 6.74334716796875, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8607586622238159, "num_tokens": 422091453.0, "step": 11060 }, { "epoch": 1.4070728914896324, "ewc_loss": 0.057078804820775986, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025828805519267917, "grad_norm": 6.756113052368164, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8485103845596313, "num_tokens": 422131019.0, "step": 11061 }, { "epoch": 1.4072001017682227, "ewc_loss": 0.05712662264704704, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002587662311270833, "grad_norm": 6.727606296539307, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8573362827301025, "num_tokens": 422172139.0, "step": 11062 }, { "epoch": 1.4073273120468133, "ewc_loss": 0.057176776230335236, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025926774833351374, "grad_norm": 6.810033321380615, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.871609091758728, "num_tokens": 422209708.0, "step": 11063 }, { "epoch": 1.4074545223254038, "ewc_loss": 0.05727214738726616, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025778007693588734, "grad_norm": 6.721388816833496, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.865153431892395, "num_tokens": 422245091.0, "step": 11064 }, { "epoch": 1.4075817326039943, "ewc_loss": 0.05726690590381622, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026016903575509787, "grad_norm": 6.813069820404053, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8642569780349731, "num_tokens": 422280054.0, "step": 11065 }, { "epoch": 1.4077089428825849, "ewc_loss": 0.057029396295547485, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025779398856684566, "grad_norm": 6.738645553588867, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.871284544467926, "num_tokens": 422310036.0, "step": 11066 }, { "epoch": 1.4078361531611754, "ewc_loss": 0.05719904601573944, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002594904799479991, "grad_norm": 6.800167560577393, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8489408493041992, "num_tokens": 422347914.0, "step": 11067 }, { "epoch": 1.407963363439766, "ewc_loss": 0.05706057697534561, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002581057487986982, "grad_norm": 6.693732738494873, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8652847409248352, "num_tokens": 422383207.0, "step": 11068 }, { "epoch": 1.4080905737183564, "ewc_loss": 0.057206038385629654, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002595603873487562, "grad_norm": 6.799480438232422, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8678486347198486, "num_tokens": 422414675.0, "step": 11069 }, { "epoch": 1.408217783996947, "ewc_loss": 0.05735814571380615, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002586400369182229, "grad_norm": 6.7395429611206055, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8529502153396606, "num_tokens": 422449746.0, "step": 11070 }, { "epoch": 1.4083449942755375, "ewc_loss": 0.057361528277397156, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002586738846730441, "grad_norm": 6.73117733001709, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8574351072311401, "num_tokens": 422487789.0, "step": 11071 }, { "epoch": 1.408472204554128, "ewc_loss": 0.05741012096405029, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002591598022263497, "grad_norm": 6.7950639724731445, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8697682619094849, "num_tokens": 422521750.0, "step": 11072 }, { "epoch": 1.4085994148327186, "ewc_loss": 0.05739596113562584, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002590182120911777, "grad_norm": 6.695529460906982, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8729985356330872, "num_tokens": 422562923.0, "step": 11073 }, { "epoch": 1.408726625111309, "ewc_loss": 0.05727377161383629, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026023772079497576, "grad_norm": 6.721405982971191, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8548370599746704, "num_tokens": 422606832.0, "step": 11074 }, { "epoch": 1.4088538353898996, "ewc_loss": 0.05748565495014191, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002599151630420238, "grad_norm": 6.739181995391846, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8666352033615112, "num_tokens": 422644465.0, "step": 11075 }, { "epoch": 1.4089810456684901, "ewc_loss": 0.05751510336995125, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000260209635598585, "grad_norm": 6.724798679351807, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8721345663070679, "num_tokens": 422681204.0, "step": 11076 }, { "epoch": 1.4091082559470804, "ewc_loss": 0.05757937952876091, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000260852393694222, "grad_norm": 6.8225998878479, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8532363176345825, "num_tokens": 422716325.0, "step": 11077 }, { "epoch": 1.409235466225671, "ewc_loss": 0.057169340550899506, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002591933880466968, "grad_norm": 6.744091033935547, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8540402054786682, "num_tokens": 422753644.0, "step": 11078 }, { "epoch": 1.4093626765042615, "ewc_loss": 0.05749879777431488, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002600465959403664, "grad_norm": 6.823732376098633, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8649157285690308, "num_tokens": 422790893.0, "step": 11079 }, { "epoch": 1.409489886782852, "ewc_loss": 0.057125166058540344, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002587516501080245, "grad_norm": 6.734411716461182, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8560265302658081, "num_tokens": 422829400.0, "step": 11080 }, { "epoch": 1.4096170970614426, "ewc_loss": 0.057440776377916336, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002594663528725505, "grad_norm": 6.769714832305908, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8731592893600464, "num_tokens": 422867386.0, "step": 11081 }, { "epoch": 1.409744307340033, "ewc_loss": 0.05735526233911514, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002586111950222403, "grad_norm": 6.699865341186523, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8669677376747131, "num_tokens": 422910736.0, "step": 11082 }, { "epoch": 1.4098715176186236, "ewc_loss": 0.057429347187280655, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002593520621303469, "grad_norm": 6.755278587341309, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8674283027648926, "num_tokens": 422950302.0, "step": 11083 }, { "epoch": 1.4099987278972141, "ewc_loss": 0.05739803612232208, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002590389340184629, "grad_norm": 6.756795883178711, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8630084991455078, "num_tokens": 422985892.0, "step": 11084 }, { "epoch": 1.4101259381758047, "ewc_loss": 0.057127077132463455, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025877077132463455, "grad_norm": 6.805028438568115, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.855849027633667, "num_tokens": 423020075.0, "step": 11085 }, { "epoch": 1.410253148454395, "ewc_loss": 0.05709003657102585, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002584003668744117, "grad_norm": 6.761138439178467, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8663510084152222, "num_tokens": 423058260.0, "step": 11086 }, { "epoch": 1.4103803587329855, "ewc_loss": 0.05710993707180023, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025859937886707485, "grad_norm": 6.723028659820557, "learning_rate": 1e-06, "loss": 0.3794, "mean_token_accuracy": 0.8873029947280884, "num_tokens": 423092158.0, "step": 11087 }, { "epoch": 1.410507569011576, "ewc_loss": 0.057120442390441895, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025870444369502366, "grad_norm": 6.824869155883789, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8613759279251099, "num_tokens": 423123999.0, "step": 11088 }, { "epoch": 1.4106347792901666, "ewc_loss": 0.05706227943301201, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025812280364334583, "grad_norm": 6.691636562347412, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8600484132766724, "num_tokens": 423167111.0, "step": 11089 }, { "epoch": 1.410761989568757, "ewc_loss": 0.05714406073093414, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025894062127918005, "grad_norm": 6.8516340255737305, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8563585877418518, "num_tokens": 423202060.0, "step": 11090 }, { "epoch": 1.4108891998473476, "ewc_loss": 0.057085808366537094, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025835807900875807, "grad_norm": 6.690972805023193, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.856194257736206, "num_tokens": 423241200.0, "step": 11091 }, { "epoch": 1.4110164101259381, "ewc_loss": 0.05712892860174179, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002587893104646355, "grad_norm": 6.798488616943359, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8732603192329407, "num_tokens": 423284713.0, "step": 11092 }, { "epoch": 1.4111436204045287, "ewc_loss": 0.05710091441869736, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002585091278888285, "grad_norm": 6.732137203216553, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8666384220123291, "num_tokens": 423322524.0, "step": 11093 }, { "epoch": 1.4112708306831192, "ewc_loss": 0.057166632264852524, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000259166321484372, "grad_norm": 6.7910895347595215, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8648654818534851, "num_tokens": 423360551.0, "step": 11094 }, { "epoch": 1.4113980409617097, "ewc_loss": 0.05709652975201607, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002584652975201607, "grad_norm": 6.7250590324401855, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8744879364967346, "num_tokens": 423395227.0, "step": 11095 }, { "epoch": 1.4115252512403003, "ewc_loss": 0.05737999081611633, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025885849026963115, "grad_norm": 6.843650817871094, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8505324125289917, "num_tokens": 423434757.0, "step": 11096 }, { "epoch": 1.4116524615188908, "ewc_loss": 0.057046130299568176, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002579613064881414, "grad_norm": 6.77955961227417, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8819575905799866, "num_tokens": 423464599.0, "step": 11097 }, { "epoch": 1.4117796717974813, "ewc_loss": 0.05734699219465256, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025852854014374316, "grad_norm": 6.803217887878418, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8566889762878418, "num_tokens": 423504327.0, "step": 11098 }, { "epoch": 1.4119068820760718, "ewc_loss": 0.05708151310682297, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025831512175500393, "grad_norm": 6.777652740478516, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8459962606430054, "num_tokens": 423543114.0, "step": 11099 }, { "epoch": 1.4120340923546624, "ewc_loss": 0.05724475532770157, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002575061225797981, "grad_norm": 6.823143482208252, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8516707420349121, "num_tokens": 423581727.0, "step": 11100 }, { "epoch": 1.412161302633253, "ewc_loss": 0.05720242112874985, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025708277826197445, "grad_norm": 6.778350830078125, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8693742156028748, "num_tokens": 423617073.0, "step": 11101 }, { "epoch": 1.4122885129118432, "ewc_loss": 0.05727914720773697, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002578500716481358, "grad_norm": 6.737522125244141, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8691578507423401, "num_tokens": 423664922.0, "step": 11102 }, { "epoch": 1.4124157231904337, "ewc_loss": 0.05717968940734863, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025685547734610736, "grad_norm": 6.716849327087402, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8553170561790466, "num_tokens": 423706990.0, "step": 11103 }, { "epoch": 1.4125429334690243, "ewc_loss": 0.05725681781768799, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025762675795704126, "grad_norm": 6.832996368408203, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8473426699638367, "num_tokens": 423745945.0, "step": 11104 }, { "epoch": 1.4126701437476148, "ewc_loss": 0.05713382735848427, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025639685918577015, "grad_norm": 6.708442687988281, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8693937063217163, "num_tokens": 423780824.0, "step": 11105 }, { "epoch": 1.4127973540262053, "ewc_loss": 0.0572659969329834, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025771858054213226, "grad_norm": 6.74877405166626, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8779429197311401, "num_tokens": 423818640.0, "step": 11106 }, { "epoch": 1.4129245643047958, "ewc_loss": 0.05716889351606369, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025674753123894334, "grad_norm": 6.735939979553223, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8724170923233032, "num_tokens": 423856657.0, "step": 11107 }, { "epoch": 1.4130517745833864, "ewc_loss": 0.05727984011173248, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002578569983597845, "grad_norm": 6.728762149810791, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8758056163787842, "num_tokens": 423900288.0, "step": 11108 }, { "epoch": 1.413178984861977, "ewc_loss": 0.05729667842388153, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002580253640189767, "grad_norm": 6.773439407348633, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.870140790939331, "num_tokens": 423935929.0, "step": 11109 }, { "epoch": 1.4133061951405674, "ewc_loss": 0.05704864487051964, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025798645219765604, "grad_norm": 6.763045310974121, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8624991774559021, "num_tokens": 423970719.0, "step": 11110 }, { "epoch": 1.4134334054191577, "ewc_loss": 0.057159438729286194, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000259094376815483, "grad_norm": 6.7804107666015625, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8644986748695374, "num_tokens": 424012338.0, "step": 11111 }, { "epoch": 1.4135606156977483, "ewc_loss": 0.05712868645787239, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002587868657428771, "grad_norm": 6.763383865356445, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.8467714786529541, "num_tokens": 424050112.0, "step": 11112 }, { "epoch": 1.4136878259763388, "ewc_loss": 0.0570526048541069, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002580260334070772, "grad_norm": 6.705069541931152, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8637745976448059, "num_tokens": 424092688.0, "step": 11113 }, { "epoch": 1.4138150362549293, "ewc_loss": 0.05721556395292282, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002596556441858411, "grad_norm": 6.779071807861328, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8628391623497009, "num_tokens": 424131355.0, "step": 11114 }, { "epoch": 1.4139422465335199, "ewc_loss": 0.05711032450199127, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025860322057269514, "grad_norm": 6.714526653289795, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8531485199928284, "num_tokens": 424177768.0, "step": 11115 }, { "epoch": 1.4140694568121104, "ewc_loss": 0.057266779243946075, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002601677842903882, "grad_norm": 6.778970718383789, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8772623538970947, "num_tokens": 424215834.0, "step": 11116 }, { "epoch": 1.414196667090701, "ewc_loss": 0.057104554027318954, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002585455367807299, "grad_norm": 6.713486194610596, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8679153919219971, "num_tokens": 424260002.0, "step": 11117 }, { "epoch": 1.4143238773692914, "ewc_loss": 0.05727144330739975, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026021443773061037, "grad_norm": 6.777843952178955, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8621763586997986, "num_tokens": 424299343.0, "step": 11118 }, { "epoch": 1.414451087647882, "ewc_loss": 0.05714196711778641, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025891969562508166, "grad_norm": 6.740828514099121, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8584443926811218, "num_tokens": 424340312.0, "step": 11119 }, { "epoch": 1.4145782979264725, "ewc_loss": 0.05723626911640167, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002598626888357103, "grad_norm": 6.797337055206299, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8735457062721252, "num_tokens": 424376658.0, "step": 11120 }, { "epoch": 1.414705508205063, "ewc_loss": 0.05716598033905029, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002591598022263497, "grad_norm": 6.775949954986572, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8748536705970764, "num_tokens": 424415499.0, "step": 11121 }, { "epoch": 1.4148327184836536, "ewc_loss": 0.057168491184711456, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002591848897282034, "grad_norm": 6.727908134460449, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8650774359703064, "num_tokens": 424453401.0, "step": 11122 }, { "epoch": 1.414959928762244, "ewc_loss": 0.057285912334918976, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002603591128718108, "grad_norm": 6.80825662612915, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.867620587348938, "num_tokens": 424490654.0, "step": 11123 }, { "epoch": 1.4150871390408346, "ewc_loss": 0.0571611151099205, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025911114062182605, "grad_norm": 6.734570026397705, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8738799095153809, "num_tokens": 424534469.0, "step": 11124 }, { "epoch": 1.4152143493194251, "ewc_loss": 0.05726422369480133, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026014226023107767, "grad_norm": 6.832037925720215, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8553727269172668, "num_tokens": 424570187.0, "step": 11125 }, { "epoch": 1.4153415595980154, "ewc_loss": 0.057203225791454315, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025953224394470453, "grad_norm": 6.822846412658691, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8441245555877686, "num_tokens": 424607898.0, "step": 11126 }, { "epoch": 1.415468769876606, "ewc_loss": 0.057172954082489014, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002592295641079545, "grad_norm": 6.72619104385376, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8667601346969604, "num_tokens": 424652156.0, "step": 11127 }, { "epoch": 1.4155959801551965, "ewc_loss": 0.057251058518886566, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002600105945020914, "grad_norm": 6.84055757522583, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8551244139671326, "num_tokens": 424688735.0, "step": 11128 }, { "epoch": 1.415723190433787, "ewc_loss": 0.05711548030376434, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002586548216640949, "grad_norm": 6.7007575035095215, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8656700849533081, "num_tokens": 424732859.0, "step": 11129 }, { "epoch": 1.4158504007123776, "ewc_loss": 0.05730890855193138, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002605890913400799, "grad_norm": 6.9002685546875, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8565964698791504, "num_tokens": 424769381.0, "step": 11130 }, { "epoch": 1.415977610990968, "ewc_loss": 0.05702213943004608, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025772140361368656, "grad_norm": 6.711559772491455, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.864585280418396, "num_tokens": 424802362.0, "step": 11131 }, { "epoch": 1.4161048212695586, "ewc_loss": 0.05737137421965599, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002612137468531728, "grad_norm": 6.872584342956543, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8608250617980957, "num_tokens": 424837897.0, "step": 11132 }, { "epoch": 1.4162320315481491, "ewc_loss": 0.05714503675699234, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002589503419585526, "grad_norm": 6.6882219314575195, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8746141195297241, "num_tokens": 424878154.0, "step": 11133 }, { "epoch": 1.4163592418267397, "ewc_loss": 0.057348381727933884, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002609838265925646, "grad_norm": 6.793804168701172, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8694842457771301, "num_tokens": 424915063.0, "step": 11134 }, { "epoch": 1.41648645210533, "ewc_loss": 0.05725126713514328, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002600126899778843, "grad_norm": 6.721220970153809, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8634734153747559, "num_tokens": 424955159.0, "step": 11135 }, { "epoch": 1.4166136623839205, "ewc_loss": 0.05739927291870117, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026149273617193103, "grad_norm": 6.831310272216797, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.857187032699585, "num_tokens": 424992573.0, "step": 11136 }, { "epoch": 1.416740872662511, "ewc_loss": 0.05724547058343887, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000259954686043784, "grad_norm": 6.7573699951171875, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8688693046569824, "num_tokens": 425030452.0, "step": 11137 }, { "epoch": 1.4168680829411016, "ewc_loss": 0.05734448879957199, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026094491477124393, "grad_norm": 6.792950630187988, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8783137798309326, "num_tokens": 425067118.0, "step": 11138 }, { "epoch": 1.416995293219692, "ewc_loss": 0.05736295133829117, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000261129520367831, "grad_norm": 6.800607204437256, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8533612489700317, "num_tokens": 425101684.0, "step": 11139 }, { "epoch": 1.4171225034982826, "ewc_loss": 0.057322997599840164, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026072998298332095, "grad_norm": 6.760550498962402, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8727385997772217, "num_tokens": 425143127.0, "step": 11140 }, { "epoch": 1.4172497137768731, "ewc_loss": 0.057397227734327316, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026147227617911994, "grad_norm": 6.8652567863464355, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8641616106033325, "num_tokens": 425173912.0, "step": 11141 }, { "epoch": 1.4173769240554637, "ewc_loss": 0.057191893458366394, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000259418913628906, "grad_norm": 6.716507434844971, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8695263862609863, "num_tokens": 425210146.0, "step": 11142 }, { "epoch": 1.4175041343340542, "ewc_loss": 0.0573926717042923, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002614266995806247, "grad_norm": 6.788243770599365, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8766752481460571, "num_tokens": 425246082.0, "step": 11143 }, { "epoch": 1.4176313446126447, "ewc_loss": 0.057195357978343964, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025945357629098, "grad_norm": 6.732110023498535, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8613064289093018, "num_tokens": 425287470.0, "step": 11144 }, { "epoch": 1.4177585548912353, "ewc_loss": 0.057372596114873886, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002612259704619646, "grad_norm": 6.800504207611084, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8602695465087891, "num_tokens": 425326553.0, "step": 11145 }, { "epoch": 1.4178857651698258, "ewc_loss": 0.0571918860077858, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025941888452507555, "grad_norm": 6.766838073730469, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8575074672698975, "num_tokens": 425363105.0, "step": 11146 }, { "epoch": 1.4180129754484163, "ewc_loss": 0.05733799934387207, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026087998412549496, "grad_norm": 6.790191650390625, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8584281206130981, "num_tokens": 425402513.0, "step": 11147 }, { "epoch": 1.4181401857270068, "ewc_loss": 0.0572764091193676, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026026408886536956, "grad_norm": 6.718067646026611, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8646178245544434, "num_tokens": 425440132.0, "step": 11148 }, { "epoch": 1.4182673960055974, "ewc_loss": 0.05761982500553131, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002612568496260792, "grad_norm": 6.771439075469971, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8630757331848145, "num_tokens": 425481738.0, "step": 11149 }, { "epoch": 1.418394606284188, "ewc_loss": 0.05766056850552559, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000261664274148643, "grad_norm": 6.929798603057861, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8626158237457275, "num_tokens": 425516916.0, "step": 11150 }, { "epoch": 1.4185218165627782, "ewc_loss": 0.057493072003126144, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025998931960202754, "grad_norm": 6.741568088531494, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8635051846504211, "num_tokens": 425555951.0, "step": 11151 }, { "epoch": 1.4186490268413687, "ewc_loss": 0.05762898921966553, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026134849758818746, "grad_norm": 6.76756477355957, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.864298939704895, "num_tokens": 425595302.0, "step": 11152 }, { "epoch": 1.4187762371199593, "ewc_loss": 0.057502396404743195, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002600825682748109, "grad_norm": 6.770476341247559, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.867341160774231, "num_tokens": 425623281.0, "step": 11153 }, { "epoch": 1.4189034473985498, "ewc_loss": 0.05753768980503082, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002604355104267597, "grad_norm": 6.77454948425293, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8732649087905884, "num_tokens": 425659124.0, "step": 11154 }, { "epoch": 1.4190306576771403, "ewc_loss": 0.05758248269557953, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026088341837748885, "grad_norm": 6.803893566131592, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8514750003814697, "num_tokens": 425695809.0, "step": 11155 }, { "epoch": 1.4191578679557308, "ewc_loss": 0.05757291615009308, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026078775408677757, "grad_norm": 6.824880123138428, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8547745943069458, "num_tokens": 425731890.0, "step": 11156 }, { "epoch": 1.4192850782343214, "ewc_loss": 0.05748273804783821, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002598859719000757, "grad_norm": 6.746007442474365, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.872433602809906, "num_tokens": 425770899.0, "step": 11157 }, { "epoch": 1.419412288512912, "ewc_loss": 0.05751252919435501, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002601838787086308, "grad_norm": 6.7983527183532715, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8761128187179565, "num_tokens": 425807812.0, "step": 11158 }, { "epoch": 1.4195394987915024, "ewc_loss": 0.057490039616823196, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002599589934106916, "grad_norm": 6.748098373413086, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8600043058395386, "num_tokens": 425848997.0, "step": 11159 }, { "epoch": 1.4196667090700927, "ewc_loss": 0.05761747062206268, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026123327552340925, "grad_norm": 6.807753086090088, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8740555047988892, "num_tokens": 425887816.0, "step": 11160 }, { "epoch": 1.4197939193486833, "ewc_loss": 0.05745445564389229, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025960314087569714, "grad_norm": 6.776394844055176, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8550257086753845, "num_tokens": 425925488.0, "step": 11161 }, { "epoch": 1.4199211296272738, "ewc_loss": 0.05751737952232361, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002602323656901717, "grad_norm": 6.7751593589782715, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8711076974868774, "num_tokens": 425965616.0, "step": 11162 }, { "epoch": 1.4200483399058643, "ewc_loss": 0.05755045264959335, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026056310161948204, "grad_norm": 6.759263515472412, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8578486442565918, "num_tokens": 426005308.0, "step": 11163 }, { "epoch": 1.4201755501844548, "ewc_loss": 0.05755982547998428, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002606568450573832, "grad_norm": 6.839533805847168, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8607021570205688, "num_tokens": 426042339.0, "step": 11164 }, { "epoch": 1.4203027604630454, "ewc_loss": 0.05745863914489746, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002596449921838939, "grad_norm": 6.7609992027282715, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8639783263206482, "num_tokens": 426081925.0, "step": 11165 }, { "epoch": 1.420429970741636, "ewc_loss": 0.05728224664926529, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002603224420454353, "grad_norm": 6.801672458648682, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8587461113929749, "num_tokens": 426120918.0, "step": 11166 }, { "epoch": 1.4205571810202264, "ewc_loss": 0.05744699016213417, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002595284895505756, "grad_norm": 6.758142471313477, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8538405299186707, "num_tokens": 426158711.0, "step": 11167 }, { "epoch": 1.420684391298817, "ewc_loss": 0.05738108605146408, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002613108663354069, "grad_norm": 6.839305400848389, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8591679334640503, "num_tokens": 426197034.0, "step": 11168 }, { "epoch": 1.4208116015774075, "ewc_loss": 0.05726333335042, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026013332535512745, "grad_norm": 6.750481605529785, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8573674559593201, "num_tokens": 426239390.0, "step": 11169 }, { "epoch": 1.420938811855998, "ewc_loss": 0.057272814214229584, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026022811653092504, "grad_norm": 6.817030429840088, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8533313274383545, "num_tokens": 426285488.0, "step": 11170 }, { "epoch": 1.4210660221345885, "ewc_loss": 0.057230159640312195, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025980157079175115, "grad_norm": 6.841397762298584, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.851962685585022, "num_tokens": 426323955.0, "step": 11171 }, { "epoch": 1.421193232413179, "ewc_loss": 0.057206664234399796, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002595666446723044, "grad_norm": 6.761934757232666, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.884285569190979, "num_tokens": 426359460.0, "step": 11172 }, { "epoch": 1.4213204426917696, "ewc_loss": 0.05725941061973572, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026009412249550223, "grad_norm": 6.842893123626709, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.855941653251648, "num_tokens": 426397104.0, "step": 11173 }, { "epoch": 1.4214476529703601, "ewc_loss": 0.057128675282001495, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025878677843138576, "grad_norm": 6.711876392364502, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.87232905626297, "num_tokens": 426441349.0, "step": 11174 }, { "epoch": 1.4215748632489504, "ewc_loss": 0.05724138766527176, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002599138824734837, "grad_norm": 6.802359104156494, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8729028105735779, "num_tokens": 426478410.0, "step": 11175 }, { "epoch": 1.421702073527541, "ewc_loss": 0.057153426110744476, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002590342774055898, "grad_norm": 6.7932891845703125, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8635478019714355, "num_tokens": 426519874.0, "step": 11176 }, { "epoch": 1.4218292838061315, "ewc_loss": 0.05721290409564972, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025962901418097317, "grad_norm": 6.74805212020874, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8520790338516235, "num_tokens": 426559970.0, "step": 11177 }, { "epoch": 1.421956494084722, "ewc_loss": 0.05721127241849899, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025961274513974786, "grad_norm": 6.850555419921875, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8651216626167297, "num_tokens": 426593897.0, "step": 11178 }, { "epoch": 1.4220837043633126, "ewc_loss": 0.05711694806814194, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025866946089081466, "grad_norm": 6.737877368927002, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8635923862457275, "num_tokens": 426633712.0, "step": 11179 }, { "epoch": 1.422210914641903, "ewc_loss": 0.057323455810546875, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026073455228470266, "grad_norm": 6.802987098693848, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8786584734916687, "num_tokens": 426669085.0, "step": 11180 }, { "epoch": 1.4223381249204936, "ewc_loss": 0.057182617485523224, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025932618882507086, "grad_norm": 6.710067272186279, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.87827467918396, "num_tokens": 426709260.0, "step": 11181 }, { "epoch": 1.4224653351990841, "ewc_loss": 0.0573444664478302, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002609446528367698, "grad_norm": 6.825096130371094, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8533009886741638, "num_tokens": 426747747.0, "step": 11182 }, { "epoch": 1.4225925454776747, "ewc_loss": 0.0572592057287693, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002600920561235398, "grad_norm": 6.731904029846191, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8508831858634949, "num_tokens": 426788185.0, "step": 11183 }, { "epoch": 1.422719755756265, "ewc_loss": 0.05742017179727554, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026170173077844083, "grad_norm": 6.859760761260986, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8651481866836548, "num_tokens": 426828663.0, "step": 11184 }, { "epoch": 1.4228469660348555, "ewc_loss": 0.057233504951000214, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002598350401967764, "grad_norm": 6.728466987609863, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8533741235733032, "num_tokens": 426869205.0, "step": 11185 }, { "epoch": 1.422974176313446, "ewc_loss": 0.057402193546295166, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026152192731387913, "grad_norm": 6.818723678588867, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8793565034866333, "num_tokens": 426900561.0, "step": 11186 }, { "epoch": 1.4231013865920366, "ewc_loss": 0.05732918903231621, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002607918868307024, "grad_norm": 6.76431941986084, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8787267804145813, "num_tokens": 426940618.0, "step": 11187 }, { "epoch": 1.423228596870627, "ewc_loss": 0.05735161155462265, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002610161027405411, "grad_norm": 6.868401050567627, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8553034067153931, "num_tokens": 426975509.0, "step": 11188 }, { "epoch": 1.4233558071492176, "ewc_loss": 0.0573161244392395, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002606612688396126, "grad_norm": 6.793630599975586, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8735259771347046, "num_tokens": 427011647.0, "step": 11189 }, { "epoch": 1.4234830174278081, "ewc_loss": 0.05731630325317383, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026066301506944, "grad_norm": 6.853250980377197, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8612145185470581, "num_tokens": 427046631.0, "step": 11190 }, { "epoch": 1.4236102277063987, "ewc_loss": 0.057502299547195435, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026008160784840584, "grad_norm": 6.787205696105957, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.861530065536499, "num_tokens": 427081364.0, "step": 11191 }, { "epoch": 1.4237374379849892, "ewc_loss": 0.057250555604696274, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002600055595394224, "grad_norm": 6.796856880187988, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8661493062973022, "num_tokens": 427118667.0, "step": 11192 }, { "epoch": 1.4238646482635797, "ewc_loss": 0.05730343982577324, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002605344052426517, "grad_norm": 6.765522480010986, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.868746280670166, "num_tokens": 427155017.0, "step": 11193 }, { "epoch": 1.4239918585421703, "ewc_loss": 0.057380519807338715, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026130519108846784, "grad_norm": 6.846736907958984, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8570759296417236, "num_tokens": 427191741.0, "step": 11194 }, { "epoch": 1.4241190688207608, "ewc_loss": 0.057255856692790985, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002600585576146841, "grad_norm": 6.794167518615723, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8679454326629639, "num_tokens": 427226210.0, "step": 11195 }, { "epoch": 1.4242462790993513, "ewc_loss": 0.05739133805036545, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026141339913010597, "grad_norm": 6.817361354827881, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8613172769546509, "num_tokens": 427261699.0, "step": 11196 }, { "epoch": 1.4243734893779418, "ewc_loss": 0.057318948209285736, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002606894704513252, "grad_norm": 6.806547164916992, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8549848794937134, "num_tokens": 427297158.0, "step": 11197 }, { "epoch": 1.4245006996565324, "ewc_loss": 0.057340458035469055, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002609045768622309, "grad_norm": 6.792141914367676, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8481997847557068, "num_tokens": 427338234.0, "step": 11198 }, { "epoch": 1.424627909935123, "ewc_loss": 0.05733225494623184, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002608225622680038, "grad_norm": 6.780983924865723, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8580183982849121, "num_tokens": 427376727.0, "step": 11199 }, { "epoch": 1.4247551202137132, "ewc_loss": 0.05733771249651909, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002608771319501102, "grad_norm": 6.839101791381836, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8765401840209961, "num_tokens": 427417095.0, "step": 11200 }, { "epoch": 1.4248823304923037, "ewc_loss": 0.05728910490870476, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026039103977382183, "grad_norm": 6.772007465362549, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8472171425819397, "num_tokens": 427453243.0, "step": 11201 }, { "epoch": 1.4250095407708943, "ewc_loss": 0.05740029364824295, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002615029225125909, "grad_norm": 6.8546648025512695, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8661943078041077, "num_tokens": 427490961.0, "step": 11202 }, { "epoch": 1.4251367510494848, "ewc_loss": 0.05721646919846535, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025966469547711313, "grad_norm": 6.776880741119385, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8747960925102234, "num_tokens": 427521487.0, "step": 11203 }, { "epoch": 1.4252639613280753, "ewc_loss": 0.05732233077287674, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026072331820614636, "grad_norm": 6.787289142608643, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8611755967140198, "num_tokens": 427556607.0, "step": 11204 }, { "epoch": 1.4253911716066658, "ewc_loss": 0.05731603503227234, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026066036662086844, "grad_norm": 6.764571189880371, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8645135760307312, "num_tokens": 427598940.0, "step": 11205 }, { "epoch": 1.4255183818852564, "ewc_loss": 0.057567156851291656, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026073018671013415, "grad_norm": 6.8025054931640625, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8531174659729004, "num_tokens": 427641337.0, "step": 11206 }, { "epoch": 1.425645592163847, "ewc_loss": 0.05728432163596153, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002603432221803814, "grad_norm": 6.771923065185547, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8760059475898743, "num_tokens": 427680235.0, "step": 11207 }, { "epoch": 1.4257728024424374, "ewc_loss": 0.057283684611320496, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026033681933768094, "grad_norm": 6.7633490562438965, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8650943040847778, "num_tokens": 427719918.0, "step": 11208 }, { "epoch": 1.4259000127210277, "ewc_loss": 0.057263098657131195, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026013096794486046, "grad_norm": 6.7784271240234375, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8661193251609802, "num_tokens": 427758042.0, "step": 11209 }, { "epoch": 1.4260272229996183, "ewc_loss": 0.057295870035886765, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026045870617963374, "grad_norm": 6.814520359039307, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8661272525787354, "num_tokens": 427794591.0, "step": 11210 }, { "epoch": 1.4261544332782088, "ewc_loss": 0.05752337723970413, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026029234868474305, "grad_norm": 6.858842372894287, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.871697187423706, "num_tokens": 427828190.0, "step": 11211 }, { "epoch": 1.4262816435567993, "ewc_loss": 0.05715896189212799, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025908960378728807, "grad_norm": 6.748702526092529, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8571892976760864, "num_tokens": 427870363.0, "step": 11212 }, { "epoch": 1.4264088538353898, "ewc_loss": 0.05736032873392105, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002611032978165895, "grad_norm": 6.853132724761963, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8790862560272217, "num_tokens": 427910037.0, "step": 11213 }, { "epoch": 1.4265360641139804, "ewc_loss": 0.05744440108537674, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002595025871414691, "grad_norm": 6.791388988494873, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8637365102767944, "num_tokens": 427945782.0, "step": 11214 }, { "epoch": 1.426663274392571, "ewc_loss": 0.05738230049610138, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002613230317365378, "grad_norm": 6.825815200805664, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.871188759803772, "num_tokens": 427984635.0, "step": 11215 }, { "epoch": 1.4267904846711614, "ewc_loss": 0.057185783982276917, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002593578537926078, "grad_norm": 6.731132984161377, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8798526525497437, "num_tokens": 428024289.0, "step": 11216 }, { "epoch": 1.426917694949752, "ewc_loss": 0.05741170421242714, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026161703863181174, "grad_norm": 6.80861234664917, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8560694456100464, "num_tokens": 428067119.0, "step": 11217 }, { "epoch": 1.4270449052283425, "ewc_loss": 0.0572558268904686, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026005829568021, "grad_norm": 6.803339004516602, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8623256087303162, "num_tokens": 428108150.0, "step": 11218 }, { "epoch": 1.427172115506933, "ewc_loss": 0.05728800967335701, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002603800967335701, "grad_norm": 6.754122734069824, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8745805621147156, "num_tokens": 428150839.0, "step": 11219 }, { "epoch": 1.4272993257855235, "ewc_loss": 0.057323772460222244, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026073772460222244, "grad_norm": 6.7819743156433105, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8520200848579407, "num_tokens": 428194325.0, "step": 11220 }, { "epoch": 1.427426536064114, "ewc_loss": 0.057329460978507996, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002607945934869349, "grad_norm": 6.793970108032227, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8627628087997437, "num_tokens": 428230879.0, "step": 11221 }, { "epoch": 1.4275537463427046, "ewc_loss": 0.057327352464199066, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002607735223136842, "grad_norm": 6.795617580413818, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8762524127960205, "num_tokens": 428265795.0, "step": 11222 }, { "epoch": 1.4276809566212951, "ewc_loss": 0.05724703520536423, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025997034390456975, "grad_norm": 6.809223175048828, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8699381351470947, "num_tokens": 428300413.0, "step": 11223 }, { "epoch": 1.4278081668998854, "ewc_loss": 0.05734638124704361, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026096380315721035, "grad_norm": 6.790075302124023, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8864394426345825, "num_tokens": 428339293.0, "step": 11224 }, { "epoch": 1.427935377178476, "ewc_loss": 0.05725234001874924, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026002340018749237, "grad_norm": 6.773488521575928, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8780144453048706, "num_tokens": 428374158.0, "step": 11225 }, { "epoch": 1.4280625874570665, "ewc_loss": 0.057373661547899246, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026123662246391177, "grad_norm": 6.812231063842773, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8652139902114868, "num_tokens": 428411715.0, "step": 11226 }, { "epoch": 1.428189797735657, "ewc_loss": 0.057317472994327545, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026067474391311407, "grad_norm": 6.790346622467041, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8526674509048462, "num_tokens": 428450000.0, "step": 11227 }, { "epoch": 1.4283170080142475, "ewc_loss": 0.05740443617105484, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002615443372633308, "grad_norm": 6.810058116912842, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8710848093032837, "num_tokens": 428484546.0, "step": 11228 }, { "epoch": 1.428444218292838, "ewc_loss": 0.05737156420946121, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002612156677059829, "grad_norm": 6.784593105316162, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8651870489120483, "num_tokens": 428527519.0, "step": 11229 }, { "epoch": 1.4285714285714286, "ewc_loss": 0.05739535018801689, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026145350420847535, "grad_norm": 6.797473430633545, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8602899312973022, "num_tokens": 428568744.0, "step": 11230 }, { "epoch": 1.4286986388500191, "ewc_loss": 0.057376496493816376, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002612649404909462, "grad_norm": 6.774325370788574, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8606753349304199, "num_tokens": 428613050.0, "step": 11231 }, { "epoch": 1.4288258491286097, "ewc_loss": 0.05745299905538559, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026202999288216233, "grad_norm": 6.827606678009033, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8618826270103455, "num_tokens": 428647334.0, "step": 11232 }, { "epoch": 1.4289530594072, "ewc_loss": 0.05741621553897858, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002616621495690197, "grad_norm": 6.8206562995910645, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8718514442443848, "num_tokens": 428681015.0, "step": 11233 }, { "epoch": 1.4290802696857905, "ewc_loss": 0.05745901167392731, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026209012139588594, "grad_norm": 6.835320949554443, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8574622869491577, "num_tokens": 428718903.0, "step": 11234 }, { "epoch": 1.429207479964381, "ewc_loss": 0.057407405227422714, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026157405227422714, "grad_norm": 6.789988040924072, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.874327540397644, "num_tokens": 428756038.0, "step": 11235 }, { "epoch": 1.4293346902429716, "ewc_loss": 0.05746851861476898, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026218517450615764, "grad_norm": 6.78665828704834, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8814722299575806, "num_tokens": 428793908.0, "step": 11236 }, { "epoch": 1.429461900521562, "ewc_loss": 0.05741983652114868, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002616983838379383, "grad_norm": 6.809353828430176, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8577507734298706, "num_tokens": 428830924.0, "step": 11237 }, { "epoch": 1.4295891108001526, "ewc_loss": 0.05738483741879463, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026134838117286563, "grad_norm": 6.775994777679443, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8624666929244995, "num_tokens": 428869083.0, "step": 11238 }, { "epoch": 1.4297163210787431, "ewc_loss": 0.05748629570007324, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026236294070258737, "grad_norm": 6.908926010131836, "learning_rate": 1e-06, "loss": 0.5303, "mean_token_accuracy": 0.8402465581893921, "num_tokens": 428903970.0, "step": 11239 }, { "epoch": 1.4298435313573337, "ewc_loss": 0.05736802518367767, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026118027744814754, "grad_norm": 6.761258125305176, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.86794513463974, "num_tokens": 428940710.0, "step": 11240 }, { "epoch": 1.4299707416359242, "ewc_loss": 0.05751296132802963, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026262958999723196, "grad_norm": 6.830544471740723, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.867895245552063, "num_tokens": 428981160.0, "step": 11241 }, { "epoch": 1.4300979519145147, "ewc_loss": 0.05738730728626251, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026137306122109294, "grad_norm": 6.769617080688477, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8658047914505005, "num_tokens": 429022464.0, "step": 11242 }, { "epoch": 1.4302251621931052, "ewc_loss": 0.057574912905693054, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026324912323616445, "grad_norm": 6.8244781494140625, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8647265434265137, "num_tokens": 429060964.0, "step": 11243 }, { "epoch": 1.4303523724716958, "ewc_loss": 0.05741900950670242, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026169008924625814, "grad_norm": 6.778871536254883, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8659976124763489, "num_tokens": 429101389.0, "step": 11244 }, { "epoch": 1.4304795827502863, "ewc_loss": 0.0576019324362278, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026351932319812477, "grad_norm": 6.910743236541748, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8614811897277832, "num_tokens": 429135547.0, "step": 11245 }, { "epoch": 1.4306067930288768, "ewc_loss": 0.05743765830993652, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002618765865918249, "grad_norm": 6.790659427642822, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8671625256538391, "num_tokens": 429177504.0, "step": 11246 }, { "epoch": 1.4307340033074674, "ewc_loss": 0.05757991969585419, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002632992109283805, "grad_norm": 6.873677730560303, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8693543076515198, "num_tokens": 429217923.0, "step": 11247 }, { "epoch": 1.430861213586058, "ewc_loss": 0.05737170949578285, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002612170937936753, "grad_norm": 6.840120315551758, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8653051853179932, "num_tokens": 429251514.0, "step": 11248 }, { "epoch": 1.4309884238646482, "ewc_loss": 0.05755383521318436, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002630383532959968, "grad_norm": 6.87620210647583, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8484634160995483, "num_tokens": 429291747.0, "step": 11249 }, { "epoch": 1.4311156341432387, "ewc_loss": 0.05743837729096413, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002618837752379477, "grad_norm": 6.79221773147583, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8612850904464722, "num_tokens": 429333480.0, "step": 11250 }, { "epoch": 1.4312428444218293, "ewc_loss": 0.057519055902957916, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026269053341820836, "grad_norm": 6.893738269805908, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8766763210296631, "num_tokens": 429366482.0, "step": 11251 }, { "epoch": 1.4313700547004198, "ewc_loss": 0.05741597339510918, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026165973395109177, "grad_norm": 6.8147292137146, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8570115566253662, "num_tokens": 429400704.0, "step": 11252 }, { "epoch": 1.4314972649790103, "ewc_loss": 0.057472676038742065, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002622267638798803, "grad_norm": 6.851099014282227, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8524640798568726, "num_tokens": 429437463.0, "step": 11253 }, { "epoch": 1.4316244752576008, "ewc_loss": 0.05740068107843399, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002615068224258721, "grad_norm": 6.824010372161865, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8530968427658081, "num_tokens": 429474496.0, "step": 11254 }, { "epoch": 1.4317516855361914, "ewc_loss": 0.0574529767036438, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002620297600515187, "grad_norm": 6.875710487365723, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8509945869445801, "num_tokens": 429514641.0, "step": 11255 }, { "epoch": 1.431878895814782, "ewc_loss": 0.05744403600692749, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026194038218818605, "grad_norm": 6.838878631591797, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8650968074798584, "num_tokens": 429554054.0, "step": 11256 }, { "epoch": 1.4320061060933724, "ewc_loss": 0.057433005422353745, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026183004956692457, "grad_norm": 6.8022637367248535, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8614994883537292, "num_tokens": 429594004.0, "step": 11257 }, { "epoch": 1.4321333163719627, "ewc_loss": 0.05744354426860809, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026193546364083886, "grad_norm": 6.8206071853637695, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8711326122283936, "num_tokens": 429633019.0, "step": 11258 }, { "epoch": 1.4322605266505533, "ewc_loss": 0.05738748982548714, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002613748947624117, "grad_norm": 6.841994762420654, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8650009632110596, "num_tokens": 429666869.0, "step": 11259 }, { "epoch": 1.4323877369291438, "ewc_loss": 0.05740366876125336, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002615366829559207, "grad_norm": 6.865133762359619, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8748514652252197, "num_tokens": 429700049.0, "step": 11260 }, { "epoch": 1.4325149472077343, "ewc_loss": 0.057461708784103394, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002621170715428889, "grad_norm": 6.774864673614502, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8622961640357971, "num_tokens": 429737517.0, "step": 11261 }, { "epoch": 1.4326421574863248, "ewc_loss": 0.057516612112522125, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002626661444082856, "grad_norm": 6.817115783691406, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8775492310523987, "num_tokens": 429778697.0, "step": 11262 }, { "epoch": 1.4327693677649154, "ewc_loss": 0.05743860825896263, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002618860744405538, "grad_norm": 6.838107585906982, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8656752109527588, "num_tokens": 429811883.0, "step": 11263 }, { "epoch": 1.432896578043506, "ewc_loss": 0.05753466486930847, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026284667546860874, "grad_norm": 6.848291873931885, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.869687557220459, "num_tokens": 429851748.0, "step": 11264 }, { "epoch": 1.4330237883220964, "ewc_loss": 0.05736710876226425, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026117110974155366, "grad_norm": 6.834075450897217, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8545538187026978, "num_tokens": 429891033.0, "step": 11265 }, { "epoch": 1.433150998600687, "ewc_loss": 0.05742466449737549, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002617466379888356, "grad_norm": 6.792078495025635, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8682104349136353, "num_tokens": 429935980.0, "step": 11266 }, { "epoch": 1.4332782088792775, "ewc_loss": 0.05733099579811096, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002608099312055856, "grad_norm": 6.992076396942139, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8531018495559692, "num_tokens": 429970874.0, "step": 11267 }, { "epoch": 1.433405419157868, "ewc_loss": 0.05724921077489853, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002599920844659209, "grad_norm": 6.723308086395264, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8501687049865723, "num_tokens": 430017909.0, "step": 11268 }, { "epoch": 1.4335326294364585, "ewc_loss": 0.057501811534166336, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026251812232658267, "grad_norm": 6.934633731842041, "learning_rate": 1e-06, "loss": 0.5639, "mean_token_accuracy": 0.8387628197669983, "num_tokens": 430050641.0, "step": 11269 }, { "epoch": 1.433659839715049, "ewc_loss": 0.0572805255651474, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026030527078546584, "grad_norm": 6.807380199432373, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8680047392845154, "num_tokens": 430082985.0, "step": 11270 }, { "epoch": 1.4337870499936396, "ewc_loss": 0.05743236094713211, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002618235885165632, "grad_norm": 6.777383327484131, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.863922655582428, "num_tokens": 430122340.0, "step": 11271 }, { "epoch": 1.4339142602722301, "ewc_loss": 0.057360827922821045, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002611082745715976, "grad_norm": 6.768221378326416, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8504067659378052, "num_tokens": 430162140.0, "step": 11272 }, { "epoch": 1.4340414705508204, "ewc_loss": 0.057420406490564346, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026170405908487737, "grad_norm": 6.806482791900635, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8757023215293884, "num_tokens": 430201672.0, "step": 11273 }, { "epoch": 1.434168680829411, "ewc_loss": 0.05743037164211273, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026180371060036123, "grad_norm": 6.78670072555542, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8532218337059021, "num_tokens": 430239760.0, "step": 11274 }, { "epoch": 1.4342958911080015, "ewc_loss": 0.057396989315748215, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002614698896650225, "grad_norm": 6.812038421630859, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8462095856666565, "num_tokens": 430280527.0, "step": 11275 }, { "epoch": 1.434423101386592, "ewc_loss": 0.05740005895495415, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026150059420615435, "grad_norm": 6.830178260803223, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.861204206943512, "num_tokens": 430312788.0, "step": 11276 }, { "epoch": 1.4345503116651825, "ewc_loss": 0.05747661739587784, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002622661995701492, "grad_norm": 6.862090587615967, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8723673820495605, "num_tokens": 430348044.0, "step": 11277 }, { "epoch": 1.434677521943773, "ewc_loss": 0.057293884456157684, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026043885736726224, "grad_norm": 6.77709436416626, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8798587918281555, "num_tokens": 430385676.0, "step": 11278 }, { "epoch": 1.4348047322223636, "ewc_loss": 0.05741867423057556, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002616867423057556, "grad_norm": 6.88340425491333, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8602843284606934, "num_tokens": 430416976.0, "step": 11279 }, { "epoch": 1.4349319425009541, "ewc_loss": 0.05759535729885101, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000261012144619599, "grad_norm": 6.766786575317383, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8743016719818115, "num_tokens": 430460002.0, "step": 11280 }, { "epoch": 1.4350591527795447, "ewc_loss": 0.057444002479314804, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026194003294222057, "grad_norm": 6.841320991516113, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8425698280334473, "num_tokens": 430503172.0, "step": 11281 }, { "epoch": 1.435186363058135, "ewc_loss": 0.05740297585725784, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026152978534810245, "grad_norm": 6.790922164916992, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8517874479293823, "num_tokens": 430541221.0, "step": 11282 }, { "epoch": 1.4353135733367255, "ewc_loss": 0.05752590298652649, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002627590438351035, "grad_norm": 6.85392427444458, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8567935228347778, "num_tokens": 430579527.0, "step": 11283 }, { "epoch": 1.435440783615316, "ewc_loss": 0.057660944759845734, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026166802854277194, "grad_norm": 6.809680461883545, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8663275241851807, "num_tokens": 430617263.0, "step": 11284 }, { "epoch": 1.4355679938939065, "ewc_loss": 0.05747189372777939, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026221893494948745, "grad_norm": 6.842200756072998, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8762826323509216, "num_tokens": 430649961.0, "step": 11285 }, { "epoch": 1.435695204172497, "ewc_loss": 0.05733887851238251, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026088880258612335, "grad_norm": 6.82789421081543, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8581448793411255, "num_tokens": 430682599.0, "step": 11286 }, { "epoch": 1.4358224144510876, "ewc_loss": 0.057717833667993546, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002622369211167097, "grad_norm": 6.902312755584717, "learning_rate": 1e-06, "loss": 0.5451, "mean_token_accuracy": 0.8367745280265808, "num_tokens": 430721014.0, "step": 11287 }, { "epoch": 1.4359496247296781, "ewc_loss": 0.057339441031217575, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002608944196254015, "grad_norm": 6.826165676116943, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8630169630050659, "num_tokens": 430757014.0, "step": 11288 }, { "epoch": 1.4360768350082687, "ewc_loss": 0.05736441910266876, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002611441886983812, "grad_norm": 6.782329559326172, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8647813200950623, "num_tokens": 430796169.0, "step": 11289 }, { "epoch": 1.4362040452868592, "ewc_loss": 0.05743845924735069, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002618845901452005, "grad_norm": 6.899252414703369, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.863012969493866, "num_tokens": 430832726.0, "step": 11290 }, { "epoch": 1.4363312555654497, "ewc_loss": 0.05727218836545944, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002602218883112073, "grad_norm": 6.821177005767822, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8528210520744324, "num_tokens": 430866343.0, "step": 11291 }, { "epoch": 1.4364584658440402, "ewc_loss": 0.05741773545742035, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002616773417685181, "grad_norm": 6.869301795959473, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8698794841766357, "num_tokens": 430901622.0, "step": 11292 }, { "epoch": 1.4365856761226308, "ewc_loss": 0.05727583169937134, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002602583263069391, "grad_norm": 6.8348164558410645, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8769496083259583, "num_tokens": 430937499.0, "step": 11293 }, { "epoch": 1.4367128864012213, "ewc_loss": 0.05731138586997986, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002606138586997986, "grad_norm": 6.84431791305542, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8508346080780029, "num_tokens": 430974048.0, "step": 11294 }, { "epoch": 1.4368400966798118, "ewc_loss": 0.05730545148253441, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002605545159894973, "grad_norm": 6.853452205657959, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8719601631164551, "num_tokens": 431010927.0, "step": 11295 }, { "epoch": 1.4369673069584024, "ewc_loss": 0.05727212131023407, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002602212189231068, "grad_norm": 6.864561557769775, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8485814929008484, "num_tokens": 431051731.0, "step": 11296 }, { "epoch": 1.4370945172369929, "ewc_loss": 0.057615410536527634, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026121269911527634, "grad_norm": 6.812119960784912, "learning_rate": 1e-06, "loss": 0.3987, "mean_token_accuracy": 0.8790884017944336, "num_tokens": 431087876.0, "step": 11297 }, { "epoch": 1.4372217275155832, "ewc_loss": 0.05720546841621399, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002595546829979867, "grad_norm": 6.794349193572998, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8578171730041504, "num_tokens": 431126217.0, "step": 11298 }, { "epoch": 1.4373489377941737, "ewc_loss": 0.05764641612768173, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002615227713249624, "grad_norm": 6.797693729400635, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8645662069320679, "num_tokens": 431168049.0, "step": 11299 }, { "epoch": 1.4374761480727642, "ewc_loss": 0.05744658783078194, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002595244732219726, "grad_norm": 6.755680561065674, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8606757521629333, "num_tokens": 431211143.0, "step": 11300 }, { "epoch": 1.4376033583513548, "ewc_loss": 0.05735177546739578, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002610177325550467, "grad_norm": 6.812283039093018, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8709542751312256, "num_tokens": 431255747.0, "step": 11301 }, { "epoch": 1.4377305686299453, "ewc_loss": 0.05725798010826111, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002600798034109175, "grad_norm": 6.838921546936035, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8674321174621582, "num_tokens": 431296618.0, "step": 11302 }, { "epoch": 1.4378577789085358, "ewc_loss": 0.05729036033153534, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002604035835247487, "grad_norm": 6.875681400299072, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8648683428764343, "num_tokens": 431336493.0, "step": 11303 }, { "epoch": 1.4379849891871264, "ewc_loss": 0.05722619220614433, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002597619313746691, "grad_norm": 6.862178802490234, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8608173131942749, "num_tokens": 431376200.0, "step": 11304 }, { "epoch": 1.438112199465717, "ewc_loss": 0.05719880387187004, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025948803522624075, "grad_norm": 6.89952278137207, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8437806963920593, "num_tokens": 431414015.0, "step": 11305 }, { "epoch": 1.4382394097443074, "ewc_loss": 0.05710263550281525, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002585263573564589, "grad_norm": 6.80706262588501, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8588333129882812, "num_tokens": 431453498.0, "step": 11306 }, { "epoch": 1.4383666200228977, "ewc_loss": 0.05714826285839081, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002589826181065291, "grad_norm": 6.926091194152832, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8636467456817627, "num_tokens": 431491126.0, "step": 11307 }, { "epoch": 1.4384938303014883, "ewc_loss": 0.057344481348991394, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00025850339443422854, "grad_norm": 6.818636894226074, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.88612961769104, "num_tokens": 431528549.0, "step": 11308 }, { "epoch": 1.4386210405800788, "ewc_loss": 0.05721861869096756, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002596861741039902, "grad_norm": 6.908062934875488, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.869990348815918, "num_tokens": 431569435.0, "step": 11309 }, { "epoch": 1.4387482508586693, "ewc_loss": 0.05697937309741974, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002572937519289553, "grad_norm": 6.777074337005615, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8695206642150879, "num_tokens": 431606125.0, "step": 11310 }, { "epoch": 1.4388754611372598, "ewc_loss": 0.05725499242544174, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026004991377703846, "grad_norm": 7.025506019592285, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.858986496925354, "num_tokens": 431643517.0, "step": 11311 }, { "epoch": 1.4390026714158504, "ewc_loss": 0.05690666288137436, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002565666218288243, "grad_norm": 6.709437847137451, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8727160692214966, "num_tokens": 431682058.0, "step": 11312 }, { "epoch": 1.439129881694441, "ewc_loss": 0.05727595463395119, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002602595486678183, "grad_norm": 7.005990028381348, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8754844665527344, "num_tokens": 431721241.0, "step": 11313 }, { "epoch": 1.4392570919730314, "ewc_loss": 0.05694565549492836, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002569565549492836, "grad_norm": 6.692560195922852, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8759973645210266, "num_tokens": 431757809.0, "step": 11314 }, { "epoch": 1.439384302251622, "ewc_loss": 0.05742860585451126, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026178607367910445, "grad_norm": 6.921712875366211, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8532276153564453, "num_tokens": 431795931.0, "step": 11315 }, { "epoch": 1.4395115125302125, "ewc_loss": 0.05707066133618355, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002582066226750612, "grad_norm": 6.8309221267700195, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8734087944030762, "num_tokens": 431830517.0, "step": 11316 }, { "epoch": 1.439638722808803, "ewc_loss": 0.057247456163167953, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025997456395998597, "grad_norm": 6.799813270568848, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8664864301681519, "num_tokens": 431866047.0, "step": 11317 }, { "epoch": 1.4397659330873935, "ewc_loss": 0.05730781704187393, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026057814829982817, "grad_norm": 6.9717326164245605, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8521842956542969, "num_tokens": 431901602.0, "step": 11318 }, { "epoch": 1.439893143365984, "ewc_loss": 0.057129282504320145, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025879283202812076, "grad_norm": 6.819654941558838, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8596591949462891, "num_tokens": 431934989.0, "step": 11319 }, { "epoch": 1.4400203536445746, "ewc_loss": 0.05726533383131027, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002601533487904817, "grad_norm": 6.873507022857666, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8690296411514282, "num_tokens": 431968664.0, "step": 11320 }, { "epoch": 1.4401475639231651, "ewc_loss": 0.057188548147678375, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002593855024315417, "grad_norm": 6.852989196777344, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8753470778465271, "num_tokens": 432008025.0, "step": 11321 }, { "epoch": 1.4402747742017554, "ewc_loss": 0.05726853013038635, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002601852756924927, "grad_norm": 6.890390396118164, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8666233420372009, "num_tokens": 432043526.0, "step": 11322 }, { "epoch": 1.440401984480346, "ewc_loss": 0.057159218937158585, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002590921940281987, "grad_norm": 6.766826629638672, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8648196458816528, "num_tokens": 432085175.0, "step": 11323 }, { "epoch": 1.4405291947589365, "ewc_loss": 0.057301115244627, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026051115128211677, "grad_norm": 6.94456672668457, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8545693159103394, "num_tokens": 432122379.0, "step": 11324 }, { "epoch": 1.440656405037527, "ewc_loss": 0.05716800317168236, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025918002938851714, "grad_norm": 6.770324230194092, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8665838837623596, "num_tokens": 432157690.0, "step": 11325 }, { "epoch": 1.4407836153161175, "ewc_loss": 0.05728932470083237, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002603932225611061, "grad_norm": 6.891844749450684, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8719470500946045, "num_tokens": 432188885.0, "step": 11326 }, { "epoch": 1.440910825594708, "ewc_loss": 0.057204365730285645, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025954365264624357, "grad_norm": 6.827037811279297, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.864746630191803, "num_tokens": 432228804.0, "step": 11327 }, { "epoch": 1.4410380358732986, "ewc_loss": 0.0572696216404438, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026019621873274446, "grad_norm": 6.796115398406982, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8461811542510986, "num_tokens": 432264150.0, "step": 11328 }, { "epoch": 1.4411652461518891, "ewc_loss": 0.05726694315671921, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026016944320872426, "grad_norm": 6.826788902282715, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8577356338500977, "num_tokens": 432306759.0, "step": 11329 }, { "epoch": 1.4412924564304797, "ewc_loss": 0.05721144378185272, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025961443316191435, "grad_norm": 6.832028388977051, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8736558556556702, "num_tokens": 432346928.0, "step": 11330 }, { "epoch": 1.44141966670907, "ewc_loss": 0.05726853385567665, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026018533390015364, "grad_norm": 6.7338480949401855, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8709436655044556, "num_tokens": 432386630.0, "step": 11331 }, { "epoch": 1.4415468769876605, "ewc_loss": 0.05734556168317795, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000260955624980852, "grad_norm": 6.832675457000732, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8518556356430054, "num_tokens": 432427331.0, "step": 11332 }, { "epoch": 1.441674087266251, "ewc_loss": 0.057291314005851746, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002604131295811385, "grad_norm": 6.81093692779541, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8554505109786987, "num_tokens": 432462901.0, "step": 11333 }, { "epoch": 1.4418012975448415, "ewc_loss": 0.0573432520031929, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002609325456432998, "grad_norm": 6.831873893737793, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8618320226669312, "num_tokens": 432494232.0, "step": 11334 }, { "epoch": 1.441928507823432, "ewc_loss": 0.05724674090743065, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002599674044176936, "grad_norm": 6.80648136138916, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.875837504863739, "num_tokens": 432534276.0, "step": 11335 }, { "epoch": 1.4420557181020226, "ewc_loss": 0.057334162294864655, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002608415961731225, "grad_norm": 6.784485816955566, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8558347225189209, "num_tokens": 432577337.0, "step": 11336 }, { "epoch": 1.4421829283806131, "ewc_loss": 0.05731097608804703, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002606097550597042, "grad_norm": 6.81986665725708, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8566335439682007, "num_tokens": 432614290.0, "step": 11337 }, { "epoch": 1.4423101386592037, "ewc_loss": 0.057419367134571075, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002616936690174043, "grad_norm": 6.818421840667725, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8645561337471008, "num_tokens": 432648558.0, "step": 11338 }, { "epoch": 1.4424373489377942, "ewc_loss": 0.05737163871526718, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026121639530174434, "grad_norm": 6.86838960647583, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8373731970787048, "num_tokens": 432684243.0, "step": 11339 }, { "epoch": 1.4425645592163847, "ewc_loss": 0.05733303725719452, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002608303911983967, "grad_norm": 6.840817451477051, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8631200790405273, "num_tokens": 432719355.0, "step": 11340 }, { "epoch": 1.4426917694949752, "ewc_loss": 0.05762399733066559, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026129858451895416, "grad_norm": 6.831101417541504, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8641146421432495, "num_tokens": 432758675.0, "step": 11341 }, { "epoch": 1.4428189797735658, "ewc_loss": 0.05737000331282616, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026120003894902766, "grad_norm": 6.840602874755859, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8605260848999023, "num_tokens": 432801677.0, "step": 11342 }, { "epoch": 1.4429461900521563, "ewc_loss": 0.05737590044736862, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000261259003309533, "grad_norm": 6.860320568084717, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8621358871459961, "num_tokens": 432836299.0, "step": 11343 }, { "epoch": 1.4430734003307468, "ewc_loss": 0.0576055683195591, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026111426996067166, "grad_norm": 6.8728346824646, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8620879650115967, "num_tokens": 432868838.0, "step": 11344 }, { "epoch": 1.4432006106093374, "ewc_loss": 0.05739311873912811, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002614312106743455, "grad_norm": 6.793461322784424, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8654369115829468, "num_tokens": 432904895.0, "step": 11345 }, { "epoch": 1.4433278208879279, "ewc_loss": 0.05742901563644409, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026179017731919885, "grad_norm": 6.870446681976318, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8582291603088379, "num_tokens": 432944785.0, "step": 11346 }, { "epoch": 1.4434550311665182, "ewc_loss": 0.05736132711172104, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026111328043043613, "grad_norm": 6.854703426361084, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8524386286735535, "num_tokens": 432975645.0, "step": 11347 }, { "epoch": 1.4435822414451087, "ewc_loss": 0.05744530260562897, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026195304235443473, "grad_norm": 6.810555458068848, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8585086464881897, "num_tokens": 433016887.0, "step": 11348 }, { "epoch": 1.4437094517236992, "ewc_loss": 0.05738522857427597, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002613523101899773, "grad_norm": 6.856479167938232, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8649531602859497, "num_tokens": 433053666.0, "step": 11349 }, { "epoch": 1.4438366620022898, "ewc_loss": 0.05733884125947952, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026088839513249695, "grad_norm": 7.515476226806641, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8584970831871033, "num_tokens": 433101131.0, "step": 11350 }, { "epoch": 1.4439638722808803, "ewc_loss": 0.057170119136571884, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002592011878732592, "grad_norm": 6.755913734436035, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8698657751083374, "num_tokens": 433132535.0, "step": 11351 }, { "epoch": 1.4440910825594708, "ewc_loss": 0.05733928829431534, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002608928771223873, "grad_norm": 6.8887715339660645, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8489314317703247, "num_tokens": 433168229.0, "step": 11352 }, { "epoch": 1.4442182928380614, "ewc_loss": 0.056921493262052536, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002567149349488318, "grad_norm": 6.728696823120117, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8674968481063843, "num_tokens": 433206094.0, "step": 11353 }, { "epoch": 1.4443455031166519, "ewc_loss": 0.057418256998062134, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026168255135416985, "grad_norm": 6.889813423156738, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8517981767654419, "num_tokens": 433241038.0, "step": 11354 }, { "epoch": 1.4444727133952424, "ewc_loss": 0.05714686959981918, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025896867737174034, "grad_norm": 6.814043998718262, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8559858798980713, "num_tokens": 433274674.0, "step": 11355 }, { "epoch": 1.4445999236738327, "ewc_loss": 0.05732877552509308, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026078775408677757, "grad_norm": 6.873901844024658, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8672268986701965, "num_tokens": 433314511.0, "step": 11356 }, { "epoch": 1.4447271339524232, "ewc_loss": 0.0572851225733757, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000260351225733757, "grad_norm": 6.840373992919922, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8652936220169067, "num_tokens": 433349338.0, "step": 11357 }, { "epoch": 1.4448543442310138, "ewc_loss": 0.05727938562631607, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026029383298009634, "grad_norm": 6.8269453048706055, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8727084994316101, "num_tokens": 433382609.0, "step": 11358 }, { "epoch": 1.4449815545096043, "ewc_loss": 0.05728907510638237, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026039074873551726, "grad_norm": 6.882655143737793, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8779768943786621, "num_tokens": 433411553.0, "step": 11359 }, { "epoch": 1.4451087647881948, "ewc_loss": 0.057209692895412445, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00025959694175980985, "grad_norm": 6.796487808227539, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8600480556488037, "num_tokens": 433450571.0, "step": 11360 }, { "epoch": 1.4452359750667854, "ewc_loss": 0.05732128769159317, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002607128699310124, "grad_norm": 6.849684238433838, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.858377993106842, "num_tokens": 433486994.0, "step": 11361 }, { "epoch": 1.445363185345376, "ewc_loss": 0.05727776139974594, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026027762214653194, "grad_norm": 6.771838665008545, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8749309182167053, "num_tokens": 433521407.0, "step": 11362 }, { "epoch": 1.4454903956239664, "ewc_loss": 0.05740983784198761, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002615983539726585, "grad_norm": 6.790476322174072, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8756230473518372, "num_tokens": 433561338.0, "step": 11363 }, { "epoch": 1.445617605902557, "ewc_loss": 0.05733712017536163, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002608712238725275, "grad_norm": 6.83023738861084, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8713656663894653, "num_tokens": 433598892.0, "step": 11364 }, { "epoch": 1.4457448161811475, "ewc_loss": 0.05733618140220642, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026086182333528996, "grad_norm": 6.803825855255127, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8539345860481262, "num_tokens": 433641423.0, "step": 11365 }, { "epoch": 1.445872026459738, "ewc_loss": 0.05763866379857063, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026144523872062564, "grad_norm": 7.49303674697876, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8665523529052734, "num_tokens": 433677385.0, "step": 11366 }, { "epoch": 1.4459992367383285, "ewc_loss": 0.05748288333415985, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002598874270915985, "grad_norm": 6.700306415557861, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8614858388900757, "num_tokens": 433716901.0, "step": 11367 }, { "epoch": 1.446126447016919, "ewc_loss": 0.05758431553840637, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026334315771237016, "grad_norm": 6.937741756439209, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8594210147857666, "num_tokens": 433755099.0, "step": 11368 }, { "epoch": 1.4462536572955096, "ewc_loss": 0.05698008090257645, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002573008241597563, "grad_norm": 6.6833672523498535, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8593862056732178, "num_tokens": 433794012.0, "step": 11369 }, { "epoch": 1.4463808675741001, "ewc_loss": 0.05765211582183838, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002640211605466902, "grad_norm": 6.9533915519714355, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8670266270637512, "num_tokens": 433831965.0, "step": 11370 }, { "epoch": 1.4465080778526904, "ewc_loss": 0.057189952582120895, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002593995304778218, "grad_norm": 6.761768817901611, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.863978385925293, "num_tokens": 433870819.0, "step": 11371 }, { "epoch": 1.446635288131281, "ewc_loss": 0.057521477341651917, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026271474780514836, "grad_norm": 6.85823917388916, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8577682971954346, "num_tokens": 433909197.0, "step": 11372 }, { "epoch": 1.4467624984098715, "ewc_loss": 0.057302914559841156, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026052913744933903, "grad_norm": 6.77838134765625, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8699271082878113, "num_tokens": 433946026.0, "step": 11373 }, { "epoch": 1.446889708688462, "ewc_loss": 0.05743420496582985, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026184204034507275, "grad_norm": 6.784544944763184, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8845161199569702, "num_tokens": 433986239.0, "step": 11374 }, { "epoch": 1.4470169189670525, "ewc_loss": 0.057446062564849854, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026196063845418394, "grad_norm": 6.81455135345459, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8681557178497314, "num_tokens": 434022557.0, "step": 11375 }, { "epoch": 1.447144129245643, "ewc_loss": 0.057489775121212006, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002623977488838136, "grad_norm": 6.79188346862793, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8688153028488159, "num_tokens": 434059258.0, "step": 11376 }, { "epoch": 1.4472713395242336, "ewc_loss": 0.057434309273958206, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002618430880829692, "grad_norm": 6.863310813903809, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8489155769348145, "num_tokens": 434094872.0, "step": 11377 }, { "epoch": 1.4473985498028241, "ewc_loss": 0.05734466761350632, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026094666100107133, "grad_norm": 6.875147819519043, "learning_rate": 1e-06, "loss": 0.5341, "mean_token_accuracy": 0.8395757675170898, "num_tokens": 434134842.0, "step": 11378 }, { "epoch": 1.4475257600814146, "ewc_loss": 0.05741775408387184, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002616775454953313, "grad_norm": 6.800406455993652, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8749479055404663, "num_tokens": 434173152.0, "step": 11379 }, { "epoch": 1.447652970360005, "ewc_loss": 0.05735766142606735, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026107660960406065, "grad_norm": 6.852824687957764, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.865749180316925, "num_tokens": 434208318.0, "step": 11380 }, { "epoch": 1.4477801806385955, "ewc_loss": 0.05741887539625168, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002616887795738876, "grad_norm": 6.852785110473633, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8608808517456055, "num_tokens": 434243693.0, "step": 11381 }, { "epoch": 1.447907390917186, "ewc_loss": 0.05729830265045166, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026048303698189557, "grad_norm": 6.798233509063721, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8520361185073853, "num_tokens": 434283172.0, "step": 11382 }, { "epoch": 1.4480346011957765, "ewc_loss": 0.057445064187049866, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002619506558403373, "grad_norm": 6.834937572479248, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8647298216819763, "num_tokens": 434318789.0, "step": 11383 }, { "epoch": 1.448161811474367, "ewc_loss": 0.0573563426733017, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026106342556886375, "grad_norm": 6.761708736419678, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8561861515045166, "num_tokens": 434358943.0, "step": 11384 }, { "epoch": 1.4482890217529576, "ewc_loss": 0.057476818561553955, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026226817863062024, "grad_norm": 6.796504020690918, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8650487065315247, "num_tokens": 434398264.0, "step": 11385 }, { "epoch": 1.4484162320315481, "ewc_loss": 0.057368818670511246, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026118819369003177, "grad_norm": 6.825242042541504, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8739681839942932, "num_tokens": 434433602.0, "step": 11386 }, { "epoch": 1.4485434423101387, "ewc_loss": 0.057410504668951035, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026160504785366356, "grad_norm": 6.741921901702881, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8574024438858032, "num_tokens": 434480001.0, "step": 11387 }, { "epoch": 1.4486706525887292, "ewc_loss": 0.05751693248748779, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026266934582963586, "grad_norm": 6.8184003829956055, "learning_rate": 1e-06, "loss": 0.4069, "mean_token_accuracy": 0.8788975477218628, "num_tokens": 434519285.0, "step": 11388 }, { "epoch": 1.4487978628673197, "ewc_loss": 0.05745857581496239, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026208575582131743, "grad_norm": 6.836500644683838, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8628567457199097, "num_tokens": 434556118.0, "step": 11389 }, { "epoch": 1.4489250731459102, "ewc_loss": 0.05747372657060623, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002622372703626752, "grad_norm": 6.825427055358887, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8482105731964111, "num_tokens": 434595350.0, "step": 11390 }, { "epoch": 1.4490522834245008, "ewc_loss": 0.05741231143474579, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026162309222854674, "grad_norm": 6.789754867553711, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8612077832221985, "num_tokens": 434632988.0, "step": 11391 }, { "epoch": 1.4491794937030913, "ewc_loss": 0.05752136558294296, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000262713641859591, "grad_norm": 6.823371887207031, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8709962368011475, "num_tokens": 434673040.0, "step": 11392 }, { "epoch": 1.4493067039816818, "ewc_loss": 0.057442426681518555, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000261924258666113, "grad_norm": 6.765898704528809, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8805923461914062, "num_tokens": 434712201.0, "step": 11393 }, { "epoch": 1.4494339142602723, "ewc_loss": 0.05756513029336929, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026315130526199937, "grad_norm": 6.819094181060791, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8776655197143555, "num_tokens": 434756676.0, "step": 11394 }, { "epoch": 1.4495611245388629, "ewc_loss": 0.05749792978167534, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002624792978167534, "grad_norm": 6.831791877746582, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8676104545593262, "num_tokens": 434797890.0, "step": 11395 }, { "epoch": 1.4496883348174532, "ewc_loss": 0.05756889656186104, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002631889656186104, "grad_norm": 6.871836185455322, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8761094808578491, "num_tokens": 434836933.0, "step": 11396 }, { "epoch": 1.4498155450960437, "ewc_loss": 0.05740805342793465, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000261580542428419, "grad_norm": 6.840902328491211, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8775438070297241, "num_tokens": 434868098.0, "step": 11397 }, { "epoch": 1.4499427553746342, "ewc_loss": 0.05754522979259491, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002629523223731667, "grad_norm": 6.842952728271484, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8583208322525024, "num_tokens": 434907239.0, "step": 11398 }, { "epoch": 1.4500699656532248, "ewc_loss": 0.05743734538555145, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000261873472481966, "grad_norm": 6.844610214233398, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8702933192253113, "num_tokens": 434945672.0, "step": 11399 }, { "epoch": 1.4501971759318153, "ewc_loss": 0.05754285678267479, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000262928573647514, "grad_norm": 6.840709209442139, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8541226983070374, "num_tokens": 434981693.0, "step": 11400 }, { "epoch": 1.4503243862104058, "ewc_loss": 0.05752978101372719, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002627978101372719, "grad_norm": 6.8489274978637695, "learning_rate": 1e-06, "loss": 0.3784, "mean_token_accuracy": 0.8866013288497925, "num_tokens": 435012452.0, "step": 11401 }, { "epoch": 1.4504515964889964, "ewc_loss": 0.05750970542430878, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002625970810186118, "grad_norm": 6.822204113006592, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8580684065818787, "num_tokens": 435052711.0, "step": 11402 }, { "epoch": 1.4505788067675869, "ewc_loss": 0.0575675442814827, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026317546144127846, "grad_norm": 6.831106185913086, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8624156713485718, "num_tokens": 435092766.0, "step": 11403 }, { "epoch": 1.4507060170461774, "ewc_loss": 0.057444777339696884, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026194777456112206, "grad_norm": 6.839001655578613, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8489446640014648, "num_tokens": 435132093.0, "step": 11404 }, { "epoch": 1.4508332273247677, "ewc_loss": 0.057558298110961914, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026308296946808696, "grad_norm": 6.859511852264404, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8605595231056213, "num_tokens": 435169071.0, "step": 11405 }, { "epoch": 1.4509604376033582, "ewc_loss": 0.05740562081336975, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026155621162615716, "grad_norm": 6.852932453155518, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8736428022384644, "num_tokens": 435202665.0, "step": 11406 }, { "epoch": 1.4510876478819488, "ewc_loss": 0.057515185326337814, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026265185442753136, "grad_norm": 6.83378267288208, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8782660365104675, "num_tokens": 435234719.0, "step": 11407 }, { "epoch": 1.4512148581605393, "ewc_loss": 0.05749641731381416, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002624641638249159, "grad_norm": 6.879278182983398, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8792531490325928, "num_tokens": 435267090.0, "step": 11408 }, { "epoch": 1.4513420684391298, "ewc_loss": 0.05744399130344391, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002619398874230683, "grad_norm": 6.810976505279541, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8718729019165039, "num_tokens": 435307243.0, "step": 11409 }, { "epoch": 1.4514692787177204, "ewc_loss": 0.05752791464328766, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026277912547811866, "grad_norm": 6.8557915687561035, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8721661567687988, "num_tokens": 435347507.0, "step": 11410 }, { "epoch": 1.4515964889963109, "ewc_loss": 0.05741043761372566, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026160437846556306, "grad_norm": 6.870894432067871, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8542695045471191, "num_tokens": 435383499.0, "step": 11411 }, { "epoch": 1.4517236992749014, "ewc_loss": 0.05743253231048584, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002618253347463906, "grad_norm": 6.801932334899902, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8635885715484619, "num_tokens": 435423500.0, "step": 11412 }, { "epoch": 1.451850909553492, "ewc_loss": 0.05747031047940254, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000262203102465719, "grad_norm": 6.894351482391357, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8673189282417297, "num_tokens": 435457075.0, "step": 11413 }, { "epoch": 1.4519781198320825, "ewc_loss": 0.05733134597539902, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002608134818729013, "grad_norm": 6.80886697769165, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.862913966178894, "num_tokens": 435498933.0, "step": 11414 }, { "epoch": 1.452105330110673, "ewc_loss": 0.057484693825244904, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002623469626996666, "grad_norm": 6.874989032745361, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8557038307189941, "num_tokens": 435535949.0, "step": 11415 }, { "epoch": 1.4522325403892635, "ewc_loss": 0.057360872626304626, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026110871112905443, "grad_norm": 6.850553512573242, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8570442795753479, "num_tokens": 435565252.0, "step": 11416 }, { "epoch": 1.452359750667854, "ewc_loss": 0.057453371584415436, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002620337181724608, "grad_norm": 6.890456199645996, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.844576358795166, "num_tokens": 435604663.0, "step": 11417 }, { "epoch": 1.4524869609464446, "ewc_loss": 0.05733879655599594, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026088798767887056, "grad_norm": 6.813614368438721, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8585758209228516, "num_tokens": 435640883.0, "step": 11418 }, { "epoch": 1.4526141712250351, "ewc_loss": 0.05746585875749588, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002621585736051202, "grad_norm": 6.825872898101807, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.867402195930481, "num_tokens": 435682787.0, "step": 11419 }, { "epoch": 1.4527413815036254, "ewc_loss": 0.05738666653633118, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002613666874822229, "grad_norm": 6.855594158172607, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8611621856689453, "num_tokens": 435719982.0, "step": 11420 }, { "epoch": 1.452868591782216, "ewc_loss": 0.0574946403503418, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002624464104883373, "grad_norm": 6.893550395965576, "learning_rate": 1e-06, "loss": 0.5234, "mean_token_accuracy": 0.8467632532119751, "num_tokens": 435755153.0, "step": 11421 }, { "epoch": 1.4529958020608065, "ewc_loss": 0.05735494941473007, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000261049484834075, "grad_norm": 6.758847236633301, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.872687578201294, "num_tokens": 435797513.0, "step": 11422 }, { "epoch": 1.453123012339397, "ewc_loss": 0.05777072161436081, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002627657959237695, "grad_norm": 6.900244235992432, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8637056350708008, "num_tokens": 435836835.0, "step": 11423 }, { "epoch": 1.4532502226179875, "ewc_loss": 0.05740293115377426, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026152931968681514, "grad_norm": 6.826377868652344, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.855101466178894, "num_tokens": 435875124.0, "step": 11424 }, { "epoch": 1.453377432896578, "ewc_loss": 0.057675402611494064, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000261812616372481, "grad_norm": 6.847834587097168, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8758534789085388, "num_tokens": 435914993.0, "step": 11425 }, { "epoch": 1.4535046431751686, "ewc_loss": 0.05765564739704132, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026161508867517114, "grad_norm": 6.906036853790283, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8727896213531494, "num_tokens": 435950916.0, "step": 11426 }, { "epoch": 1.4536318534537591, "ewc_loss": 0.0575670450925827, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002607290225569159, "grad_norm": 6.818517684936523, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8580222725868225, "num_tokens": 435991986.0, "step": 11427 }, { "epoch": 1.4537590637323496, "ewc_loss": 0.05776719003915787, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026273049297742546, "grad_norm": 7.024831295013428, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8725252151489258, "num_tokens": 436024603.0, "step": 11428 }, { "epoch": 1.45388627401094, "ewc_loss": 0.05715204030275345, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002590204239822924, "grad_norm": 6.747381687164307, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8592092990875244, "num_tokens": 436065540.0, "step": 11429 }, { "epoch": 1.4540134842895305, "ewc_loss": 0.05747252702713013, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000262225279584527, "grad_norm": 6.984024524688721, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8508090972900391, "num_tokens": 436104394.0, "step": 11430 }, { "epoch": 1.454140694568121, "ewc_loss": 0.05747867375612259, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002598453429527581, "grad_norm": 6.835469722747803, "learning_rate": 1e-06, "loss": 0.5163, "mean_token_accuracy": 0.8499600887298584, "num_tokens": 436139977.0, "step": 11431 }, { "epoch": 1.4542679048467115, "ewc_loss": 0.05743233487010002, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002618233556859195, "grad_norm": 6.913329124450684, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8789311647415161, "num_tokens": 436178047.0, "step": 11432 }, { "epoch": 1.454395115125302, "ewc_loss": 0.057456716895103455, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000259625754551962, "grad_norm": 6.800591468811035, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8621904850006104, "num_tokens": 436218331.0, "step": 11433 }, { "epoch": 1.4545223254038926, "ewc_loss": 0.05762826278805733, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026134122163057327, "grad_norm": 6.854894638061523, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8618937730789185, "num_tokens": 436255663.0, "step": 11434 }, { "epoch": 1.4546495356824831, "ewc_loss": 0.0576116144657135, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026117474772036076, "grad_norm": 6.803247928619385, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.863960325717926, "num_tokens": 436301328.0, "step": 11435 }, { "epoch": 1.4547767459610736, "ewc_loss": 0.05735669285058975, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026106691802851856, "grad_norm": 6.846011638641357, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8718155026435852, "num_tokens": 436336487.0, "step": 11436 }, { "epoch": 1.4549039562396642, "ewc_loss": 0.05743476748466492, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002618476573843509, "grad_norm": 6.896005153656006, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8664641380310059, "num_tokens": 436377629.0, "step": 11437 }, { "epoch": 1.4550311665182547, "ewc_loss": 0.057412635535001755, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002616263518575579, "grad_norm": 6.807360649108887, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8791212439537048, "num_tokens": 436421590.0, "step": 11438 }, { "epoch": 1.4551583767968452, "ewc_loss": 0.05744241923093796, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026192417135462165, "grad_norm": 6.827822208404541, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8564620614051819, "num_tokens": 436461172.0, "step": 11439 }, { "epoch": 1.4552855870754358, "ewc_loss": 0.0574427992105484, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002619279839564115, "grad_norm": 6.867593288421631, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8728424906730652, "num_tokens": 436497536.0, "step": 11440 }, { "epoch": 1.4554127973540263, "ewc_loss": 0.05741683393716812, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002616683195810765, "grad_norm": 6.820853233337402, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8609709143638611, "num_tokens": 436543906.0, "step": 11441 }, { "epoch": 1.4555400076326168, "ewc_loss": 0.05747227370738983, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002622227184474468, "grad_norm": 6.854278564453125, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8663874864578247, "num_tokens": 436579462.0, "step": 11442 }, { "epoch": 1.4556672179112073, "ewc_loss": 0.057481154799461365, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002623115433380008, "grad_norm": 6.912461280822754, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.85628741979599, "num_tokens": 436618840.0, "step": 11443 }, { "epoch": 1.4557944281897979, "ewc_loss": 0.05738763138651848, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002613763208501041, "grad_norm": 6.869384765625, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8524494171142578, "num_tokens": 436654176.0, "step": 11444 }, { "epoch": 1.4559216384683882, "ewc_loss": 0.05738566815853119, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002613566757645458, "grad_norm": 6.8889570236206055, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8782016038894653, "num_tokens": 436685507.0, "step": 11445 }, { "epoch": 1.4560488487469787, "ewc_loss": 0.057422325015068054, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026172323850914836, "grad_norm": 6.840538501739502, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8716944456100464, "num_tokens": 436729881.0, "step": 11446 }, { "epoch": 1.4561760590255692, "ewc_loss": 0.0574612133204937, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026211212389171124, "grad_norm": 6.880155563354492, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8631595373153687, "num_tokens": 436772355.0, "step": 11447 }, { "epoch": 1.4563032693041598, "ewc_loss": 0.05736679583787918, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026116796652786434, "grad_norm": 6.817680835723877, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8569929599761963, "num_tokens": 436808701.0, "step": 11448 }, { "epoch": 1.4564304795827503, "ewc_loss": 0.057536669075489044, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002628666698001325, "grad_norm": 6.918086528778076, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8734246492385864, "num_tokens": 436845441.0, "step": 11449 }, { "epoch": 1.4565576898613408, "ewc_loss": 0.05766803398728371, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026173892547376454, "grad_norm": 6.8728718757629395, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8669334053993225, "num_tokens": 436885316.0, "step": 11450 }, { "epoch": 1.4566849001399313, "ewc_loss": 0.05749610811471939, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026246107881888747, "grad_norm": 6.889647960662842, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8574633598327637, "num_tokens": 436918315.0, "step": 11451 }, { "epoch": 1.4568121104185219, "ewc_loss": 0.05747107043862343, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002622106985654682, "grad_norm": 6.9373979568481445, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8564451932907104, "num_tokens": 436956039.0, "step": 11452 }, { "epoch": 1.4569393206971124, "ewc_loss": 0.05741463974118233, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002616464043967426, "grad_norm": 6.905430316925049, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8495895266532898, "num_tokens": 436991274.0, "step": 11453 }, { "epoch": 1.4570665309757027, "ewc_loss": 0.057447075843811035, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002619707665871829, "grad_norm": 6.842204570770264, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8749586343765259, "num_tokens": 437029963.0, "step": 11454 }, { "epoch": 1.4571937412542932, "ewc_loss": 0.057494085282087326, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026244085165672004, "grad_norm": 6.9274492263793945, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8569980263710022, "num_tokens": 437067030.0, "step": 11455 }, { "epoch": 1.4573209515328838, "ewc_loss": 0.05738802254199982, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026138024986721575, "grad_norm": 6.870769023895264, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8702593445777893, "num_tokens": 437103363.0, "step": 11456 }, { "epoch": 1.4574481618114743, "ewc_loss": 0.057514727115631104, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026264728512614965, "grad_norm": 6.891700267791748, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8713862895965576, "num_tokens": 437131574.0, "step": 11457 }, { "epoch": 1.4575753720900648, "ewc_loss": 0.05739659070968628, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002614659024402499, "grad_norm": 6.818650245666504, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8580003380775452, "num_tokens": 437169537.0, "step": 11458 }, { "epoch": 1.4577025823686554, "ewc_loss": 0.057536568492650986, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000262865680269897, "grad_norm": 6.863708972930908, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8678503036499023, "num_tokens": 437208025.0, "step": 11459 }, { "epoch": 1.4578297926472459, "ewc_loss": 0.05749011039733887, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002624011249281466, "grad_norm": 6.831840991973877, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.875355064868927, "num_tokens": 437246642.0, "step": 11460 }, { "epoch": 1.4579570029258364, "ewc_loss": 0.05758574232459068, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026335741858929396, "grad_norm": 6.8807597160339355, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8600780963897705, "num_tokens": 437285274.0, "step": 11461 }, { "epoch": 1.458084213204427, "ewc_loss": 0.05750517174601555, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026255170814692974, "grad_norm": 6.764750957489014, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8798349499702454, "num_tokens": 437325865.0, "step": 11462 }, { "epoch": 1.4582114234830175, "ewc_loss": 0.0576072558760643, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026357255410403013, "grad_norm": 6.872317314147949, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8692423105239868, "num_tokens": 437364093.0, "step": 11463 }, { "epoch": 1.458338633761608, "ewc_loss": 0.057579271495342255, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026329272077418864, "grad_norm": 6.851004123687744, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8515506982803345, "num_tokens": 437402413.0, "step": 11464 }, { "epoch": 1.4584658440401985, "ewc_loss": 0.0576322078704834, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026382209034636617, "grad_norm": 6.872764587402344, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8507180213928223, "num_tokens": 437442583.0, "step": 11465 }, { "epoch": 1.458593054318789, "ewc_loss": 0.057615313678979874, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026365314261056483, "grad_norm": 6.8579607009887695, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8631902933120728, "num_tokens": 437485682.0, "step": 11466 }, { "epoch": 1.4587202645973796, "ewc_loss": 0.05759800970554352, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026348012033849955, "grad_norm": 6.852108955383301, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8541842699050903, "num_tokens": 437524859.0, "step": 11467 }, { "epoch": 1.45884747487597, "ewc_loss": 0.0576544813811779, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026404482196085155, "grad_norm": 6.867175102233887, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8559914827346802, "num_tokens": 437563590.0, "step": 11468 }, { "epoch": 1.4589746851545604, "ewc_loss": 0.05764281377196312, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002639281447045505, "grad_norm": 6.815262317657471, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8803945779800415, "num_tokens": 437601429.0, "step": 11469 }, { "epoch": 1.459101895433151, "ewc_loss": 0.05769192427396774, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026441924273967743, "grad_norm": 6.905966281890869, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8517107367515564, "num_tokens": 437639144.0, "step": 11470 }, { "epoch": 1.4592291057117415, "ewc_loss": 0.05755946785211563, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026309466920793056, "grad_norm": 6.779283046722412, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8722190260887146, "num_tokens": 437680846.0, "step": 11471 }, { "epoch": 1.459356315990332, "ewc_loss": 0.05770251154899597, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000264525122474879, "grad_norm": 6.814539432525635, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8650635480880737, "num_tokens": 437722156.0, "step": 11472 }, { "epoch": 1.4594835262689225, "ewc_loss": 0.05761798471212387, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002636798599269241, "grad_norm": 6.821113586425781, "learning_rate": 1e-06, "loss": 0.5371, "mean_token_accuracy": 0.8410735130310059, "num_tokens": 437766905.0, "step": 11473 }, { "epoch": 1.459610736547513, "ewc_loss": 0.05778369680047035, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002653369738254696, "grad_norm": 6.907987594604492, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8643712401390076, "num_tokens": 437801425.0, "step": 11474 }, { "epoch": 1.4597379468261036, "ewc_loss": 0.057692721486091614, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002644271880853921, "grad_norm": 6.816927433013916, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8541361093521118, "num_tokens": 437838126.0, "step": 11475 }, { "epoch": 1.4598651571046941, "ewc_loss": 0.05781446024775505, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002656446013133973, "grad_norm": 6.874258995056152, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8584738373756409, "num_tokens": 437876597.0, "step": 11476 }, { "epoch": 1.4599923673832846, "ewc_loss": 0.05773504078388214, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002648503868840635, "grad_norm": 6.871542930603027, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8752741813659668, "num_tokens": 437911852.0, "step": 11477 }, { "epoch": 1.460119577661875, "ewc_loss": 0.05775420367717743, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002650420356076211, "grad_norm": 6.848062515258789, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8650586605072021, "num_tokens": 437953418.0, "step": 11478 }, { "epoch": 1.4602467879404655, "ewc_loss": 0.057725004851818085, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002647500659804791, "grad_norm": 6.835449695587158, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8699283599853516, "num_tokens": 437989561.0, "step": 11479 }, { "epoch": 1.460373998219056, "ewc_loss": 0.05772818624973297, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026478187646716833, "grad_norm": 6.840151786804199, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8729776740074158, "num_tokens": 438033464.0, "step": 11480 }, { "epoch": 1.4605012084976465, "ewc_loss": 0.05768971145153046, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002643971238285303, "grad_norm": 6.873587131500244, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8679097890853882, "num_tokens": 438068676.0, "step": 11481 }, { "epoch": 1.460628418776237, "ewc_loss": 0.057769306004047394, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026519305538386106, "grad_norm": 6.881178379058838, "learning_rate": 1e-06, "loss": 0.5177, "mean_token_accuracy": 0.8464885950088501, "num_tokens": 438107867.0, "step": 11482 }, { "epoch": 1.4607556290548276, "ewc_loss": 0.057677969336509705, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002642796898726374, "grad_norm": 6.812252521514893, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8800568580627441, "num_tokens": 438144256.0, "step": 11483 }, { "epoch": 1.4608828393334181, "ewc_loss": 0.05772105231881142, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002647105138748884, "grad_norm": 6.849742412567139, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8733159303665161, "num_tokens": 438183458.0, "step": 11484 }, { "epoch": 1.4610100496120086, "ewc_loss": 0.05770723521709442, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026457232888787985, "grad_norm": 6.93182897567749, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8521272540092468, "num_tokens": 438216025.0, "step": 11485 }, { "epoch": 1.4611372598905992, "ewc_loss": 0.05770014226436615, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026450143195688725, "grad_norm": 6.832663059234619, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8744006156921387, "num_tokens": 438258185.0, "step": 11486 }, { "epoch": 1.4612644701691897, "ewc_loss": 0.0577203631401062, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002647036162670702, "grad_norm": 6.853062629699707, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8595767021179199, "num_tokens": 438298189.0, "step": 11487 }, { "epoch": 1.4613916804477802, "ewc_loss": 0.05760207027196884, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026352069107815623, "grad_norm": 6.796924114227295, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8532265424728394, "num_tokens": 438336593.0, "step": 11488 }, { "epoch": 1.4615188907263708, "ewc_loss": 0.05781581252813339, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026565813459455967, "grad_norm": 6.84478235244751, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8700835108757019, "num_tokens": 438374110.0, "step": 11489 }, { "epoch": 1.4616461010049613, "ewc_loss": 0.057705819606781006, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002645581844262779, "grad_norm": 6.840559482574463, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8470321893692017, "num_tokens": 438413062.0, "step": 11490 }, { "epoch": 1.4617733112835518, "ewc_loss": 0.05777304619550705, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026523045380599797, "grad_norm": 6.911139011383057, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8664678335189819, "num_tokens": 438452055.0, "step": 11491 }, { "epoch": 1.4619005215621423, "ewc_loss": 0.05771263688802719, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026462634559720755, "grad_norm": 6.841290473937988, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8662428855895996, "num_tokens": 438494224.0, "step": 11492 }, { "epoch": 1.4620277318407329, "ewc_loss": 0.057757169008255005, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002650716924108565, "grad_norm": 6.856746673583984, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8605471849441528, "num_tokens": 438529790.0, "step": 11493 }, { "epoch": 1.4621549421193232, "ewc_loss": 0.057743869721889496, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026493871700949967, "grad_norm": 6.860041618347168, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8670909404754639, "num_tokens": 438571371.0, "step": 11494 }, { "epoch": 1.4622821523979137, "ewc_loss": 0.05772440880537033, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002647440996952355, "grad_norm": 6.831326484680176, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8505643606185913, "num_tokens": 438612383.0, "step": 11495 }, { "epoch": 1.4624093626765042, "ewc_loss": 0.05775880068540573, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026508799055591226, "grad_norm": 6.850492000579834, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.854193389415741, "num_tokens": 438653443.0, "step": 11496 }, { "epoch": 1.4625365729550948, "ewc_loss": 0.05776131898164749, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026511316536925733, "grad_norm": 6.886671543121338, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8515375256538391, "num_tokens": 438689565.0, "step": 11497 }, { "epoch": 1.4626637832336853, "ewc_loss": 0.057822085916996, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002657208824530244, "grad_norm": 6.870413303375244, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8640949130058289, "num_tokens": 438727753.0, "step": 11498 }, { "epoch": 1.4627909935122758, "ewc_loss": 0.05778481438755989, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000265348149696365, "grad_norm": 6.883874893188477, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8618967533111572, "num_tokens": 438764667.0, "step": 11499 }, { "epoch": 1.4629182037908663, "ewc_loss": 0.057798631489276886, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002654863055795431, "grad_norm": 6.867593765258789, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.869982898235321, "num_tokens": 438801739.0, "step": 11500 }, { "epoch": 1.4630454140694569, "ewc_loss": 0.05780146270990372, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026551465271040797, "grad_norm": 6.848654270172119, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8653374910354614, "num_tokens": 438840028.0, "step": 11501 }, { "epoch": 1.4631726243480474, "ewc_loss": 0.057821352034807205, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026571351918391883, "grad_norm": 6.965447425842285, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8621689677238464, "num_tokens": 438880632.0, "step": 11502 }, { "epoch": 1.4632998346266377, "ewc_loss": 0.057658061385154724, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026408061967231333, "grad_norm": 6.817769527435303, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8605179786682129, "num_tokens": 438923040.0, "step": 11503 }, { "epoch": 1.4634270449052282, "ewc_loss": 0.05780532956123352, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026555327349342406, "grad_norm": 6.872805595397949, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8646561503410339, "num_tokens": 438958597.0, "step": 11504 }, { "epoch": 1.4635542551838188, "ewc_loss": 0.05775491148233414, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002650491369422525, "grad_norm": 6.866980075836182, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8515667915344238, "num_tokens": 438997909.0, "step": 11505 }, { "epoch": 1.4636814654624093, "ewc_loss": 0.05780977010726929, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002655977150425315, "grad_norm": 6.892408847808838, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8715450763702393, "num_tokens": 439030084.0, "step": 11506 }, { "epoch": 1.4638086757409998, "ewc_loss": 0.057813458144664764, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002656345604918897, "grad_norm": 6.873606204986572, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8672598004341125, "num_tokens": 439067770.0, "step": 11507 }, { "epoch": 1.4639358860195903, "ewc_loss": 0.05782116949558258, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002657117147464305, "grad_norm": 6.905581951141357, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8666746616363525, "num_tokens": 439102476.0, "step": 11508 }, { "epoch": 1.4640630962981809, "ewc_loss": 0.05780305340886116, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026553054340183735, "grad_norm": 6.839620590209961, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8737101554870605, "num_tokens": 439138660.0, "step": 11509 }, { "epoch": 1.4641903065767714, "ewc_loss": 0.057902902364730835, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026652903761714697, "grad_norm": 6.877696990966797, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8715830445289612, "num_tokens": 439181690.0, "step": 11510 }, { "epoch": 1.464317516855362, "ewc_loss": 0.057804353535175323, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002655435528140515, "grad_norm": 6.900075912475586, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8690754771232605, "num_tokens": 439218825.0, "step": 11511 }, { "epoch": 1.4644447271339525, "ewc_loss": 0.05808991193771362, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026595770032145083, "grad_norm": 6.865993022918701, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8692849278450012, "num_tokens": 439253505.0, "step": 11512 }, { "epoch": 1.464571937412543, "ewc_loss": 0.05814607813954353, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026651937514543533, "grad_norm": 6.907393932342529, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8584307432174683, "num_tokens": 439292764.0, "step": 11513 }, { "epoch": 1.4646991476911335, "ewc_loss": 0.057989344000816345, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026495204656384885, "grad_norm": 6.873518943786621, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8733797073364258, "num_tokens": 439335269.0, "step": 11514 }, { "epoch": 1.464826357969724, "ewc_loss": 0.05789683014154434, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026646829792298377, "grad_norm": 6.935427188873291, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8654821515083313, "num_tokens": 439371977.0, "step": 11515 }, { "epoch": 1.4649535682483146, "ewc_loss": 0.05775956064462662, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002650956157594919, "grad_norm": 6.861015796661377, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8646064400672913, "num_tokens": 439412710.0, "step": 11516 }, { "epoch": 1.465080778526905, "ewc_loss": 0.058022640645504, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026528502348810434, "grad_norm": 6.887165069580078, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8627010583877563, "num_tokens": 439449637.0, "step": 11517 }, { "epoch": 1.4652079888054954, "ewc_loss": 0.05779403820633888, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002654403797350824, "grad_norm": 6.835853576660156, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8583637475967407, "num_tokens": 439495143.0, "step": 11518 }, { "epoch": 1.465335199084086, "ewc_loss": 0.05779334157705307, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026543339481577277, "grad_norm": 6.958427906036377, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8516001105308533, "num_tokens": 439532721.0, "step": 11519 }, { "epoch": 1.4654624093626765, "ewc_loss": 0.05770717188715935, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026457171770744026, "grad_norm": 6.794135570526123, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.873867392539978, "num_tokens": 439567938.0, "step": 11520 }, { "epoch": 1.465589619641267, "ewc_loss": 0.0579107403755188, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026660741423256695, "grad_norm": 7.028936386108398, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8723831176757812, "num_tokens": 439595720.0, "step": 11521 }, { "epoch": 1.4657168299198575, "ewc_loss": 0.057633526623249054, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002638352452777326, "grad_norm": 6.774827480316162, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8644657135009766, "num_tokens": 439634091.0, "step": 11522 }, { "epoch": 1.465844040198448, "ewc_loss": 0.05822339653968811, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002672925766091794, "grad_norm": 6.941146373748779, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8526574373245239, "num_tokens": 439671978.0, "step": 11523 }, { "epoch": 1.4659712504770386, "ewc_loss": 0.05786379799246788, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002636965655256063, "grad_norm": 6.821247577667236, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8591722249984741, "num_tokens": 439707748.0, "step": 11524 }, { "epoch": 1.466098460755629, "ewc_loss": 0.05820590630173683, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002671176625881344, "grad_norm": 6.88508939743042, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8718405961990356, "num_tokens": 439744060.0, "step": 11525 }, { "epoch": 1.4662256710342196, "ewc_loss": 0.05778096243739128, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002653096162248403, "grad_norm": 6.804791450500488, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8577613830566406, "num_tokens": 439789277.0, "step": 11526 }, { "epoch": 1.46635288131281, "ewc_loss": 0.057956598699092865, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026706597418524325, "grad_norm": 6.8588151931762695, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8809711933135986, "num_tokens": 439827524.0, "step": 11527 }, { "epoch": 1.4664800915914005, "ewc_loss": 0.058123134076595306, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002662899496499449, "grad_norm": 6.879544734954834, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8606552481651306, "num_tokens": 439865181.0, "step": 11528 }, { "epoch": 1.466607301869991, "ewc_loss": 0.05790122225880623, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000266512215603143, "grad_norm": 6.927577495574951, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.856579065322876, "num_tokens": 439900071.0, "step": 11529 }, { "epoch": 1.4667345121485815, "ewc_loss": 0.057834576815366745, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026584576698951423, "grad_norm": 6.8270063400268555, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8622914552688599, "num_tokens": 439940839.0, "step": 11530 }, { "epoch": 1.466861722427172, "ewc_loss": 0.057908378541469574, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002665837819222361, "grad_norm": 6.892349720001221, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8599239587783813, "num_tokens": 439978908.0, "step": 11531 }, { "epoch": 1.4669889327057626, "ewc_loss": 0.057816676795482635, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026566674932837486, "grad_norm": 6.836101531982422, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8621220588684082, "num_tokens": 440013448.0, "step": 11532 }, { "epoch": 1.467116142984353, "ewc_loss": 0.05786373093724251, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026613730005919933, "grad_norm": 6.892388343811035, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8670376539230347, "num_tokens": 440048377.0, "step": 11533 }, { "epoch": 1.4672433532629436, "ewc_loss": 0.05781060457229614, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002656060387380421, "grad_norm": 6.858314037322998, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8581569790840149, "num_tokens": 440089806.0, "step": 11534 }, { "epoch": 1.4673705635415342, "ewc_loss": 0.05785123258829117, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002660123282112181, "grad_norm": 6.941293716430664, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8691529035568237, "num_tokens": 440124209.0, "step": 11535 }, { "epoch": 1.4674977738201247, "ewc_loss": 0.05772240087389946, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026472401805222034, "grad_norm": 6.8813018798828125, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8661361932754517, "num_tokens": 440158349.0, "step": 11536 }, { "epoch": 1.4676249840987152, "ewc_loss": 0.05807304009795189, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026578898541629314, "grad_norm": 6.927230358123779, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8651612401008606, "num_tokens": 440195220.0, "step": 11537 }, { "epoch": 1.4677521943773058, "ewc_loss": 0.05791127681732178, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026417133631184697, "grad_norm": 6.804276943206787, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8637903928756714, "num_tokens": 440235158.0, "step": 11538 }, { "epoch": 1.4678794046558963, "ewc_loss": 0.05806171894073486, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026567577151581645, "grad_norm": 6.959310531616211, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8582058548927307, "num_tokens": 440271879.0, "step": 11539 }, { "epoch": 1.4680066149344868, "ewc_loss": 0.05790391564369202, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026409776182845235, "grad_norm": 6.841328144073486, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8622403144836426, "num_tokens": 440306454.0, "step": 11540 }, { "epoch": 1.4681338252130773, "ewc_loss": 0.058037202805280685, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002654306299518794, "grad_norm": 6.906622409820557, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8529744744300842, "num_tokens": 440341185.0, "step": 11541 }, { "epoch": 1.4682610354916679, "ewc_loss": 0.057947635650634766, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026453493046574295, "grad_norm": 6.859739303588867, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8627972602844238, "num_tokens": 440379168.0, "step": 11542 }, { "epoch": 1.4683882457702582, "ewc_loss": 0.057992033660411835, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002649789385031909, "grad_norm": 6.841109752655029, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8664040565490723, "num_tokens": 440417945.0, "step": 11543 }, { "epoch": 1.4685154560488487, "ewc_loss": 0.058032698929309845, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002653856063261628, "grad_norm": 6.813853740692139, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8487368822097778, "num_tokens": 440457864.0, "step": 11544 }, { "epoch": 1.4686426663274392, "ewc_loss": 0.05807190388441086, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026577760581858456, "grad_norm": 6.923208236694336, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8589783310890198, "num_tokens": 440491709.0, "step": 11545 }, { "epoch": 1.4687698766060298, "ewc_loss": 0.05798894912004471, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002649480593390763, "grad_norm": 6.809685707092285, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8694568276405334, "num_tokens": 440533570.0, "step": 11546 }, { "epoch": 1.4688970868846203, "ewc_loss": 0.05812506377696991, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026630921638570726, "grad_norm": 6.887484550476074, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8689770698547363, "num_tokens": 440568360.0, "step": 11547 }, { "epoch": 1.4690242971632108, "ewc_loss": 0.057983286678791046, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026489145238883793, "grad_norm": 6.870075225830078, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8744128346443176, "num_tokens": 440606848.0, "step": 11548 }, { "epoch": 1.4691515074418013, "ewc_loss": 0.05807575210928917, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002658161101862788, "grad_norm": 6.941183567047119, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8763858675956726, "num_tokens": 440638254.0, "step": 11549 }, { "epoch": 1.4692787177203919, "ewc_loss": 0.057969167828559875, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002647502697072923, "grad_norm": 6.86265754699707, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8703609108924866, "num_tokens": 440674122.0, "step": 11550 }, { "epoch": 1.4694059279989824, "ewc_loss": 0.05796317756175995, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002646903449203819, "grad_norm": 6.809438705444336, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8779812455177307, "num_tokens": 440718362.0, "step": 11551 }, { "epoch": 1.4695331382775727, "ewc_loss": 0.057984963059425354, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000264908216195181, "grad_norm": 6.856058120727539, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8703018426895142, "num_tokens": 440755971.0, "step": 11552 }, { "epoch": 1.4696603485561632, "ewc_loss": 0.057940542697906494, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002644640044309199, "grad_norm": 6.848365783691406, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8614859580993652, "num_tokens": 440794328.0, "step": 11553 }, { "epoch": 1.4697875588347538, "ewc_loss": 0.058029502630233765, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026535362121649086, "grad_norm": 6.904842853546143, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8516364097595215, "num_tokens": 440833383.0, "step": 11554 }, { "epoch": 1.4699147691133443, "ewc_loss": 0.05801211670041084, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026517975493334234, "grad_norm": 6.872334957122803, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8705016374588013, "num_tokens": 440875502.0, "step": 11555 }, { "epoch": 1.4700419793919348, "ewc_loss": 0.05770900100469589, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026459002401679754, "grad_norm": 6.875288486480713, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8679108619689941, "num_tokens": 440915259.0, "step": 11556 }, { "epoch": 1.4701691896705253, "ewc_loss": 0.05791179835796356, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002641765750013292, "grad_norm": 6.870081424713135, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8615151047706604, "num_tokens": 440951660.0, "step": 11557 }, { "epoch": 1.4702963999491159, "ewc_loss": 0.057977546006441116, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026483405963517725, "grad_norm": 6.842955112457275, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8639386892318726, "num_tokens": 440992533.0, "step": 11558 }, { "epoch": 1.4704236102277064, "ewc_loss": 0.05797150731086731, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002647736400831491, "grad_norm": 6.876712799072266, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8527450561523438, "num_tokens": 441027360.0, "step": 11559 }, { "epoch": 1.470550820506297, "ewc_loss": 0.05797765403985977, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026483513647690415, "grad_norm": 6.8498125076293945, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8705165982246399, "num_tokens": 441071746.0, "step": 11560 }, { "epoch": 1.4706780307848875, "ewc_loss": 0.05805923044681549, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026565088774077594, "grad_norm": 6.913476467132568, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8629852533340454, "num_tokens": 441105378.0, "step": 11561 }, { "epoch": 1.470805241063478, "ewc_loss": 0.05793764442205429, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000264435017015785, "grad_norm": 6.852162837982178, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.864544153213501, "num_tokens": 441150000.0, "step": 11562 }, { "epoch": 1.4709324513420685, "ewc_loss": 0.05812033265829086, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002662618935573846, "grad_norm": 6.883012294769287, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8574492335319519, "num_tokens": 441188426.0, "step": 11563 }, { "epoch": 1.471059661620659, "ewc_loss": 0.058009322732686996, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002651518152561039, "grad_norm": 6.916415691375732, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.871105968952179, "num_tokens": 441226028.0, "step": 11564 }, { "epoch": 1.4711868718992496, "ewc_loss": 0.0580107718706131, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002651663089636713, "grad_norm": 6.907793998718262, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8651093244552612, "num_tokens": 441265788.0, "step": 11565 }, { "epoch": 1.47131408217784, "ewc_loss": 0.05804690718650818, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002655276912264526, "grad_norm": 6.899515628814697, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8723975419998169, "num_tokens": 441297280.0, "step": 11566 }, { "epoch": 1.4714412924564304, "ewc_loss": 0.05799907445907593, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002650493406690657, "grad_norm": 6.910688400268555, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8644613027572632, "num_tokens": 441331229.0, "step": 11567 }, { "epoch": 1.471568502735021, "ewc_loss": 0.05799746513366699, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026503324625082314, "grad_norm": 6.8328118324279785, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8675781488418579, "num_tokens": 441371064.0, "step": 11568 }, { "epoch": 1.4716957130136115, "ewc_loss": 0.05805013328790665, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002655599091667682, "grad_norm": 6.945997714996338, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8454673886299133, "num_tokens": 441405706.0, "step": 11569 }, { "epoch": 1.471822923292202, "ewc_loss": 0.05768687278032303, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026436871849000454, "grad_norm": 6.836179733276367, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8552918434143066, "num_tokens": 441445966.0, "step": 11570 }, { "epoch": 1.4719501335707925, "ewc_loss": 0.05786791443824768, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026617912226356566, "grad_norm": 6.930247783660889, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.849164605140686, "num_tokens": 441478916.0, "step": 11571 }, { "epoch": 1.472077343849383, "ewc_loss": 0.057718344032764435, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026468344731256366, "grad_norm": 6.919055938720703, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8602840900421143, "num_tokens": 441519245.0, "step": 11572 }, { "epoch": 1.4722045541279736, "ewc_loss": 0.05772092565894127, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002647092624101788, "grad_norm": 6.869121551513672, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8572211265563965, "num_tokens": 441557485.0, "step": 11573 }, { "epoch": 1.472331764406564, "ewc_loss": 0.05806849151849747, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002657435252331197, "grad_norm": 6.953736782073975, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8744983673095703, "num_tokens": 441596042.0, "step": 11574 }, { "epoch": 1.4724589746851546, "ewc_loss": 0.05799590051174164, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002650175883900374, "grad_norm": 6.915227890014648, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8743258118629456, "num_tokens": 441634391.0, "step": 11575 }, { "epoch": 1.472586184963745, "ewc_loss": 0.058033719658851624, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026539579266682267, "grad_norm": 6.885770797729492, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8562945127487183, "num_tokens": 441672201.0, "step": 11576 }, { "epoch": 1.4727133952423355, "ewc_loss": 0.05794268101453781, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002644853957463056, "grad_norm": 7.097870826721191, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8620094060897827, "num_tokens": 441713350.0, "step": 11577 }, { "epoch": 1.472840605520926, "ewc_loss": 0.057800643146038055, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026306501240469515, "grad_norm": 6.8164777755737305, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8621313571929932, "num_tokens": 441754677.0, "step": 11578 }, { "epoch": 1.4729678157995165, "ewc_loss": 0.058017030358314514, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026522891130298376, "grad_norm": 6.893000602722168, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8527557849884033, "num_tokens": 441797037.0, "step": 11579 }, { "epoch": 1.473095026078107, "ewc_loss": 0.057844243943691254, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026350104599259794, "grad_norm": 6.890986919403076, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8614296913146973, "num_tokens": 441831661.0, "step": 11580 }, { "epoch": 1.4732222363566976, "ewc_loss": 0.05806243047118187, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026568290195427835, "grad_norm": 6.92128849029541, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8676726222038269, "num_tokens": 441876933.0, "step": 11581 }, { "epoch": 1.473349446635288, "ewc_loss": 0.05794977396726608, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002645563508849591, "grad_norm": 6.859803676605225, "learning_rate": 1e-06, "loss": 0.3986, "mean_token_accuracy": 0.885516881942749, "num_tokens": 441918015.0, "step": 11582 }, { "epoch": 1.4734766569138786, "ewc_loss": 0.057900652289390564, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026406513643451035, "grad_norm": 6.841518402099609, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8566291332244873, "num_tokens": 441956573.0, "step": 11583 }, { "epoch": 1.4736038671924692, "ewc_loss": 0.057828933000564575, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026578930555842817, "grad_norm": 6.901895046234131, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8772466778755188, "num_tokens": 442002064.0, "step": 11584 }, { "epoch": 1.4737310774710597, "ewc_loss": 0.058014657348394394, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026520516257733107, "grad_norm": 6.882625579833984, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8706042766571045, "num_tokens": 442040122.0, "step": 11585 }, { "epoch": 1.4738582877496502, "ewc_loss": 0.057729341089725494, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002647934015840292, "grad_norm": 6.9143757820129395, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8667205572128296, "num_tokens": 442079698.0, "step": 11586 }, { "epoch": 1.4739854980282407, "ewc_loss": 0.057736534625291824, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026486534625291824, "grad_norm": 6.890408515930176, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8770626187324524, "num_tokens": 442118252.0, "step": 11587 }, { "epoch": 1.4741127083068313, "ewc_loss": 0.057811059057712555, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026561057893559337, "grad_norm": 6.956654071807861, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8682562112808228, "num_tokens": 442154675.0, "step": 11588 }, { "epoch": 1.4742399185854218, "ewc_loss": 0.057721056044101715, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026471057208254933, "grad_norm": 6.852133274078369, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8707822561264038, "num_tokens": 442189449.0, "step": 11589 }, { "epoch": 1.4743671288640123, "ewc_loss": 0.05781523510813713, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002656523429322988, "grad_norm": 6.963310241699219, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8669255971908569, "num_tokens": 442222844.0, "step": 11590 }, { "epoch": 1.4744943391426026, "ewc_loss": 0.05764882639050484, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002639882732182741, "grad_norm": 6.8195977210998535, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8710143566131592, "num_tokens": 442258599.0, "step": 11591 }, { "epoch": 1.4746215494211932, "ewc_loss": 0.057892583310604095, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026642580633051693, "grad_norm": 6.929509162902832, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8716603517532349, "num_tokens": 442294775.0, "step": 11592 }, { "epoch": 1.4747487596997837, "ewc_loss": 0.057611532509326935, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002636153076309711, "grad_norm": 6.862870693206787, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8751980066299438, "num_tokens": 442330313.0, "step": 11593 }, { "epoch": 1.4748759699783742, "ewc_loss": 0.05783461779356003, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002658462035469711, "grad_norm": 7.0017523765563965, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8638816475868225, "num_tokens": 442364988.0, "step": 11594 }, { "epoch": 1.4750031802569648, "ewc_loss": 0.057654500007629395, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026404502568766475, "grad_norm": 6.843963146209717, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.865290105342865, "num_tokens": 442406927.0, "step": 11595 }, { "epoch": 1.4751303905355553, "ewc_loss": 0.05779726058244705, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000265472597675398, "grad_norm": 6.984207630157471, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8677518963813782, "num_tokens": 442442092.0, "step": 11596 }, { "epoch": 1.4752576008141458, "ewc_loss": 0.057563416659832, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026313416310586035, "grad_norm": 6.839660167694092, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8721413612365723, "num_tokens": 442483013.0, "step": 11597 }, { "epoch": 1.4753848110927363, "ewc_loss": 0.057899415493011475, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002664941712282598, "grad_norm": 6.868427753448486, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8694614171981812, "num_tokens": 442525052.0, "step": 11598 }, { "epoch": 1.4755120213713269, "ewc_loss": 0.057701654732227325, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002645165368448943, "grad_norm": 7.323516845703125, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.856148898601532, "num_tokens": 442562543.0, "step": 11599 }, { "epoch": 1.4756392316499174, "ewc_loss": 0.05748921260237694, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026239213184453547, "grad_norm": 6.708805561065674, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8689828515052795, "num_tokens": 442604976.0, "step": 11600 }, { "epoch": 1.4757664419285077, "ewc_loss": 0.058081500232219696, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002683150232769549, "grad_norm": 7.046635627746582, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8766031861305237, "num_tokens": 442642271.0, "step": 11601 }, { "epoch": 1.4758936522070982, "ewc_loss": 0.05745363235473633, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026203630841337144, "grad_norm": 6.819077014923096, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8607314229011536, "num_tokens": 442680508.0, "step": 11602 }, { "epoch": 1.4760208624856888, "ewc_loss": 0.058022767305374146, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026772767887450755, "grad_norm": 6.989827632904053, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8542090654373169, "num_tokens": 442718391.0, "step": 11603 }, { "epoch": 1.4761480727642793, "ewc_loss": 0.05759535729885101, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000263453577645123, "grad_norm": 6.77783727645874, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8746562004089355, "num_tokens": 442757688.0, "step": 11604 }, { "epoch": 1.4762752830428698, "ewc_loss": 0.05797556787729263, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026725567295216024, "grad_norm": 6.958810806274414, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.877874493598938, "num_tokens": 442796704.0, "step": 11605 }, { "epoch": 1.4764024933214603, "ewc_loss": 0.057774968445301056, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026524969143792987, "grad_norm": 6.79891300201416, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8754609823226929, "num_tokens": 442838791.0, "step": 11606 }, { "epoch": 1.4765297036000509, "ewc_loss": 0.05793368071317673, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026683681062422693, "grad_norm": 6.903848648071289, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8658413290977478, "num_tokens": 442882368.0, "step": 11607 }, { "epoch": 1.4766569138786414, "ewc_loss": 0.057836201041936874, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002658620069269091, "grad_norm": 6.897818565368652, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8657736778259277, "num_tokens": 442923768.0, "step": 11608 }, { "epoch": 1.476784124157232, "ewc_loss": 0.05797599256038666, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002672599221114069, "grad_norm": 6.930386066436768, "learning_rate": 1e-06, "loss": 0.3962, "mean_token_accuracy": 0.8797472715377808, "num_tokens": 442955656.0, "step": 11609 }, { "epoch": 1.4769113344358225, "ewc_loss": 0.05816863104701042, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026674490072764456, "grad_norm": 6.9235968589782715, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.855553388595581, "num_tokens": 442997323.0, "step": 11610 }, { "epoch": 1.477038544714413, "ewc_loss": 0.05785336345434189, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026603363221511245, "grad_norm": 6.924587726593018, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8512883186340332, "num_tokens": 443033454.0, "step": 11611 }, { "epoch": 1.4771657549930035, "ewc_loss": 0.05781812220811844, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002656812430359423, "grad_norm": 6.800948143005371, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8643485307693481, "num_tokens": 443077902.0, "step": 11612 }, { "epoch": 1.477292965271594, "ewc_loss": 0.05803772062063217, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002678771852515638, "grad_norm": 7.003901958465576, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8562995195388794, "num_tokens": 443116515.0, "step": 11613 }, { "epoch": 1.4774201755501846, "ewc_loss": 0.05802041292190552, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026526270085014403, "grad_norm": 6.842936992645264, "learning_rate": 1e-06, "loss": 0.3673, "mean_token_accuracy": 0.8900823593139648, "num_tokens": 443154124.0, "step": 11614 }, { "epoch": 1.477547385828775, "ewc_loss": 0.05824676901102066, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026752628036774695, "grad_norm": 6.927571773529053, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8739755153656006, "num_tokens": 443195852.0, "step": 11615 }, { "epoch": 1.4776745961073654, "ewc_loss": 0.05809498950839043, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026600848650559783, "grad_norm": 6.960724830627441, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8566184639930725, "num_tokens": 443229850.0, "step": 11616 }, { "epoch": 1.477801806385956, "ewc_loss": 0.05814206227660179, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026647921185940504, "grad_norm": 6.964755058288574, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8577033281326294, "num_tokens": 443274481.0, "step": 11617 }, { "epoch": 1.4779290166645465, "ewc_loss": 0.058045849204063416, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002655170683283359, "grad_norm": 6.899716854095459, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8710579872131348, "num_tokens": 443311737.0, "step": 11618 }, { "epoch": 1.478056226943137, "ewc_loss": 0.05789264291524887, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000266426446614787, "grad_norm": 6.981772422790527, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8575179576873779, "num_tokens": 443345310.0, "step": 11619 }, { "epoch": 1.4781834372217275, "ewc_loss": 0.05782078951597214, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002657079021446407, "grad_norm": 6.951071262359619, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8632551431655884, "num_tokens": 443392869.0, "step": 11620 }, { "epoch": 1.478310647500318, "ewc_loss": 0.05785292387008667, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002660292375367135, "grad_norm": 6.9381489753723145, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8625260591506958, "num_tokens": 443432842.0, "step": 11621 }, { "epoch": 1.4784378577789086, "ewc_loss": 0.05780516192317009, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000265551614575088, "grad_norm": 6.904558181762695, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8667380213737488, "num_tokens": 443465954.0, "step": 11622 }, { "epoch": 1.478565068057499, "ewc_loss": 0.05809463933110237, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000266004994045943, "grad_norm": 6.862330436706543, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8551406860351562, "num_tokens": 443508506.0, "step": 11623 }, { "epoch": 1.4786922783360896, "ewc_loss": 0.05785217136144638, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026602172874845564, "grad_norm": 6.930631160736084, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8629297018051147, "num_tokens": 443543575.0, "step": 11624 }, { "epoch": 1.47881948861468, "ewc_loss": 0.058079443871974945, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026585301384329796, "grad_norm": 6.932181358337402, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8675467371940613, "num_tokens": 443578504.0, "step": 11625 }, { "epoch": 1.4789466988932705, "ewc_loss": 0.058153145015239716, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002665900392457843, "grad_norm": 6.912307262420654, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8777744770050049, "num_tokens": 443614647.0, "step": 11626 }, { "epoch": 1.479073909171861, "ewc_loss": 0.05806099623441696, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026566858286969364, "grad_norm": 6.929527759552002, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8612390756607056, "num_tokens": 443649244.0, "step": 11627 }, { "epoch": 1.4792011194504515, "ewc_loss": 0.05791144818067551, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002666144573595375, "grad_norm": 6.953786849975586, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8609431982040405, "num_tokens": 443688374.0, "step": 11628 }, { "epoch": 1.479328329729042, "ewc_loss": 0.05783090740442276, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002658090670593083, "grad_norm": 6.8590264320373535, "learning_rate": 1e-06, "loss": 0.5526, "mean_token_accuracy": 0.8354292511940002, "num_tokens": 443730929.0, "step": 11629 }, { "epoch": 1.4794555400076326, "ewc_loss": 0.05793903023004532, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002668903034646064, "grad_norm": 6.950226783752441, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8575569987297058, "num_tokens": 443763067.0, "step": 11630 }, { "epoch": 1.479582750286223, "ewc_loss": 0.05810973793268204, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002661559556145221, "grad_norm": 6.893459796905518, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8522495031356812, "num_tokens": 443801273.0, "step": 11631 }, { "epoch": 1.4797099605648136, "ewc_loss": 0.05825746804475784, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026763329515233636, "grad_norm": 6.959662914276123, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8526773452758789, "num_tokens": 443840394.0, "step": 11632 }, { "epoch": 1.4798371708434042, "ewc_loss": 0.05809667706489563, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002660253376234323, "grad_norm": 6.867928981781006, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.860096275806427, "num_tokens": 443877486.0, "step": 11633 }, { "epoch": 1.4799643811219947, "ewc_loss": 0.05805592983961105, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026805928791873157, "grad_norm": 6.963074207305908, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8635183572769165, "num_tokens": 443913763.0, "step": 11634 }, { "epoch": 1.4800915914005852, "ewc_loss": 0.05804796516895294, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002655382268130779, "grad_norm": 6.878002643585205, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8729051351547241, "num_tokens": 443946840.0, "step": 11635 }, { "epoch": 1.4802188016791757, "ewc_loss": 0.05820779502391815, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026713652187027037, "grad_norm": 6.878896713256836, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8494021892547607, "num_tokens": 443992439.0, "step": 11636 }, { "epoch": 1.4803460119577663, "ewc_loss": 0.05815950036048889, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026665357290767133, "grad_norm": 6.87485933303833, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8737683892250061, "num_tokens": 444029348.0, "step": 11637 }, { "epoch": 1.4804732222363568, "ewc_loss": 0.058191537857055664, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002669739769771695, "grad_norm": 6.8926239013671875, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8735085725784302, "num_tokens": 444068386.0, "step": 11638 }, { "epoch": 1.4806004325149473, "ewc_loss": 0.05816216021776199, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002666801738087088, "grad_norm": 6.850986480712891, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8522759675979614, "num_tokens": 444110307.0, "step": 11639 }, { "epoch": 1.4807276427935376, "ewc_loss": 0.058227088302373886, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002673294802661985, "grad_norm": 6.888761520385742, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8641977310180664, "num_tokens": 444153919.0, "step": 11640 }, { "epoch": 1.4808548530721282, "ewc_loss": 0.05813203752040863, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026637897826731205, "grad_norm": 6.897876262664795, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8662699460983276, "num_tokens": 444190575.0, "step": 11641 }, { "epoch": 1.4809820633507187, "ewc_loss": 0.05820079147815704, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002670665271580219, "grad_norm": 6.945713520050049, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8575142621994019, "num_tokens": 444226108.0, "step": 11642 }, { "epoch": 1.4811092736293092, "ewc_loss": 0.058184631168842316, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026690488448366523, "grad_norm": 6.876127243041992, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8614975214004517, "num_tokens": 444268162.0, "step": 11643 }, { "epoch": 1.4812364839078997, "ewc_loss": 0.0581599697470665, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002666582877282053, "grad_norm": 6.848384380340576, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.850997269153595, "num_tokens": 444309273.0, "step": 11644 }, { "epoch": 1.4813636941864903, "ewc_loss": 0.05812404304742813, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026629900094121695, "grad_norm": 6.901501655578613, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8684557676315308, "num_tokens": 444338697.0, "step": 11645 }, { "epoch": 1.4814909044650808, "ewc_loss": 0.05813438072800636, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002664024068508297, "grad_norm": 6.9183220863342285, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8732648491859436, "num_tokens": 444377954.0, "step": 11646 }, { "epoch": 1.4816181147436713, "ewc_loss": 0.05795123428106308, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002670123358257115, "grad_norm": 6.850898742675781, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8652227520942688, "num_tokens": 444416996.0, "step": 11647 }, { "epoch": 1.4817453250222619, "ewc_loss": 0.05797090381383896, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002672090195119381, "grad_norm": 6.885979175567627, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8708129525184631, "num_tokens": 444455550.0, "step": 11648 }, { "epoch": 1.4818725353008524, "ewc_loss": 0.05801200121641159, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002676200238056481, "grad_norm": 6.876716613769531, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8597270250320435, "num_tokens": 444493979.0, "step": 11649 }, { "epoch": 1.4819997455794427, "ewc_loss": 0.058017998933792114, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026767997769638896, "grad_norm": 6.931856155395508, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8621459603309631, "num_tokens": 444540622.0, "step": 11650 }, { "epoch": 1.4821269558580332, "ewc_loss": 0.057983797043561935, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002673379785846919, "grad_norm": 6.921267986297607, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.861619234085083, "num_tokens": 444577772.0, "step": 11651 }, { "epoch": 1.4822541661366238, "ewc_loss": 0.05798391252756119, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026733914273791015, "grad_norm": 6.946528911590576, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8514132499694824, "num_tokens": 444615523.0, "step": 11652 }, { "epoch": 1.4823813764152143, "ewc_loss": 0.05790770798921585, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002665770589374006, "grad_norm": 6.887166976928711, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8623307943344116, "num_tokens": 444659785.0, "step": 11653 }, { "epoch": 1.4825085866938048, "ewc_loss": 0.05825532600283623, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026761184562928975, "grad_norm": 6.885012626647949, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8594160079956055, "num_tokens": 444700806.0, "step": 11654 }, { "epoch": 1.4826357969723953, "ewc_loss": 0.05793384462594986, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002668384404387325, "grad_norm": 6.904638290405273, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8654933571815491, "num_tokens": 444734320.0, "step": 11655 }, { "epoch": 1.4827630072509859, "ewc_loss": 0.058032453060150146, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026782453642226756, "grad_norm": 7.011861801147461, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8578681945800781, "num_tokens": 444770215.0, "step": 11656 }, { "epoch": 1.4828902175295764, "ewc_loss": 0.05814135819673538, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026647219783626497, "grad_norm": 6.909021377563477, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8625913858413696, "num_tokens": 444805432.0, "step": 11657 }, { "epoch": 1.483017427808167, "ewc_loss": 0.057913362979888916, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026663363678380847, "grad_norm": 7.013824939727783, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8626916408538818, "num_tokens": 444832398.0, "step": 11658 }, { "epoch": 1.4831446380867574, "ewc_loss": 0.05787703022360802, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002662703045643866, "grad_norm": 6.8561835289001465, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.86806720495224, "num_tokens": 444876128.0, "step": 11659 }, { "epoch": 1.483271848365348, "ewc_loss": 0.05825018137693405, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026756039005704224, "grad_norm": 6.897112846374512, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8587363958358765, "num_tokens": 444915188.0, "step": 11660 }, { "epoch": 1.4833990586439385, "ewc_loss": 0.058179788291454315, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002668564557097852, "grad_norm": 6.906704425811768, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8683950901031494, "num_tokens": 444953528.0, "step": 11661 }, { "epoch": 1.483526268922529, "ewc_loss": 0.05825036019086838, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026756219449453056, "grad_norm": 6.910237789154053, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8578622341156006, "num_tokens": 444996469.0, "step": 11662 }, { "epoch": 1.4836534792011196, "ewc_loss": 0.05815908685326576, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026664946926757693, "grad_norm": 6.873859405517578, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8806711435317993, "num_tokens": 445034298.0, "step": 11663 }, { "epoch": 1.48378068947971, "ewc_loss": 0.05817567929625511, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026681539020501077, "grad_norm": 6.866888999938965, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8631641864776611, "num_tokens": 445079010.0, "step": 11664 }, { "epoch": 1.4839078997583004, "ewc_loss": 0.05802340805530548, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026773408171720803, "grad_norm": 6.906527042388916, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8768817186355591, "num_tokens": 445116009.0, "step": 11665 }, { "epoch": 1.484035110036891, "ewc_loss": 0.05795524641871452, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002670524700079113, "grad_norm": 6.875775337219238, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8781530261039734, "num_tokens": 445156313.0, "step": 11666 }, { "epoch": 1.4841623203154815, "ewc_loss": 0.05800411105155945, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002675410942174494, "grad_norm": 6.977024078369141, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8641707301139832, "num_tokens": 445187424.0, "step": 11667 }, { "epoch": 1.484289530594072, "ewc_loss": 0.05791943147778511, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026669431827031076, "grad_norm": 6.968854904174805, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8649883270263672, "num_tokens": 445233448.0, "step": 11668 }, { "epoch": 1.4844167408726625, "ewc_loss": 0.05790683627128601, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000266568356892094, "grad_norm": 6.8710036277771, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8735884428024292, "num_tokens": 445273663.0, "step": 11669 }, { "epoch": 1.484543951151253, "ewc_loss": 0.05793225020170212, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026682252064347267, "grad_norm": 6.926442623138428, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.86940598487854, "num_tokens": 445313790.0, "step": 11670 }, { "epoch": 1.4846711614298436, "ewc_loss": 0.05786014720797539, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002661014732439071, "grad_norm": 6.935618877410889, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8747186660766602, "num_tokens": 445351818.0, "step": 11671 }, { "epoch": 1.484798371708434, "ewc_loss": 0.058143164962530136, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026649024221114814, "grad_norm": 6.933040142059326, "learning_rate": 1e-06, "loss": 0.5372, "mean_token_accuracy": 0.8400360345840454, "num_tokens": 445391763.0, "step": 11672 }, { "epoch": 1.4849255819870246, "ewc_loss": 0.0580698661506176, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002657572622410953, "grad_norm": 6.920660495758057, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8581568598747253, "num_tokens": 445426723.0, "step": 11673 }, { "epoch": 1.485052792265615, "ewc_loss": 0.058158762753009796, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026664623874239624, "grad_norm": 6.894665241241455, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8686556220054626, "num_tokens": 445465635.0, "step": 11674 }, { "epoch": 1.4851800025442055, "ewc_loss": 0.05794859305024147, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026698593865148723, "grad_norm": 6.922764301300049, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8642711639404297, "num_tokens": 445502819.0, "step": 11675 }, { "epoch": 1.485307212822796, "ewc_loss": 0.057908326387405396, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002665832871571183, "grad_norm": 6.950552463531494, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8746029138565063, "num_tokens": 445541861.0, "step": 11676 }, { "epoch": 1.4854344231013865, "ewc_loss": 0.057917214930057526, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002666721702553332, "grad_norm": 6.925659656524658, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8790279626846313, "num_tokens": 445578058.0, "step": 11677 }, { "epoch": 1.485561633379977, "ewc_loss": 0.05790218710899353, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002665218780748546, "grad_norm": 6.949775695800781, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8507421016693115, "num_tokens": 445612896.0, "step": 11678 }, { "epoch": 1.4856888436585676, "ewc_loss": 0.0579354465007782, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002668544475454837, "grad_norm": 6.9699883460998535, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8572096824645996, "num_tokens": 445647849.0, "step": 11679 }, { "epoch": 1.485816053937158, "ewc_loss": 0.0581609308719635, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026666789199225605, "grad_norm": 6.931684494018555, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8502677083015442, "num_tokens": 445689199.0, "step": 11680 }, { "epoch": 1.4859432642157486, "ewc_loss": 0.05791527032852173, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026665269979275763, "grad_norm": 6.953169822692871, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8630441427230835, "num_tokens": 445727869.0, "step": 11681 }, { "epoch": 1.4860704744943392, "ewc_loss": 0.05781557410955429, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026565571897663176, "grad_norm": 6.873976707458496, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8684757351875305, "num_tokens": 445770105.0, "step": 11682 }, { "epoch": 1.4861976847729297, "ewc_loss": 0.058310359716415405, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002681621699593961, "grad_norm": 6.980971336364746, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8437858819961548, "num_tokens": 445812950.0, "step": 11683 }, { "epoch": 1.4863248950515202, "ewc_loss": 0.058029741048812866, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002653560077305883, "grad_norm": 6.916438579559326, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8625392317771912, "num_tokens": 445850811.0, "step": 11684 }, { "epoch": 1.4864521053301107, "ewc_loss": 0.05827495455741882, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026780812186188996, "grad_norm": 6.9697585105896, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.866741955280304, "num_tokens": 445891967.0, "step": 11685 }, { "epoch": 1.4865793156087013, "ewc_loss": 0.058145783841609955, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002665164356585592, "grad_norm": 6.927977561950684, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8655757904052734, "num_tokens": 445927936.0, "step": 11686 }, { "epoch": 1.4867065258872918, "ewc_loss": 0.058136191219091415, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002664205094333738, "grad_norm": 6.931534767150879, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8771024942398071, "num_tokens": 445962481.0, "step": 11687 }, { "epoch": 1.4868337361658823, "ewc_loss": 0.05829998105764389, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002680583856999874, "grad_norm": 6.933015823364258, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.877974808216095, "num_tokens": 446001445.0, "step": 11688 }, { "epoch": 1.4869609464444726, "ewc_loss": 0.05812223255634308, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002662809274625033, "grad_norm": 6.962685585021973, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8840651512145996, "num_tokens": 446037904.0, "step": 11689 }, { "epoch": 1.4870881567230632, "ewc_loss": 0.05808822810649872, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002659408492036164, "grad_norm": 7.1603217124938965, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8733302354812622, "num_tokens": 446077583.0, "step": 11690 }, { "epoch": 1.4872153670016537, "ewc_loss": 0.05799300596117973, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002649886591825634, "grad_norm": 6.951093673706055, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8616602420806885, "num_tokens": 446110123.0, "step": 11691 }, { "epoch": 1.4873425772802442, "ewc_loss": 0.05809560418128967, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002660146274138242, "grad_norm": 6.981202125549316, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8625041246414185, "num_tokens": 446149832.0, "step": 11692 }, { "epoch": 1.4874697875588347, "ewc_loss": 0.057899340987205505, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002640519815031439, "grad_norm": 6.955361366271973, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8709478974342346, "num_tokens": 446184211.0, "step": 11693 }, { "epoch": 1.4875969978374253, "ewc_loss": 0.05802597105503082, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002653182891663164, "grad_norm": 6.946760654449463, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8577203154563904, "num_tokens": 446229620.0, "step": 11694 }, { "epoch": 1.4877242081160158, "ewc_loss": 0.0577894002199173, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002653940173331648, "grad_norm": 6.9429216384887695, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8663859963417053, "num_tokens": 446267134.0, "step": 11695 }, { "epoch": 1.4878514183946063, "ewc_loss": 0.05774613097310066, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002649613015819341, "grad_norm": 6.967841625213623, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8511192798614502, "num_tokens": 446303001.0, "step": 11696 }, { "epoch": 1.4879786286731969, "ewc_loss": 0.05777093768119812, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002652093826327473, "grad_norm": 6.968012809753418, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8624937534332275, "num_tokens": 446334286.0, "step": 11697 }, { "epoch": 1.4881058389517874, "ewc_loss": 0.057818904519081116, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002656890428625047, "grad_norm": 6.929655075073242, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8825150728225708, "num_tokens": 446369547.0, "step": 11698 }, { "epoch": 1.4882330492303777, "ewc_loss": 0.05785137787461281, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026601378340274096, "grad_norm": 6.974255561828613, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8489630222320557, "num_tokens": 446408179.0, "step": 11699 }, { "epoch": 1.4883602595089682, "ewc_loss": 0.05804584547877312, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002655170392245054, "grad_norm": 6.909286975860596, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8669575452804565, "num_tokens": 446448533.0, "step": 11700 }, { "epoch": 1.4884874697875587, "ewc_loss": 0.05810883641242981, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026614696253091097, "grad_norm": 6.91668701171875, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8624539375305176, "num_tokens": 446489445.0, "step": 11701 }, { "epoch": 1.4886146800661493, "ewc_loss": 0.05808788537979126, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026593744405545294, "grad_norm": 6.918518543243408, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.87704998254776, "num_tokens": 446526530.0, "step": 11702 }, { "epoch": 1.4887418903447398, "ewc_loss": 0.058264024555683136, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002676988369785249, "grad_norm": 7.034740447998047, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8655567765235901, "num_tokens": 446558315.0, "step": 11703 }, { "epoch": 1.4888691006233303, "ewc_loss": 0.058097921311855316, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002660378231666982, "grad_norm": 6.896176338195801, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8671280145645142, "num_tokens": 446598548.0, "step": 11704 }, { "epoch": 1.4889963109019209, "ewc_loss": 0.058265335857868195, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002677119628060609, "grad_norm": 7.041701793670654, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8603132367134094, "num_tokens": 446632111.0, "step": 11705 }, { "epoch": 1.4891235211805114, "ewc_loss": 0.057997506111860275, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026503365370444953, "grad_norm": 6.846551418304443, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8727116584777832, "num_tokens": 446669296.0, "step": 11706 }, { "epoch": 1.489250731459102, "ewc_loss": 0.058347731828689575, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026853589224629104, "grad_norm": 6.999028205871582, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8663128018379211, "num_tokens": 446707891.0, "step": 11707 }, { "epoch": 1.4893779417376924, "ewc_loss": 0.058166418224573135, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026672278181649745, "grad_norm": 6.914730548858643, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8526036739349365, "num_tokens": 446749455.0, "step": 11708 }, { "epoch": 1.489505152016283, "ewc_loss": 0.058233797550201416, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026739659369923174, "grad_norm": 7.0146870613098145, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8774996399879456, "num_tokens": 446787001.0, "step": 11709 }, { "epoch": 1.4896323622948735, "ewc_loss": 0.058142196387052536, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026648055063560605, "grad_norm": 6.8991594314575195, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8626719117164612, "num_tokens": 446827262.0, "step": 11710 }, { "epoch": 1.489759572573464, "ewc_loss": 0.058162786066532135, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000266686431132257, "grad_norm": 7.063201427459717, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8794184923171997, "num_tokens": 446860514.0, "step": 11711 }, { "epoch": 1.4898867828520546, "ewc_loss": 0.058005109429359436, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002651096729096025, "grad_norm": 6.891205787658691, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.85496985912323, "num_tokens": 446902605.0, "step": 11712 }, { "epoch": 1.490013993130645, "ewc_loss": 0.05822069197893143, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002672655100468546, "grad_norm": 7.027249336242676, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8706713914871216, "num_tokens": 446938638.0, "step": 11713 }, { "epoch": 1.4901412034092354, "ewc_loss": 0.0579436793923378, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026449537836015224, "grad_norm": 6.857999801635742, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8688067197799683, "num_tokens": 446979286.0, "step": 11714 }, { "epoch": 1.490268413687826, "ewc_loss": 0.058260466903448105, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002676632720977068, "grad_norm": 7.050310134887695, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.849096417427063, "num_tokens": 447017405.0, "step": 11715 }, { "epoch": 1.4903956239664164, "ewc_loss": 0.058006346225738525, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002651220711413771, "grad_norm": 6.866557598114014, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8669384717941284, "num_tokens": 447056733.0, "step": 11716 }, { "epoch": 1.490522834245007, "ewc_loss": 0.05822548270225525, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002673134149517864, "grad_norm": 6.987997531890869, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8589944839477539, "num_tokens": 447094493.0, "step": 11717 }, { "epoch": 1.4906500445235975, "ewc_loss": 0.0580768883228302, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002658274897839874, "grad_norm": 6.900020599365234, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.88011634349823, "num_tokens": 447134039.0, "step": 11718 }, { "epoch": 1.490777254802188, "ewc_loss": 0.057952798902988434, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002670279936864972, "grad_norm": 6.93356466293335, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8716047406196594, "num_tokens": 447168320.0, "step": 11719 }, { "epoch": 1.4909044650807786, "ewc_loss": 0.058195941150188446, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002670179819688201, "grad_norm": 6.998531341552734, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8493090867996216, "num_tokens": 447205640.0, "step": 11720 }, { "epoch": 1.491031675359369, "ewc_loss": 0.058101192116737366, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002660705358721316, "grad_norm": 6.938817977905273, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.8773937225341797, "num_tokens": 447239831.0, "step": 11721 }, { "epoch": 1.4911588856379596, "ewc_loss": 0.05813860520720482, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026644463650882244, "grad_norm": 6.938872814178467, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.8500399589538574, "num_tokens": 447281100.0, "step": 11722 }, { "epoch": 1.49128609591655, "ewc_loss": 0.05811049044132233, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002661634935066104, "grad_norm": 6.998883247375488, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8451611995697021, "num_tokens": 447316137.0, "step": 11723 }, { "epoch": 1.4914133061951405, "ewc_loss": 0.05806543678045273, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002657129371073097, "grad_norm": 6.869233131408691, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8576876521110535, "num_tokens": 447357922.0, "step": 11724 }, { "epoch": 1.491540516473731, "ewc_loss": 0.05821043998003006, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026716297725215554, "grad_norm": 6.958103656768799, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8606414794921875, "num_tokens": 447394905.0, "step": 11725 }, { "epoch": 1.4916677267523215, "ewc_loss": 0.05815175548195839, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002665761567186564, "grad_norm": 6.896094799041748, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.875400960445404, "num_tokens": 447435379.0, "step": 11726 }, { "epoch": 1.491794937030912, "ewc_loss": 0.058191485702991486, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002669734531082213, "grad_norm": 6.951293468475342, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8563847541809082, "num_tokens": 447473058.0, "step": 11727 }, { "epoch": 1.4919221473095026, "ewc_loss": 0.058164723217487335, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002667058433871716, "grad_norm": 6.922146320343018, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8734537959098816, "num_tokens": 447510966.0, "step": 11728 }, { "epoch": 1.492049357588093, "ewc_loss": 0.058216795325279236, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002672265691217035, "grad_norm": 6.8687591552734375, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8673652410507202, "num_tokens": 447554747.0, "step": 11729 }, { "epoch": 1.4921765678666836, "ewc_loss": 0.05831349641084671, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026819357299245894, "grad_norm": 6.942436695098877, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8673694133758545, "num_tokens": 447594426.0, "step": 11730 }, { "epoch": 1.4923037781452742, "ewc_loss": 0.05822927504777908, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002673513372428715, "grad_norm": 6.923802375793457, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.857845664024353, "num_tokens": 447634786.0, "step": 11731 }, { "epoch": 1.4924309884238647, "ewc_loss": 0.05832120031118393, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002682706108316779, "grad_norm": 6.918547630310059, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8615321516990662, "num_tokens": 447668682.0, "step": 11732 }, { "epoch": 1.4925581987024552, "ewc_loss": 0.05826183035969734, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026767689269036055, "grad_norm": 7.013113975524902, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8553736209869385, "num_tokens": 447705566.0, "step": 11733 }, { "epoch": 1.4926854089810457, "ewc_loss": 0.05801201984286308, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026762019842863083, "grad_norm": 6.9240827560424805, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8612759113311768, "num_tokens": 447746385.0, "step": 11734 }, { "epoch": 1.4928126192596363, "ewc_loss": 0.058038391172885895, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002678839082363993, "grad_norm": 6.915063381195068, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8689397573471069, "num_tokens": 447790483.0, "step": 11735 }, { "epoch": 1.4929398295382268, "ewc_loss": 0.05803069844841957, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026780698681250215, "grad_norm": 6.963130950927734, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8628377318382263, "num_tokens": 447826872.0, "step": 11736 }, { "epoch": 1.4930670398168173, "ewc_loss": 0.05789496749639511, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002664497005753219, "grad_norm": 6.91825008392334, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.870093822479248, "num_tokens": 447864071.0, "step": 11737 }, { "epoch": 1.4931942500954076, "ewc_loss": 0.058054424822330475, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002680442703422159, "grad_norm": 6.948575496673584, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8471567630767822, "num_tokens": 447903320.0, "step": 11738 }, { "epoch": 1.4933214603739982, "ewc_loss": 0.058260269463062286, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026766129303723574, "grad_norm": 6.932469844818115, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8784489631652832, "num_tokens": 447940253.0, "step": 11739 }, { "epoch": 1.4934486706525887, "ewc_loss": 0.0579981654882431, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002674816350918263, "grad_norm": 6.9218902587890625, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8698509335517883, "num_tokens": 447978277.0, "step": 11740 }, { "epoch": 1.4935758809311792, "ewc_loss": 0.058290936052799225, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026796796009875834, "grad_norm": 6.9209089279174805, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8536633849143982, "num_tokens": 448018798.0, "step": 11741 }, { "epoch": 1.4937030912097697, "ewc_loss": 0.05824118107557297, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002674704301171005, "grad_norm": 6.920374393463135, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8529043793678284, "num_tokens": 448057469.0, "step": 11742 }, { "epoch": 1.4938303014883603, "ewc_loss": 0.05801862105727196, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002676862059161067, "grad_norm": 6.971415996551514, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.851015567779541, "num_tokens": 448089032.0, "step": 11743 }, { "epoch": 1.4939575117669508, "ewc_loss": 0.05800113081932068, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026751129189506173, "grad_norm": 6.927290439605713, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8600708842277527, "num_tokens": 448122438.0, "step": 11744 }, { "epoch": 1.4940847220455413, "ewc_loss": 0.05831608176231384, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026821938809007406, "grad_norm": 6.912203788757324, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8639737963676453, "num_tokens": 448166844.0, "step": 11745 }, { "epoch": 1.4942119323241319, "ewc_loss": 0.058223091065883636, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002672895207069814, "grad_norm": 7.014838218688965, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8555408716201782, "num_tokens": 448197766.0, "step": 11746 }, { "epoch": 1.4943391426027224, "ewc_loss": 0.05821036174893379, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026716222055256367, "grad_norm": 6.92915153503418, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8513876795768738, "num_tokens": 448240850.0, "step": 11747 }, { "epoch": 1.4944663528813127, "ewc_loss": 0.05829591676592827, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002680177567526698, "grad_norm": 6.902620792388916, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8739450573921204, "num_tokens": 448274187.0, "step": 11748 }, { "epoch": 1.4945935631599032, "ewc_loss": 0.05821393430233002, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002671979309525341, "grad_norm": 6.879523277282715, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8700622916221619, "num_tokens": 448311069.0, "step": 11749 }, { "epoch": 1.4947207734384937, "ewc_loss": 0.05838260054588318, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026888458523899317, "grad_norm": 6.926270961761475, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.864047646522522, "num_tokens": 448351536.0, "step": 11750 }, { "epoch": 1.4948479837170843, "ewc_loss": 0.05817678943276405, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002668264787644148, "grad_norm": 6.8999223709106445, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8609824776649475, "num_tokens": 448385108.0, "step": 11751 }, { "epoch": 1.4949751939956748, "ewc_loss": 0.05834139883518219, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002684725623112172, "grad_norm": 6.876882553100586, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8756870031356812, "num_tokens": 448423272.0, "step": 11752 }, { "epoch": 1.4951024042742653, "ewc_loss": 0.05836531147360802, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026871170848608017, "grad_norm": 6.9134650230407715, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8684588074684143, "num_tokens": 448462904.0, "step": 11753 }, { "epoch": 1.4952296145528559, "ewc_loss": 0.05837438628077507, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026880245422944427, "grad_norm": 6.924570083618164, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8604939579963684, "num_tokens": 448503257.0, "step": 11754 }, { "epoch": 1.4953568248314464, "ewc_loss": 0.058368533849716187, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026874395553022623, "grad_norm": 6.940553188323975, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.869387686252594, "num_tokens": 448535299.0, "step": 11755 }, { "epoch": 1.495484035110037, "ewc_loss": 0.05821622163057327, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026722083566710353, "grad_norm": 6.8827223777771, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8704010248184204, "num_tokens": 448568878.0, "step": 11756 }, { "epoch": 1.4956112453886274, "ewc_loss": 0.0584096722304821, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002691553090699017, "grad_norm": 6.911340236663818, "learning_rate": 1e-06, "loss": 0.3988, "mean_token_accuracy": 0.8827325105667114, "num_tokens": 448604959.0, "step": 11757 }, { "epoch": 1.495738455667218, "ewc_loss": 0.05826403573155403, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026769895339384675, "grad_norm": 7.025158405303955, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8764569163322449, "num_tokens": 448638487.0, "step": 11758 }, { "epoch": 1.4958656659458085, "ewc_loss": 0.05826476216316223, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026770622935146093, "grad_norm": 6.873430252075195, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8696479797363281, "num_tokens": 448677212.0, "step": 11759 }, { "epoch": 1.495992876224399, "ewc_loss": 0.05834706872701645, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026852928567677736, "grad_norm": 6.912461757659912, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8593223094940186, "num_tokens": 448719654.0, "step": 11760 }, { "epoch": 1.4961200865029896, "ewc_loss": 0.05823085084557533, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026736711151897907, "grad_norm": 6.868849277496338, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8685310482978821, "num_tokens": 448760783.0, "step": 11761 }, { "epoch": 1.49624729678158, "ewc_loss": 0.0583299919962883, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026835850439965725, "grad_norm": 6.915352821350098, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8537997007369995, "num_tokens": 448802843.0, "step": 11762 }, { "epoch": 1.4963745070601704, "ewc_loss": 0.058295197784900665, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000268010568106547, "grad_norm": 6.956733703613281, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8604884743690491, "num_tokens": 448838430.0, "step": 11763 }, { "epoch": 1.496501717338761, "ewc_loss": 0.05827545374631882, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002678131277207285, "grad_norm": 6.898556709289551, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8551287055015564, "num_tokens": 448880450.0, "step": 11764 }, { "epoch": 1.4966289276173514, "ewc_loss": 0.05830688774585724, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002681274781934917, "grad_norm": 6.915499210357666, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8659138679504395, "num_tokens": 448919352.0, "step": 11765 }, { "epoch": 1.496756137895942, "ewc_loss": 0.05820969119668007, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026715549756772816, "grad_norm": 6.837772846221924, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8398717045783997, "num_tokens": 448965402.0, "step": 11766 }, { "epoch": 1.4968833481745325, "ewc_loss": 0.058338712900877, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002684457285795361, "grad_norm": 6.989564895629883, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.876288652420044, "num_tokens": 448996983.0, "step": 11767 }, { "epoch": 1.497010558453123, "ewc_loss": 0.05822968855500221, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002673554699867964, "grad_norm": 6.894272804260254, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.8506624102592468, "num_tokens": 449036555.0, "step": 11768 }, { "epoch": 1.4971377687317136, "ewc_loss": 0.05824164301156998, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026747502852231264, "grad_norm": 6.892868995666504, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.875803530216217, "num_tokens": 449070545.0, "step": 11769 }, { "epoch": 1.497264979010304, "ewc_loss": 0.058178290724754333, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026684149634093046, "grad_norm": 6.913608074188232, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.84964919090271, "num_tokens": 449108859.0, "step": 11770 }, { "epoch": 1.4973921892888946, "ewc_loss": 0.058254845440387726, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002676070434972644, "grad_norm": 6.9458723068237305, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8622691631317139, "num_tokens": 449145333.0, "step": 11771 }, { "epoch": 1.497519399567485, "ewc_loss": 0.05816637724637985, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002667223452590406, "grad_norm": 6.893314361572266, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.857280433177948, "num_tokens": 449181292.0, "step": 11772 }, { "epoch": 1.4976466098460754, "ewc_loss": 0.058324962854385376, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000268308212980628, "grad_norm": 6.98928689956665, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8673311471939087, "num_tokens": 449213940.0, "step": 11773 }, { "epoch": 1.497773820124666, "ewc_loss": 0.05813411623239517, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002663997292984277, "grad_norm": 6.944425106048584, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8766124248504639, "num_tokens": 449252487.0, "step": 11774 }, { "epoch": 1.4979010304032565, "ewc_loss": 0.05825125426054001, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002675711293704808, "grad_norm": 6.939042091369629, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8563648462295532, "num_tokens": 449287268.0, "step": 11775 }, { "epoch": 1.498028240681847, "ewc_loss": 0.05820556357502937, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002671142283361405, "grad_norm": 6.94341516494751, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8598719835281372, "num_tokens": 449326975.0, "step": 11776 }, { "epoch": 1.4981554509604376, "ewc_loss": 0.05808871239423752, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002659457386471331, "grad_norm": 6.860657691955566, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8636465072631836, "num_tokens": 449366061.0, "step": 11777 }, { "epoch": 1.498282661239028, "ewc_loss": 0.05824144184589386, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026747299125418067, "grad_norm": 6.865201473236084, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8729234337806702, "num_tokens": 449401763.0, "step": 11778 }, { "epoch": 1.4984098715176186, "ewc_loss": 0.05821169912815094, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026717560831457376, "grad_norm": 6.932927131652832, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8836953639984131, "num_tokens": 449436605.0, "step": 11779 }, { "epoch": 1.4985370817962091, "ewc_loss": 0.05824272334575653, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002674858260434121, "grad_norm": 6.931186676025391, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8596683740615845, "num_tokens": 449474523.0, "step": 11780 }, { "epoch": 1.4986642920747997, "ewc_loss": 0.05824811011552811, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026753966812975705, "grad_norm": 6.935770511627197, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.8750995397567749, "num_tokens": 449513301.0, "step": 11781 }, { "epoch": 1.4987915023533902, "ewc_loss": 0.058215588331222534, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002672144619282335, "grad_norm": 6.980538368225098, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8600921034812927, "num_tokens": 449547826.0, "step": 11782 }, { "epoch": 1.4989187126319807, "ewc_loss": 0.058148160576820374, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026654021348804235, "grad_norm": 6.947150230407715, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.863994300365448, "num_tokens": 449584818.0, "step": 11783 }, { "epoch": 1.4990459229105713, "ewc_loss": 0.058117374777793884, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000266232353169471, "grad_norm": 6.8741455078125, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8453701138496399, "num_tokens": 449625368.0, "step": 11784 }, { "epoch": 1.4991731331891618, "ewc_loss": 0.05819060653448105, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002669646346475929, "grad_norm": 6.966302871704102, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.850403904914856, "num_tokens": 449666320.0, "step": 11785 }, { "epoch": 1.4993003434677523, "ewc_loss": 0.05817282944917679, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026678686845116317, "grad_norm": 6.909512996673584, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8573368191719055, "num_tokens": 449705993.0, "step": 11786 }, { "epoch": 1.4994275537463426, "ewc_loss": 0.05819031223654747, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026696172426454723, "grad_norm": 6.884551048278809, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8580218553543091, "num_tokens": 449747088.0, "step": 11787 }, { "epoch": 1.4995547640249332, "ewc_loss": 0.05823613330721855, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026741993497125804, "grad_norm": 6.9929280281066895, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8656699657440186, "num_tokens": 449789840.0, "step": 11788 }, { "epoch": 1.4996819743035237, "ewc_loss": 0.058217521756887436, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026723381597548723, "grad_norm": 6.881734371185303, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8636767864227295, "num_tokens": 449830599.0, "step": 11789 }, { "epoch": 1.4998091845821142, "ewc_loss": 0.05826564133167267, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026771501870825887, "grad_norm": 7.123465538024902, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8679279685020447, "num_tokens": 449864476.0, "step": 11790 }, { "epoch": 1.4999363948607047, "ewc_loss": 0.05821935832500458, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000267252151388675, "grad_norm": 7.140803337097168, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.84722900390625, "num_tokens": 449903791.0, "step": 11791 }, { "epoch": 1.5000636051392953, "ewc_loss": 0.05810334533452988, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026609207270666957, "grad_norm": 6.891046524047852, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.864051103591919, "num_tokens": 449945478.0, "step": 11792 }, { "epoch": 1.5001908154178858, "ewc_loss": 0.058118559420108795, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026624416932463646, "grad_norm": 6.966495037078857, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8610814809799194, "num_tokens": 449986175.0, "step": 11793 }, { "epoch": 1.5003180256964763, "ewc_loss": 0.058057934045791626, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026563790743239224, "grad_norm": 6.915592193603516, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8602373600006104, "num_tokens": 450025537.0, "step": 11794 }, { "epoch": 1.5004452359750666, "ewc_loss": 0.05822667479515076, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026732534752227366, "grad_norm": 6.975174903869629, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8497470617294312, "num_tokens": 450068269.0, "step": 11795 }, { "epoch": 1.5005724462536572, "ewc_loss": 0.05805133655667305, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002655719581525773, "grad_norm": 6.8830246925354, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8804331421852112, "num_tokens": 450106234.0, "step": 11796 }, { "epoch": 1.5006996565322477, "ewc_loss": 0.05801715701818466, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002676715957932174, "grad_norm": 6.945394515991211, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8737730979919434, "num_tokens": 450141916.0, "step": 11797 }, { "epoch": 1.5008268668108382, "ewc_loss": 0.05815201997756958, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002665787760633975, "grad_norm": 6.9795427322387695, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8492374420166016, "num_tokens": 450175427.0, "step": 11798 }, { "epoch": 1.5009540770894287, "ewc_loss": 0.0582038015127182, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002670966205187142, "grad_norm": 6.91265344619751, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8660314083099365, "num_tokens": 450211104.0, "step": 11799 }, { "epoch": 1.5010812873680193, "ewc_loss": 0.058296993374824524, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026802855427376926, "grad_norm": 7.0149431228637695, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8561811447143555, "num_tokens": 450244844.0, "step": 11800 }, { "epoch": 1.5012084976466098, "ewc_loss": 0.05794200301170349, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002669200475793332, "grad_norm": 6.88789701461792, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8631631135940552, "num_tokens": 450286094.0, "step": 11801 }, { "epoch": 1.5013357079252003, "ewc_loss": 0.058125171810388565, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002687517262529582, "grad_norm": 6.966935157775879, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8544769287109375, "num_tokens": 450324938.0, "step": 11802 }, { "epoch": 1.5014629182037909, "ewc_loss": 0.0579022578895092, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002665225765667856, "grad_norm": 6.902991771697998, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.864022433757782, "num_tokens": 450368572.0, "step": 11803 }, { "epoch": 1.5015901284823814, "ewc_loss": 0.05811785161495209, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026867850101552904, "grad_norm": 6.962666988372803, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8421494960784912, "num_tokens": 450410926.0, "step": 11804 }, { "epoch": 1.501717338760972, "ewc_loss": 0.05817955732345581, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002668541856110096, "grad_norm": 6.900264739990234, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8674483299255371, "num_tokens": 450450206.0, "step": 11805 }, { "epoch": 1.5018445490395624, "ewc_loss": 0.058329205960035324, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026835064636543393, "grad_norm": 6.954354763031006, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8430948257446289, "num_tokens": 450488998.0, "step": 11806 }, { "epoch": 1.501971759318153, "ewc_loss": 0.0583144836127758, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026820343919098377, "grad_norm": 6.912160396575928, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8743519186973572, "num_tokens": 450530476.0, "step": 11807 }, { "epoch": 1.5020989695967435, "ewc_loss": 0.058427613228559494, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026933473418466747, "grad_norm": 7.000603199005127, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8712126016616821, "num_tokens": 450560655.0, "step": 11808 }, { "epoch": 1.502226179875334, "ewc_loss": 0.05805784463882446, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026807846734300256, "grad_norm": 6.901119709014893, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8660253286361694, "num_tokens": 450601816.0, "step": 11809 }, { "epoch": 1.5023533901539246, "ewc_loss": 0.05846363678574562, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026969495229423046, "grad_norm": 7.003373146057129, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8671215772628784, "num_tokens": 450634182.0, "step": 11810 }, { "epoch": 1.502480600432515, "ewc_loss": 0.05827873945236206, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002678460150491446, "grad_norm": 6.896787166595459, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8590095639228821, "num_tokens": 450672705.0, "step": 11811 }, { "epoch": 1.5026078107111056, "ewc_loss": 0.0584016889333725, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002690754772629589, "grad_norm": 7.013213157653809, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8515079617500305, "num_tokens": 450705636.0, "step": 11812 }, { "epoch": 1.502735020989696, "ewc_loss": 0.05823427066206932, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026740130851976573, "grad_norm": 6.985513210296631, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8557682633399963, "num_tokens": 450736165.0, "step": 11813 }, { "epoch": 1.5028622312682864, "ewc_loss": 0.05825968086719513, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002676554140634835, "grad_norm": 6.930230140686035, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8651435971260071, "num_tokens": 450776939.0, "step": 11814 }, { "epoch": 1.502989441546877, "ewc_loss": 0.0582650825381279, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002677094307728112, "grad_norm": 6.920804977416992, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8679323196411133, "num_tokens": 450811994.0, "step": 11815 }, { "epoch": 1.5031166518254675, "ewc_loss": 0.058250293135643005, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026756152510643005, "grad_norm": 6.870299816131592, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8648964762687683, "num_tokens": 450853688.0, "step": 11816 }, { "epoch": 1.503243862104058, "ewc_loss": 0.058326806873083115, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002683266648091376, "grad_norm": 6.937417507171631, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8640369772911072, "num_tokens": 450892100.0, "step": 11817 }, { "epoch": 1.5033710723826486, "ewc_loss": 0.05827775597572327, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002678361488506198, "grad_norm": 6.944781303405762, "learning_rate": 1e-06, "loss": 0.3992, "mean_token_accuracy": 0.8799290657043457, "num_tokens": 450936383.0, "step": 11818 }, { "epoch": 1.503498282661239, "ewc_loss": 0.05830829590559006, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026814156444743276, "grad_norm": 6.973851680755615, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8703781366348267, "num_tokens": 450965525.0, "step": 11819 }, { "epoch": 1.5036254929398294, "ewc_loss": 0.05830509215593338, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002681095211300999, "grad_norm": 6.883119106292725, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.852429986000061, "num_tokens": 451005874.0, "step": 11820 }, { "epoch": 1.50375270321842, "ewc_loss": 0.05834747850894928, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002685333602130413, "grad_norm": 7.016915798187256, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8454991579055786, "num_tokens": 451040668.0, "step": 11821 }, { "epoch": 1.5038799134970104, "ewc_loss": 0.05792296305298805, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002667296212166548, "grad_norm": 6.8508172035217285, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8608014583587646, "num_tokens": 451082747.0, "step": 11822 }, { "epoch": 1.504007123775601, "ewc_loss": 0.05845475569367409, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026960615650750697, "grad_norm": 7.033581256866455, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.870991587638855, "num_tokens": 451120344.0, "step": 11823 }, { "epoch": 1.5041343340541915, "ewc_loss": 0.05817759409546852, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002668345405254513, "grad_norm": 6.891407012939453, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8557642698287964, "num_tokens": 451156564.0, "step": 11824 }, { "epoch": 1.504261544332782, "ewc_loss": 0.058411676436662674, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002691753616090864, "grad_norm": 6.926932334899902, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8587561845779419, "num_tokens": 451197252.0, "step": 11825 }, { "epoch": 1.5043887546113726, "ewc_loss": 0.05828714370727539, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002679300378076732, "grad_norm": 6.9400835037231445, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.8526979684829712, "num_tokens": 451236150.0, "step": 11826 }, { "epoch": 1.504515964889963, "ewc_loss": 0.058305490761995316, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026811350835487247, "grad_norm": 6.887435436248779, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8599731922149658, "num_tokens": 451275751.0, "step": 11827 }, { "epoch": 1.5046431751685536, "ewc_loss": 0.05825216323137283, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002675802097655833, "grad_norm": 6.908400058746338, "learning_rate": 1e-06, "loss": 0.388, "mean_token_accuracy": 0.8836495876312256, "num_tokens": 451316778.0, "step": 11828 }, { "epoch": 1.5047703854471441, "ewc_loss": 0.058345481753349304, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000268513394985348, "grad_norm": 6.8647918701171875, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8635034561157227, "num_tokens": 451359921.0, "step": 11829 }, { "epoch": 1.5048975957257347, "ewc_loss": 0.05835961550474167, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026865475228987634, "grad_norm": 6.946856498718262, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8696779012680054, "num_tokens": 451395616.0, "step": 11830 }, { "epoch": 1.5050248060043252, "ewc_loss": 0.058284588158130646, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026790445554070175, "grad_norm": 6.937427043914795, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8640162348747253, "num_tokens": 451427766.0, "step": 11831 }, { "epoch": 1.5051520162829157, "ewc_loss": 0.05848190188407898, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002698776079341769, "grad_norm": 6.997971057891846, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8725763559341431, "num_tokens": 451463073.0, "step": 11832 }, { "epoch": 1.5052792265615063, "ewc_loss": 0.05831029638648033, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026816155877895653, "grad_norm": 6.896396636962891, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8821994066238403, "num_tokens": 451503039.0, "step": 11833 }, { "epoch": 1.5054064368400968, "ewc_loss": 0.058421649038791656, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026927507133223116, "grad_norm": 6.9870405197143555, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.846748411655426, "num_tokens": 451541599.0, "step": 11834 }, { "epoch": 1.5055336471186873, "ewc_loss": 0.0583561435341835, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026862003142014146, "grad_norm": 6.926643371582031, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8500030636787415, "num_tokens": 451579278.0, "step": 11835 }, { "epoch": 1.5056608573972778, "ewc_loss": 0.05843421071767807, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002694007125683129, "grad_norm": 6.949112892150879, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8672877550125122, "num_tokens": 451618591.0, "step": 11836 }, { "epoch": 1.5057880676758684, "ewc_loss": 0.05833985656499863, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002684571663849056, "grad_norm": 6.992796421051025, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8637365102767944, "num_tokens": 451654453.0, "step": 11837 }, { "epoch": 1.5059152779544587, "ewc_loss": 0.058300238102674484, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026806097594089806, "grad_norm": 6.954653263092041, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8619692325592041, "num_tokens": 451688302.0, "step": 11838 }, { "epoch": 1.5060424882330492, "ewc_loss": 0.05831093713641167, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000268167961621657, "grad_norm": 6.92825984954834, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8692827224731445, "num_tokens": 451727301.0, "step": 11839 }, { "epoch": 1.5061696985116397, "ewc_loss": 0.058266542851924896, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026772404089570045, "grad_norm": 6.977758407592773, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8499181270599365, "num_tokens": 451770471.0, "step": 11840 }, { "epoch": 1.5062969087902303, "ewc_loss": 0.05825556069612503, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002676141739357263, "grad_norm": 6.938668727874756, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8513146638870239, "num_tokens": 451809413.0, "step": 11841 }, { "epoch": 1.5064241190688208, "ewc_loss": 0.05821007490158081, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026715933927334845, "grad_norm": 6.9192962646484375, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8606886267662048, "num_tokens": 451850177.0, "step": 11842 }, { "epoch": 1.5065513293474113, "ewc_loss": 0.05829158425331116, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026797442114911973, "grad_norm": 6.972248554229736, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8509818315505981, "num_tokens": 451887977.0, "step": 11843 }, { "epoch": 1.5066785396260016, "ewc_loss": 0.05824536085128784, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026751222321763635, "grad_norm": 6.907840251922607, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8734229803085327, "num_tokens": 451927403.0, "step": 11844 }, { "epoch": 1.5068057499045922, "ewc_loss": 0.058343663811683655, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026849520509131253, "grad_norm": 6.955938816070557, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8647730350494385, "num_tokens": 451967350.0, "step": 11845 }, { "epoch": 1.5069329601831827, "ewc_loss": 0.05830201506614685, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002680787583813071, "grad_norm": 6.903436660766602, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8635825514793396, "num_tokens": 452008664.0, "step": 11846 }, { "epoch": 1.5070601704617732, "ewc_loss": 0.05839843302965164, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026904293918050826, "grad_norm": 6.965554714202881, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8684961199760437, "num_tokens": 452047303.0, "step": 11847 }, { "epoch": 1.5071873807403637, "ewc_loss": 0.058270178735256195, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002677603915799409, "grad_norm": 6.950195789337158, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8645448684692383, "num_tokens": 452087359.0, "step": 11848 }, { "epoch": 1.5073145910189543, "ewc_loss": 0.058365367352962494, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002687122905626893, "grad_norm": 6.984035491943359, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8496475219726562, "num_tokens": 452129426.0, "step": 11849 }, { "epoch": 1.5074418012975448, "ewc_loss": 0.05825810879468918, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002676396688912064, "grad_norm": 6.922370910644531, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8683935403823853, "num_tokens": 452169134.0, "step": 11850 }, { "epoch": 1.5075690115761353, "ewc_loss": 0.058385156095027924, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026891016750596464, "grad_norm": 6.906589031219482, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8541889190673828, "num_tokens": 452209962.0, "step": 11851 }, { "epoch": 1.5076962218547258, "ewc_loss": 0.058356352150440216, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026862212689593434, "grad_norm": 6.9275312423706055, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8668327927589417, "num_tokens": 452245978.0, "step": 11852 }, { "epoch": 1.5078234321333164, "ewc_loss": 0.058347079902887344, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002685294020920992, "grad_norm": 6.944116115570068, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8719534873962402, "num_tokens": 452281013.0, "step": 11853 }, { "epoch": 1.507950642411907, "ewc_loss": 0.058361247181892395, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026867107953876257, "grad_norm": 6.94060754776001, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8769746422767639, "num_tokens": 452317174.0, "step": 11854 }, { "epoch": 1.5080778526904974, "ewc_loss": 0.058353643864393234, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002685950312297791, "grad_norm": 6.884915828704834, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8597642183303833, "num_tokens": 452362974.0, "step": 11855 }, { "epoch": 1.508205062969088, "ewc_loss": 0.058176878839731216, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002692687849048525, "grad_norm": 6.972177982330322, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8748542070388794, "num_tokens": 452400701.0, "step": 11856 }, { "epoch": 1.5083322732476785, "ewc_loss": 0.05839892476797104, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026904785772785544, "grad_norm": 6.96560001373291, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8728304505348206, "num_tokens": 452438541.0, "step": 11857 }, { "epoch": 1.508459483526269, "ewc_loss": 0.05824708938598633, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002699709148146212, "grad_norm": 6.999884605407715, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8580117225646973, "num_tokens": 452476918.0, "step": 11858 }, { "epoch": 1.5085866938048595, "ewc_loss": 0.058027252554893494, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002677725278772414, "grad_norm": 6.932905673980713, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8667452335357666, "num_tokens": 452509231.0, "step": 11859 }, { "epoch": 1.50871390408345, "ewc_loss": 0.05816352367401123, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026913522742688656, "grad_norm": 6.931585788726807, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8595657348632812, "num_tokens": 452547079.0, "step": 11860 }, { "epoch": 1.5088411143620406, "ewc_loss": 0.05816206336021423, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002691206173039973, "grad_norm": 6.926239967346191, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8745170831680298, "num_tokens": 452580215.0, "step": 11861 }, { "epoch": 1.508968324640631, "ewc_loss": 0.05818960815668106, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026939608505927026, "grad_norm": 6.992071151733398, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8699914216995239, "num_tokens": 452615778.0, "step": 11862 }, { "epoch": 1.5090955349192214, "ewc_loss": 0.058153100311756134, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026903097750619054, "grad_norm": 6.966939449310303, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8661937713623047, "num_tokens": 452652184.0, "step": 11863 }, { "epoch": 1.509222745197812, "ewc_loss": 0.05818181484937668, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026931814500130713, "grad_norm": 6.954833030700684, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8495900630950928, "num_tokens": 452691345.0, "step": 11864 }, { "epoch": 1.5093499554764025, "ewc_loss": 0.05838318169116974, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002688904060050845, "grad_norm": 6.9189019203186035, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8658102750778198, "num_tokens": 452729982.0, "step": 11865 }, { "epoch": 1.509477165754993, "ewc_loss": 0.0581522136926651, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002690221299417317, "grad_norm": 6.968491554260254, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8614029884338379, "num_tokens": 452771272.0, "step": 11866 }, { "epoch": 1.5096043760335836, "ewc_loss": 0.0581408366560936, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002689083921723068, "grad_norm": 6.913569450378418, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8594428300857544, "num_tokens": 452812231.0, "step": 11867 }, { "epoch": 1.509731586312174, "ewc_loss": 0.05820745229721069, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002695745206438005, "grad_norm": 6.949250221252441, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8530028462409973, "num_tokens": 452858406.0, "step": 11868 }, { "epoch": 1.5098587965907644, "ewc_loss": 0.05817826837301254, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00026928266743198037, "grad_norm": 6.969836711883545, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8548310995101929, "num_tokens": 452898793.0, "step": 11869 }, { "epoch": 1.509986006869355, "ewc_loss": 0.05843377858400345, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002693964052014053, "grad_norm": 7.013148307800293, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8695876002311707, "num_tokens": 452937929.0, "step": 11870 }, { "epoch": 1.5101132171479454, "ewc_loss": 0.05837121605873108, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002687707601580769, "grad_norm": 6.964963436126709, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8665310144424438, "num_tokens": 452975436.0, "step": 11871 }, { "epoch": 1.510240427426536, "ewc_loss": 0.058488182723522186, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000269940443104133, "grad_norm": 7.061074256896973, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8633694648742676, "num_tokens": 453016334.0, "step": 11872 }, { "epoch": 1.5103676377051265, "ewc_loss": 0.05830974131822586, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002681559999473393, "grad_norm": 7.0280256271362305, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8679482936859131, "num_tokens": 453053630.0, "step": 11873 }, { "epoch": 1.510494847983717, "ewc_loss": 0.05839712917804718, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002690298715606332, "grad_norm": 7.015273094177246, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8448871374130249, "num_tokens": 453092341.0, "step": 11874 }, { "epoch": 1.5106220582623076, "ewc_loss": 0.05832454934716225, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026830408023670316, "grad_norm": 7.0263848304748535, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8722862005233765, "num_tokens": 453126859.0, "step": 11875 }, { "epoch": 1.510749268540898, "ewc_loss": 0.058272022753953934, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026777881430462003, "grad_norm": 7.04218864440918, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8600139617919922, "num_tokens": 453167327.0, "step": 11876 }, { "epoch": 1.5108764788194886, "ewc_loss": 0.05830466002225876, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002681052137631923, "grad_norm": 7.020700454711914, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8692669868469238, "num_tokens": 453205901.0, "step": 11877 }, { "epoch": 1.5110036890980791, "ewc_loss": 0.05824222415685654, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026748082018457353, "grad_norm": 7.091644763946533, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8619898557662964, "num_tokens": 453241856.0, "step": 11878 }, { "epoch": 1.5111308993766697, "ewc_loss": 0.058125417679548264, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000266312767053023, "grad_norm": 7.0201497077941895, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8719319701194763, "num_tokens": 453282528.0, "step": 11879 }, { "epoch": 1.5112581096552602, "ewc_loss": 0.05817181617021561, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002667767694219947, "grad_norm": 7.079214572906494, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8536773324012756, "num_tokens": 453318762.0, "step": 11880 }, { "epoch": 1.5113853199338507, "ewc_loss": 0.058070696890354156, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026576558593660593, "grad_norm": 7.020449638366699, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8783595561981201, "num_tokens": 453352482.0, "step": 11881 }, { "epoch": 1.5115125302124413, "ewc_loss": 0.05820982903242111, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002671568945515901, "grad_norm": 7.064411640167236, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8625873923301697, "num_tokens": 453393692.0, "step": 11882 }, { "epoch": 1.5116397404910318, "ewc_loss": 0.05808297544717789, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002658883749973029, "grad_norm": 6.939579486846924, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8665952682495117, "num_tokens": 453429217.0, "step": 11883 }, { "epoch": 1.5117669507696223, "ewc_loss": 0.05831363424658775, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002681949408724904, "grad_norm": 7.2743096351623535, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8482632637023926, "num_tokens": 453468190.0, "step": 11884 }, { "epoch": 1.5118941610482128, "ewc_loss": 0.058041561394929886, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002654741983860731, "grad_norm": 6.832180500030518, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8569199442863464, "num_tokens": 453509893.0, "step": 11885 }, { "epoch": 1.5120213713268034, "ewc_loss": 0.0584564208984375, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002696228038985282, "grad_norm": 7.174346923828125, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8670198321342468, "num_tokens": 453546972.0, "step": 11886 }, { "epoch": 1.5121485816053937, "ewc_loss": 0.05806265026330948, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026568511384539306, "grad_norm": 6.8944878578186035, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8758677244186401, "num_tokens": 453578480.0, "step": 11887 }, { "epoch": 1.5122757918839842, "ewc_loss": 0.05847422778606415, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002698008611332625, "grad_norm": 7.04410982131958, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8646219968795776, "num_tokens": 453623248.0, "step": 11888 }, { "epoch": 1.5124030021625747, "ewc_loss": 0.05825638771057129, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002676224976312369, "grad_norm": 6.894347667694092, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8841217160224915, "num_tokens": 453660088.0, "step": 11889 }, { "epoch": 1.5125302124411653, "ewc_loss": 0.05842634290456772, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026932204491458833, "grad_norm": 6.998619079589844, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8669601082801819, "num_tokens": 453698931.0, "step": 11890 }, { "epoch": 1.5126574227197558, "ewc_loss": 0.058351099491119385, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002685695653781295, "grad_norm": 7.033576011657715, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.866849422454834, "num_tokens": 453734220.0, "step": 11891 }, { "epoch": 1.5127846329983463, "ewc_loss": 0.05829111486673355, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026796976453624666, "grad_norm": 6.931268215179443, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.862977147102356, "num_tokens": 453774056.0, "step": 11892 }, { "epoch": 1.5129118432769366, "ewc_loss": 0.0583357959985733, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026841656654141843, "grad_norm": 7.026185035705566, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8698561787605286, "num_tokens": 453806616.0, "step": 11893 }, { "epoch": 1.5130390535555271, "ewc_loss": 0.05829767882823944, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026803536457009614, "grad_norm": 6.994676113128662, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8719057440757751, "num_tokens": 453843838.0, "step": 11894 }, { "epoch": 1.5131662638341177, "ewc_loss": 0.05836434289813042, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002687020169105381, "grad_norm": 7.002522945404053, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8721601963043213, "num_tokens": 453876888.0, "step": 11895 }, { "epoch": 1.5132934741127082, "ewc_loss": 0.05826664716005325, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002677250886335969, "grad_norm": 6.93845796585083, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8791317343711853, "num_tokens": 453915334.0, "step": 11896 }, { "epoch": 1.5134206843912987, "ewc_loss": 0.05840420722961426, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002691006811801344, "grad_norm": 6.9841413497924805, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8733125925064087, "num_tokens": 453950590.0, "step": 11897 }, { "epoch": 1.5135478946698893, "ewc_loss": 0.05833974480628967, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026845603133551776, "grad_norm": 6.992465496063232, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8675488233566284, "num_tokens": 453992590.0, "step": 11898 }, { "epoch": 1.5136751049484798, "ewc_loss": 0.05831915885210037, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002682501799426973, "grad_norm": 6.927563667297363, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8683724403381348, "num_tokens": 454038619.0, "step": 11899 }, { "epoch": 1.5138023152270703, "ewc_loss": 0.05838935822248459, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026895219343714416, "grad_norm": 6.979860305786133, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8542991876602173, "num_tokens": 454081564.0, "step": 11900 }, { "epoch": 1.5139295255056608, "ewc_loss": 0.05833559110760689, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000268414500169456, "grad_norm": 7.011418342590332, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8519108891487122, "num_tokens": 454125581.0, "step": 11901 }, { "epoch": 1.5140567357842514, "ewc_loss": 0.05834251642227173, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026848376728594303, "grad_norm": 6.920966625213623, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8725112676620483, "num_tokens": 454163961.0, "step": 11902 }, { "epoch": 1.514183946062842, "ewc_loss": 0.05843797326087952, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000269438314717263, "grad_norm": 6.990599632263184, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8560932874679565, "num_tokens": 454200410.0, "step": 11903 }, { "epoch": 1.5143111563414324, "ewc_loss": 0.05832351744174957, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002682937774807215, "grad_norm": 6.94489049911499, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8496667742729187, "num_tokens": 454237143.0, "step": 11904 }, { "epoch": 1.514438366620023, "ewc_loss": 0.058468226343393326, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026974084903486073, "grad_norm": 6.965817451477051, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8712211847305298, "num_tokens": 454274022.0, "step": 11905 }, { "epoch": 1.5145655768986135, "ewc_loss": 0.05847505107522011, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026980909751728177, "grad_norm": 6.972099781036377, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8587483763694763, "num_tokens": 454307955.0, "step": 11906 }, { "epoch": 1.514692787177204, "ewc_loss": 0.058412909507751465, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026918770163320005, "grad_norm": 6.987924098968506, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8723903894424438, "num_tokens": 454345449.0, "step": 11907 }, { "epoch": 1.5148199974557945, "ewc_loss": 0.058498792350292206, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027004649746231735, "grad_norm": 6.994534015655518, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8515458106994629, "num_tokens": 454383096.0, "step": 11908 }, { "epoch": 1.514947207734385, "ewc_loss": 0.05838216841220856, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026888027787208557, "grad_norm": 6.981115818023682, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8520232439041138, "num_tokens": 454420142.0, "step": 11909 }, { "epoch": 1.5150744180129756, "ewc_loss": 0.05845482647418976, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002696068841032684, "grad_norm": 6.949597358703613, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8627407550811768, "num_tokens": 454454991.0, "step": 11910 }, { "epoch": 1.515201628291566, "ewc_loss": 0.05844360962510109, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002694946888368577, "grad_norm": 7.009530067443848, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8608685731887817, "num_tokens": 454490161.0, "step": 11911 }, { "epoch": 1.5153288385701564, "ewc_loss": 0.05847853049635887, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000269843905698508, "grad_norm": 6.883761882781982, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8557787537574768, "num_tokens": 454534767.0, "step": 11912 }, { "epoch": 1.515456048848747, "ewc_loss": 0.05854519456624985, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002705105289351195, "grad_norm": 7.009916305541992, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8733680248260498, "num_tokens": 454573246.0, "step": 11913 }, { "epoch": 1.5155832591273375, "ewc_loss": 0.05843191221356392, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026937772054225206, "grad_norm": 6.9273858070373535, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8775224089622498, "num_tokens": 454607626.0, "step": 11914 }, { "epoch": 1.515710469405928, "ewc_loss": 0.058606866747140884, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027112726820632815, "grad_norm": 7.025763511657715, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.858296275138855, "num_tokens": 454639840.0, "step": 11915 }, { "epoch": 1.5158376796845185, "ewc_loss": 0.05838877707719803, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026894634356722236, "grad_norm": 6.947829723358154, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8553699254989624, "num_tokens": 454681445.0, "step": 11916 }, { "epoch": 1.515964889963109, "ewc_loss": 0.058524154126644135, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002703001373447478, "grad_norm": 6.993370056152344, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8569358587265015, "num_tokens": 454726191.0, "step": 11917 }, { "epoch": 1.5160921002416994, "ewc_loss": 0.058512140065431595, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002701799967326224, "grad_norm": 6.902119159698486, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.870255708694458, "num_tokens": 454765112.0, "step": 11918 }, { "epoch": 1.51621931052029, "ewc_loss": 0.05857658386230469, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002708244137465954, "grad_norm": 6.992824077606201, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8601475358009338, "num_tokens": 454805254.0, "step": 11919 }, { "epoch": 1.5163465207988804, "ewc_loss": 0.05838019400835037, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026886051637120545, "grad_norm": 6.9501543045043945, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8573105335235596, "num_tokens": 454841344.0, "step": 11920 }, { "epoch": 1.516473731077471, "ewc_loss": 0.05860266461968422, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027108524227514863, "grad_norm": 6.9998087882995605, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8762953281402588, "num_tokens": 454879986.0, "step": 11921 }, { "epoch": 1.5166009413560615, "ewc_loss": 0.058408014476299286, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002691387489903718, "grad_norm": 6.990274429321289, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8549315929412842, "num_tokens": 454917731.0, "step": 11922 }, { "epoch": 1.516728151634652, "ewc_loss": 0.05854221433401108, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002704807266127318, "grad_norm": 6.9140753746032715, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8696693778038025, "num_tokens": 454952837.0, "step": 11923 }, { "epoch": 1.5168553619132426, "ewc_loss": 0.05854488164186478, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002705073857214302, "grad_norm": 7.133054733276367, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8666121959686279, "num_tokens": 454992751.0, "step": 11924 }, { "epoch": 1.516982572191833, "ewc_loss": 0.05837292596697807, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000268787844106555, "grad_norm": 6.897673606872559, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8628299236297607, "num_tokens": 455029829.0, "step": 11925 }, { "epoch": 1.5171097824704236, "ewc_loss": 0.0586271807551384, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002713303838390857, "grad_norm": 6.940820217132568, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8641456365585327, "num_tokens": 455072167.0, "step": 11926 }, { "epoch": 1.5172369927490141, "ewc_loss": 0.05836998671293259, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026875847834162414, "grad_norm": 6.9494099617004395, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.870710015296936, "num_tokens": 455108433.0, "step": 11927 }, { "epoch": 1.5173642030276047, "ewc_loss": 0.05861502140760422, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027120878803543746, "grad_norm": 6.934762954711914, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8675963878631592, "num_tokens": 455148943.0, "step": 11928 }, { "epoch": 1.5174914133061952, "ewc_loss": 0.05853379890322685, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002703965874388814, "grad_norm": 6.977681636810303, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8597938418388367, "num_tokens": 455185099.0, "step": 11929 }, { "epoch": 1.5176186235847857, "ewc_loss": 0.05859458073973656, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002710043918341398, "grad_norm": 6.9618611335754395, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8785247802734375, "num_tokens": 455225322.0, "step": 11930 }, { "epoch": 1.5177458338633762, "ewc_loss": 0.05853375047445297, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002703961217775941, "grad_norm": 6.991454601287842, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8609398603439331, "num_tokens": 455262830.0, "step": 11931 }, { "epoch": 1.5178730441419668, "ewc_loss": 0.058509089052677155, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027014949591830373, "grad_norm": 6.91215181350708, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8711761236190796, "num_tokens": 455300825.0, "step": 11932 }, { "epoch": 1.5180002544205573, "ewc_loss": 0.05858372896909714, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002708958927541971, "grad_norm": 6.995162487030029, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8598729372024536, "num_tokens": 455336209.0, "step": 11933 }, { "epoch": 1.5181274646991478, "ewc_loss": 0.05848495662212372, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002699081669561565, "grad_norm": 6.927216053009033, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.863025426864624, "num_tokens": 455374909.0, "step": 11934 }, { "epoch": 1.5182546749777384, "ewc_loss": 0.05863449349999428, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027140352176502347, "grad_norm": 7.021210670471191, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.876724123954773, "num_tokens": 455417179.0, "step": 11935 }, { "epoch": 1.5183818852563287, "ewc_loss": 0.058466728776693344, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026972588966600597, "grad_norm": 6.926942825317383, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8503482341766357, "num_tokens": 455457923.0, "step": 11936 }, { "epoch": 1.5185090955349192, "ewc_loss": 0.05859120190143585, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027097063139081, "grad_norm": 6.995155334472656, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8551559448242188, "num_tokens": 455492932.0, "step": 11937 }, { "epoch": 1.5186363058135097, "ewc_loss": 0.05854620784521103, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002705206861719489, "grad_norm": 6.901499271392822, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8542238473892212, "num_tokens": 455540392.0, "step": 11938 }, { "epoch": 1.5187635160921003, "ewc_loss": 0.05862872302532196, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027134580886922777, "grad_norm": 7.065144062042236, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8578676581382751, "num_tokens": 455574461.0, "step": 11939 }, { "epoch": 1.5188907263706908, "ewc_loss": 0.058436039835214615, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002694189897738397, "grad_norm": 6.9392924308776855, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8606418967247009, "num_tokens": 455608315.0, "step": 11940 }, { "epoch": 1.5190179366492813, "ewc_loss": 0.05863455682992935, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002714041620492935, "grad_norm": 6.987079620361328, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.86800616979599, "num_tokens": 455654927.0, "step": 11941 }, { "epoch": 1.5191451469278716, "ewc_loss": 0.058482229709625244, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002698808675631881, "grad_norm": 6.946022987365723, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8510745763778687, "num_tokens": 455695745.0, "step": 11942 }, { "epoch": 1.5192723572064621, "ewc_loss": 0.0586392767727375, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027145136846229434, "grad_norm": 6.989635944366455, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8639174699783325, "num_tokens": 455734652.0, "step": 11943 }, { "epoch": 1.5193995674850527, "ewc_loss": 0.058489616960287094, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002699547621887177, "grad_norm": 6.970589637756348, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8413790464401245, "num_tokens": 455776063.0, "step": 11944 }, { "epoch": 1.5195267777636432, "ewc_loss": 0.05854111164808273, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002704696962609887, "grad_norm": 6.95676326751709, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8582403659820557, "num_tokens": 455820595.0, "step": 11945 }, { "epoch": 1.5196539880422337, "ewc_loss": 0.058554813265800476, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000270606717094779, "grad_norm": 6.951571464538574, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8686854839324951, "num_tokens": 455859553.0, "step": 11946 }, { "epoch": 1.5197811983208243, "ewc_loss": 0.058514922857284546, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002702078490983695, "grad_norm": 7.016946315765381, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8648812770843506, "num_tokens": 455892676.0, "step": 11947 }, { "epoch": 1.5199084085994148, "ewc_loss": 0.05850737914443016, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027013238286599517, "grad_norm": 6.92282247543335, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8550822734832764, "num_tokens": 455932106.0, "step": 11948 }, { "epoch": 1.5200356188780053, "ewc_loss": 0.05867660045623779, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027182462508790195, "grad_norm": 7.013014316558838, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8686898946762085, "num_tokens": 455971555.0, "step": 11949 }, { "epoch": 1.5201628291565958, "ewc_loss": 0.05844827741384506, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002695413713809103, "grad_norm": 6.921252250671387, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8586238026618958, "num_tokens": 456014177.0, "step": 11950 }, { "epoch": 1.5202900394351864, "ewc_loss": 0.05872832238674164, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027234182925894856, "grad_norm": 7.041746616363525, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8683210015296936, "num_tokens": 456048229.0, "step": 11951 }, { "epoch": 1.520417249713777, "ewc_loss": 0.05847049504518509, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002697635209187865, "grad_norm": 6.915274620056152, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8828186392784119, "num_tokens": 456085815.0, "step": 11952 }, { "epoch": 1.5205444599923674, "ewc_loss": 0.058725085109472275, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002723094366956502, "grad_norm": 6.982424259185791, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8661152720451355, "num_tokens": 456128971.0, "step": 11953 }, { "epoch": 1.520671670270958, "ewc_loss": 0.058616235852241516, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002712209534365684, "grad_norm": 7.035862445831299, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8466986417770386, "num_tokens": 456159705.0, "step": 11954 }, { "epoch": 1.5207988805495485, "ewc_loss": 0.0585646852850914, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027070543728768826, "grad_norm": 6.980688095092773, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8539851903915405, "num_tokens": 456198251.0, "step": 11955 }, { "epoch": 1.520926090828139, "ewc_loss": 0.05864105373620987, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027146912179887295, "grad_norm": 7.0292253494262695, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8564205169677734, "num_tokens": 456237612.0, "step": 11956 }, { "epoch": 1.5210533011067295, "ewc_loss": 0.05847065895795822, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000269765208940953, "grad_norm": 6.922224998474121, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8626781105995178, "num_tokens": 456273665.0, "step": 11957 }, { "epoch": 1.52118051138532, "ewc_loss": 0.05868854373693466, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027194400900043547, "grad_norm": 7.1136908531188965, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8646855354309082, "num_tokens": 456305467.0, "step": 11958 }, { "epoch": 1.5213077216639106, "ewc_loss": 0.05840499699115753, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026910853921435773, "grad_norm": 6.91246223449707, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.870161771774292, "num_tokens": 456342951.0, "step": 11959 }, { "epoch": 1.521434931942501, "ewc_loss": 0.05870750546455383, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027213364955969155, "grad_norm": 7.093679428100586, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8629273176193237, "num_tokens": 456375846.0, "step": 11960 }, { "epoch": 1.5215621422210914, "ewc_loss": 0.05838534235954285, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002689120010472834, "grad_norm": 7.0081071853637695, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8727002143859863, "num_tokens": 456407993.0, "step": 11961 }, { "epoch": 1.521689352499682, "ewc_loss": 0.058599576354026794, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002710543340072036, "grad_norm": 7.081226348876953, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8449583649635315, "num_tokens": 456444278.0, "step": 11962 }, { "epoch": 1.5218165627782725, "ewc_loss": 0.05842382833361626, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026929687010124326, "grad_norm": 6.9234538078308105, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8716212511062622, "num_tokens": 456483950.0, "step": 11963 }, { "epoch": 1.521943773056863, "ewc_loss": 0.05860696732997894, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027112828684039414, "grad_norm": 7.038327217102051, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8646292686462402, "num_tokens": 456527438.0, "step": 11964 }, { "epoch": 1.5220709833354535, "ewc_loss": 0.058479003608226776, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026984864962287247, "grad_norm": 6.965416431427002, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8746293783187866, "num_tokens": 456560572.0, "step": 11965 }, { "epoch": 1.5221981936140438, "ewc_loss": 0.058650903403759, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000271567638264969, "grad_norm": 7.002800464630127, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8541603684425354, "num_tokens": 456604781.0, "step": 11966 }, { "epoch": 1.5223254038926344, "ewc_loss": 0.05844396352767944, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002694982395041734, "grad_norm": 7.01181173324585, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.876763641834259, "num_tokens": 456639415.0, "step": 11967 }, { "epoch": 1.522452614171225, "ewc_loss": 0.058545637875795364, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002705149818211794, "grad_norm": 7.032516002655029, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8767831325531006, "num_tokens": 456671183.0, "step": 11968 }, { "epoch": 1.5225798244498154, "ewc_loss": 0.05853043869137764, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027036297251470387, "grad_norm": 7.0109357833862305, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8512874841690063, "num_tokens": 456712995.0, "step": 11969 }, { "epoch": 1.522707034728406, "ewc_loss": 0.058561116456985474, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027066978509537876, "grad_norm": 6.994594097137451, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8581250905990601, "num_tokens": 456750587.0, "step": 11970 }, { "epoch": 1.5228342450069965, "ewc_loss": 0.05854598805308342, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002705184742808342, "grad_norm": 7.0280537605285645, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.870943009853363, "num_tokens": 456787503.0, "step": 11971 }, { "epoch": 1.522961455285587, "ewc_loss": 0.058429840952157974, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026935699861496687, "grad_norm": 6.946128845214844, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8412512540817261, "num_tokens": 456831645.0, "step": 11972 }, { "epoch": 1.5230886655641775, "ewc_loss": 0.0586627721786499, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027168632368557155, "grad_norm": 7.010272026062012, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8705631494522095, "num_tokens": 456869143.0, "step": 11973 }, { "epoch": 1.523215875842768, "ewc_loss": 0.058437466621398926, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026943327975459397, "grad_norm": 6.967333793640137, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8615520000457764, "num_tokens": 456906465.0, "step": 11974 }, { "epoch": 1.5233430861213586, "ewc_loss": 0.0585443452000618, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027050203061662614, "grad_norm": 6.9754958152771, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8629364967346191, "num_tokens": 456942048.0, "step": 11975 }, { "epoch": 1.5234702963999491, "ewc_loss": 0.05853551998734474, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002704137878026813, "grad_norm": 6.9855732917785645, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8817068338394165, "num_tokens": 456975710.0, "step": 11976 }, { "epoch": 1.5235975066785397, "ewc_loss": 0.05851684510707855, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002702270576264709, "grad_norm": 6.979089736938477, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8526390194892883, "num_tokens": 457022295.0, "step": 11977 }, { "epoch": 1.5237247169571302, "ewc_loss": 0.05850059539079666, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000270064570941031, "grad_norm": 7.014286518096924, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8551310896873474, "num_tokens": 457060433.0, "step": 11978 }, { "epoch": 1.5238519272357207, "ewc_loss": 0.05851364508271217, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002701950434129685, "grad_norm": 6.990042209625244, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.869827151298523, "num_tokens": 457101760.0, "step": 11979 }, { "epoch": 1.5239791375143112, "ewc_loss": 0.05854614078998566, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027051998768001795, "grad_norm": 6.957890033721924, "learning_rate": 1e-06, "loss": 0.4039, "mean_token_accuracy": 0.8794541358947754, "num_tokens": 457142676.0, "step": 11980 }, { "epoch": 1.5241063477929018, "ewc_loss": 0.05846547335386276, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002697133459150791, "grad_norm": 7.005327224731445, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8706930875778198, "num_tokens": 457179507.0, "step": 11981 }, { "epoch": 1.5242335580714923, "ewc_loss": 0.05853693187236786, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027042790316045284, "grad_norm": 6.9740095138549805, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8661051392555237, "num_tokens": 457221563.0, "step": 11982 }, { "epoch": 1.5243607683500828, "ewc_loss": 0.0583832785487175, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026889139553532004, "grad_norm": 6.98301887512207, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8612185120582581, "num_tokens": 457258634.0, "step": 11983 }, { "epoch": 1.5244879786286734, "ewc_loss": 0.05857837572693825, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027084234170615673, "grad_norm": 6.987786293029785, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8784770369529724, "num_tokens": 457293097.0, "step": 11984 }, { "epoch": 1.5246151889072637, "ewc_loss": 0.05841861665248871, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002692447742447257, "grad_norm": 6.974225997924805, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8776949048042297, "num_tokens": 457333525.0, "step": 11985 }, { "epoch": 1.5247423991858542, "ewc_loss": 0.05856543034315109, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002707128878682852, "grad_norm": 6.951449394226074, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8500310182571411, "num_tokens": 457379772.0, "step": 11986 }, { "epoch": 1.5248696094644447, "ewc_loss": 0.05848731845617294, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026993179926648736, "grad_norm": 6.952611446380615, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8687907457351685, "num_tokens": 457415852.0, "step": 11987 }, { "epoch": 1.5249968197430352, "ewc_loss": 0.05860890820622444, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002711476699914783, "grad_norm": 7.035516738891602, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8686366081237793, "num_tokens": 457449771.0, "step": 11988 }, { "epoch": 1.5251240300216258, "ewc_loss": 0.058358777314424515, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002686463703867048, "grad_norm": 6.89863920211792, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8605060577392578, "num_tokens": 457489601.0, "step": 11989 }, { "epoch": 1.5252512403002163, "ewc_loss": 0.05864918977022171, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002715504670049995, "grad_norm": 7.017615795135498, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8785570859909058, "num_tokens": 457530516.0, "step": 11990 }, { "epoch": 1.5253784505788066, "ewc_loss": 0.05850323289632797, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002700909099075943, "grad_norm": 7.0092549324035645, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8599080443382263, "num_tokens": 457568299.0, "step": 11991 }, { "epoch": 1.5255056608573971, "ewc_loss": 0.05850154161453247, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002700740296859294, "grad_norm": 7.026303291320801, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8624790906906128, "num_tokens": 457600476.0, "step": 11992 }, { "epoch": 1.5256328711359877, "ewc_loss": 0.05850344896316528, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002700930635910481, "grad_norm": 6.9881720542907715, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8832741975784302, "num_tokens": 457632024.0, "step": 11993 }, { "epoch": 1.5257600814145782, "ewc_loss": 0.05852110683917999, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002702696656342596, "grad_norm": 6.987234592437744, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8697011470794678, "num_tokens": 457670805.0, "step": 11994 }, { "epoch": 1.5258872916931687, "ewc_loss": 0.058597542345523834, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002710339904297143, "grad_norm": 7.039523124694824, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8649594783782959, "num_tokens": 457702468.0, "step": 11995 }, { "epoch": 1.5260145019717593, "ewc_loss": 0.05843910574913025, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002694496652111411, "grad_norm": 6.947021961212158, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8729796409606934, "num_tokens": 457740443.0, "step": 11996 }, { "epoch": 1.5261417122503498, "ewc_loss": 0.05851542949676514, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027021291316486895, "grad_norm": 6.98776912689209, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8667645454406738, "num_tokens": 457775833.0, "step": 11997 }, { "epoch": 1.5262689225289403, "ewc_loss": 0.05847953259944916, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026985391741618514, "grad_norm": 6.96491813659668, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.868999719619751, "num_tokens": 457815850.0, "step": 11998 }, { "epoch": 1.5263961328075308, "ewc_loss": 0.05861562490463257, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027121484163217247, "grad_norm": 7.01151704788208, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8716815114021301, "num_tokens": 457854946.0, "step": 11999 }, { "epoch": 1.5265233430861214, "ewc_loss": 0.05847013741731644, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002697599702514708, "grad_norm": 6.948462963104248, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8698444962501526, "num_tokens": 457890775.0, "step": 12000 }, { "epoch": 1.526650553364712, "ewc_loss": 0.058631740510463715, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027137601864524186, "grad_norm": 7.021510601043701, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8690242767333984, "num_tokens": 457928143.0, "step": 12001 }, { "epoch": 1.5267777636433024, "ewc_loss": 0.058458101004362106, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026963959680870175, "grad_norm": 6.940061569213867, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8706884384155273, "num_tokens": 457965999.0, "step": 12002 }, { "epoch": 1.526904973921893, "ewc_loss": 0.058660365641117096, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027166225481778383, "grad_norm": 7.025460720062256, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8634677529335022, "num_tokens": 458006638.0, "step": 12003 }, { "epoch": 1.5270321842004835, "ewc_loss": 0.05842748284339905, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002693334245122969, "grad_norm": 6.935357093811035, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8651247620582581, "num_tokens": 458047522.0, "step": 12004 }, { "epoch": 1.527159394479074, "ewc_loss": 0.058694906532764435, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027200765907764435, "grad_norm": 6.99829626083374, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8644054532051086, "num_tokens": 458091219.0, "step": 12005 }, { "epoch": 1.5272866047576645, "ewc_loss": 0.058444757014513016, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026950615574605763, "grad_norm": 7.001123428344727, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8531181812286377, "num_tokens": 458129208.0, "step": 12006 }, { "epoch": 1.527413815036255, "ewc_loss": 0.05856545269489288, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027071312069892883, "grad_norm": 7.0272746086120605, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8595836162567139, "num_tokens": 458163632.0, "step": 12007 }, { "epoch": 1.5275410253148456, "ewc_loss": 0.05855201184749603, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002705786901060492, "grad_norm": 6.977182865142822, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8581317067146301, "num_tokens": 458202377.0, "step": 12008 }, { "epoch": 1.527668235593436, "ewc_loss": 0.058536119759082794, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002704197831917554, "grad_norm": 6.991061210632324, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8642219305038452, "num_tokens": 458239901.0, "step": 12009 }, { "epoch": 1.5277954458720264, "ewc_loss": 0.058548204600811005, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027054065139964223, "grad_norm": 6.964323997497559, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8582054972648621, "num_tokens": 458280291.0, "step": 12010 }, { "epoch": 1.527922656150617, "ewc_loss": 0.05857768654823303, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027083547320216894, "grad_norm": 6.992703437805176, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.869968831539154, "num_tokens": 458316931.0, "step": 12011 }, { "epoch": 1.5280498664292075, "ewc_loss": 0.0585503876209259, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002705624792724848, "grad_norm": 6.9290056228637695, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8587477207183838, "num_tokens": 458356477.0, "step": 12012 }, { "epoch": 1.528177076707798, "ewc_loss": 0.05859243869781494, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002709830005187541, "grad_norm": 7.002878189086914, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8503438830375671, "num_tokens": 458401340.0, "step": 12013 }, { "epoch": 1.5283042869863885, "ewc_loss": 0.058557361364364624, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027063218294642866, "grad_norm": 6.97868013381958, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8711080551147461, "num_tokens": 458436190.0, "step": 12014 }, { "epoch": 1.5284314972649788, "ewc_loss": 0.05869860202074051, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002720446209423244, "grad_norm": 7.0174078941345215, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8650522232055664, "num_tokens": 458465513.0, "step": 12015 }, { "epoch": 1.5285587075435694, "ewc_loss": 0.058614857494831085, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002712071582209319, "grad_norm": 6.99639368057251, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8612560033798218, "num_tokens": 458505465.0, "step": 12016 }, { "epoch": 1.52868591782216, "ewc_loss": 0.05865245312452316, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027158315060660243, "grad_norm": 6.979137897491455, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8622466325759888, "num_tokens": 458545087.0, "step": 12017 }, { "epoch": 1.5288131281007504, "ewc_loss": 0.0587209053337574, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027226764359511435, "grad_norm": 6.990684986114502, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8727226257324219, "num_tokens": 458583274.0, "step": 12018 }, { "epoch": 1.528940338379341, "ewc_loss": 0.05867740511894226, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000271832657745108, "grad_norm": 6.946915626525879, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8601043820381165, "num_tokens": 458626026.0, "step": 12019 }, { "epoch": 1.5290675486579315, "ewc_loss": 0.058732062578201294, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027237922768108547, "grad_norm": 6.94069242477417, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8773088455200195, "num_tokens": 458668774.0, "step": 12020 }, { "epoch": 1.529194758936522, "ewc_loss": 0.05875657498836517, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002726243401411921, "grad_norm": 6.999101161956787, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8684560060501099, "num_tokens": 458702256.0, "step": 12021 }, { "epoch": 1.5293219692151125, "ewc_loss": 0.058637768030166626, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027143629267811775, "grad_norm": 6.97443962097168, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8377039432525635, "num_tokens": 458741429.0, "step": 12022 }, { "epoch": 1.529449179493703, "ewc_loss": 0.05863653123378754, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002714238944463432, "grad_norm": 6.987455368041992, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8684762716293335, "num_tokens": 458775608.0, "step": 12023 }, { "epoch": 1.5295763897722936, "ewc_loss": 0.058638252317905426, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002714410948101431, "grad_norm": 6.969810485839844, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8675025701522827, "num_tokens": 458811451.0, "step": 12024 }, { "epoch": 1.5297036000508841, "ewc_loss": 0.05865871161222458, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002716457238420844, "grad_norm": 6.998129367828369, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8691086769104004, "num_tokens": 458851761.0, "step": 12025 }, { "epoch": 1.5298308103294747, "ewc_loss": 0.058649078011512756, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002715493901632726, "grad_norm": 6.962281703948975, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.858038067817688, "num_tokens": 458888948.0, "step": 12026 }, { "epoch": 1.5299580206080652, "ewc_loss": 0.05869217589497566, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027198035968467593, "grad_norm": 7.033257961273193, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8627634644508362, "num_tokens": 458922466.0, "step": 12027 }, { "epoch": 1.5300852308866557, "ewc_loss": 0.05864548310637474, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002715134178288281, "grad_norm": 6.974010944366455, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.860645592212677, "num_tokens": 458956947.0, "step": 12028 }, { "epoch": 1.5302124411652462, "ewc_loss": 0.05867009237408638, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027175951981917024, "grad_norm": 6.964736461639404, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8823022842407227, "num_tokens": 458993634.0, "step": 12029 }, { "epoch": 1.5303396514438368, "ewc_loss": 0.05866026505827904, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027166123618371785, "grad_norm": 6.971424579620361, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.873589277267456, "num_tokens": 459034314.0, "step": 12030 }, { "epoch": 1.5304668617224273, "ewc_loss": 0.05870988219976425, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027215739828534424, "grad_norm": 7.007463455200195, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8705190420150757, "num_tokens": 459072117.0, "step": 12031 }, { "epoch": 1.5305940720010178, "ewc_loss": 0.05863001197576523, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027135873096995056, "grad_norm": 6.961526870727539, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8593432903289795, "num_tokens": 459112382.0, "step": 12032 }, { "epoch": 1.5307212822796084, "ewc_loss": 0.05864658206701279, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027152441907674074, "grad_norm": 6.9532904624938965, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8769088983535767, "num_tokens": 459148973.0, "step": 12033 }, { "epoch": 1.5308484925581987, "ewc_loss": 0.05865488946437836, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027160748140886426, "grad_norm": 7.02333927154541, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8607010841369629, "num_tokens": 459183242.0, "step": 12034 }, { "epoch": 1.5309757028367892, "ewc_loss": 0.05866682156920433, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027172680711373687, "grad_norm": 6.954245090484619, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8832622170448303, "num_tokens": 459219606.0, "step": 12035 }, { "epoch": 1.5311029131153797, "ewc_loss": 0.058801740407943726, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027307600248605013, "grad_norm": 7.053910732269287, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8827194571495056, "num_tokens": 459253763.0, "step": 12036 }, { "epoch": 1.5312301233939702, "ewc_loss": 0.0585644394159317, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002707029925659299, "grad_norm": 6.977612018585205, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8679922819137573, "num_tokens": 459291187.0, "step": 12037 }, { "epoch": 1.5313573336725608, "ewc_loss": 0.058682944625616074, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027188804233446717, "grad_norm": 7.04664421081543, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8617435693740845, "num_tokens": 459327423.0, "step": 12038 }, { "epoch": 1.5314845439511513, "ewc_loss": 0.058599941432476044, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002710580301936716, "grad_norm": 6.934402942657471, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8603923320770264, "num_tokens": 459366207.0, "step": 12039 }, { "epoch": 1.5316117542297416, "ewc_loss": 0.058719076216220856, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027224933728575706, "grad_norm": 7.077366828918457, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8565570116043091, "num_tokens": 459408338.0, "step": 12040 }, { "epoch": 1.5317389645083321, "ewc_loss": 0.05859285965561867, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002709871914703399, "grad_norm": 7.03627347946167, "learning_rate": 1e-06, "loss": 0.4204, "mean_token_accuracy": 0.87547767162323, "num_tokens": 459443420.0, "step": 12041 }, { "epoch": 1.5318661747869227, "ewc_loss": 0.058590102940797806, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027095963014289737, "grad_norm": 7.058564186096191, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8482011556625366, "num_tokens": 459475835.0, "step": 12042 }, { "epoch": 1.5319933850655132, "ewc_loss": 0.05852947756648064, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027035336825065315, "grad_norm": 7.016788959503174, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.862650990486145, "num_tokens": 459511993.0, "step": 12043 }, { "epoch": 1.5321205953441037, "ewc_loss": 0.058602411299943924, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002710827102418989, "grad_norm": 7.017311096191406, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8722732067108154, "num_tokens": 459546221.0, "step": 12044 }, { "epoch": 1.5322478056226942, "ewc_loss": 0.05847538262605667, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002698124444577843, "grad_norm": 6.9436726570129395, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8686375617980957, "num_tokens": 459587271.0, "step": 12045 }, { "epoch": 1.5323750159012848, "ewc_loss": 0.058646298944950104, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027152159600518644, "grad_norm": 7.034125804901123, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8532037138938904, "num_tokens": 459623388.0, "step": 12046 }, { "epoch": 1.5325022261798753, "ewc_loss": 0.05852118134498596, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027027042233385146, "grad_norm": 6.952558994293213, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.865550696849823, "num_tokens": 459666440.0, "step": 12047 }, { "epoch": 1.5326294364584658, "ewc_loss": 0.05863553285598755, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000271413940936327, "grad_norm": 7.0245842933654785, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.851909875869751, "num_tokens": 459708373.0, "step": 12048 }, { "epoch": 1.5327566467370564, "ewc_loss": 0.05855304002761841, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027058899286203086, "grad_norm": 6.993829250335693, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8753176331520081, "num_tokens": 459746707.0, "step": 12049 }, { "epoch": 1.532883857015647, "ewc_loss": 0.05861130356788635, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027117165154777467, "grad_norm": 6.947576999664307, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8701885938644409, "num_tokens": 459792493.0, "step": 12050 }, { "epoch": 1.5330110672942374, "ewc_loss": 0.05873303487896919, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000272388948360458, "grad_norm": 7.035261631011963, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8730446100234985, "num_tokens": 459828897.0, "step": 12051 }, { "epoch": 1.533138277572828, "ewc_loss": 0.05857301130890846, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027078870334662497, "grad_norm": 6.975185394287109, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8566105365753174, "num_tokens": 459866274.0, "step": 12052 }, { "epoch": 1.5332654878514185, "ewc_loss": 0.05870045721530914, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027206316008232534, "grad_norm": 7.011080741882324, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8802202343940735, "num_tokens": 459902442.0, "step": 12053 }, { "epoch": 1.533392698130009, "ewc_loss": 0.05871956795454025, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002722542849369347, "grad_norm": 7.009643077850342, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8723604679107666, "num_tokens": 459943331.0, "step": 12054 }, { "epoch": 1.5335199084085995, "ewc_loss": 0.058619964867830276, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027125823544338346, "grad_norm": 7.029610633850098, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8706637620925903, "num_tokens": 459975349.0, "step": 12055 }, { "epoch": 1.53364711868719, "ewc_loss": 0.05873383954167366, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002723969810176641, "grad_norm": 7.053025722503662, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8785176277160645, "num_tokens": 460008295.0, "step": 12056 }, { "epoch": 1.5337743289657806, "ewc_loss": 0.05858506262302399, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027090919320471585, "grad_norm": 7.0063676834106445, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8690158128738403, "num_tokens": 460045892.0, "step": 12057 }, { "epoch": 1.533901539244371, "ewc_loss": 0.05861344560980797, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027119304286316037, "grad_norm": 7.000438213348389, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.864670991897583, "num_tokens": 460084255.0, "step": 12058 }, { "epoch": 1.5340287495229614, "ewc_loss": 0.05861253663897514, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027118396246805787, "grad_norm": 6.976432800292969, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8514083623886108, "num_tokens": 460123706.0, "step": 12059 }, { "epoch": 1.534155959801552, "ewc_loss": 0.05865053832530975, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002715640002861619, "grad_norm": 7.052393436431885, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8512433767318726, "num_tokens": 460162351.0, "step": 12060 }, { "epoch": 1.5342831700801425, "ewc_loss": 0.058601152151823044, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027107010828331113, "grad_norm": 7.0204997062683105, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.855745255947113, "num_tokens": 460200135.0, "step": 12061 }, { "epoch": 1.534410380358733, "ewc_loss": 0.05857253074645996, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002707839012145996, "grad_norm": 6.970168113708496, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8586276769638062, "num_tokens": 460243585.0, "step": 12062 }, { "epoch": 1.5345375906373235, "ewc_loss": 0.05864797160029411, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002715383016038686, "grad_norm": 7.0114240646362305, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.857433557510376, "num_tokens": 460283695.0, "step": 12063 }, { "epoch": 1.5346648009159138, "ewc_loss": 0.05847527086734772, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000269811280304566, "grad_norm": 6.980048179626465, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8732333779335022, "num_tokens": 460317241.0, "step": 12064 }, { "epoch": 1.5347920111945044, "ewc_loss": 0.058616362512111664, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000271222204901278, "grad_norm": 7.025353908538818, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8672693371772766, "num_tokens": 460351900.0, "step": 12065 }, { "epoch": 1.534919221473095, "ewc_loss": 0.058540839701890945, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027046698960475624, "grad_norm": 6.983643531799316, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8665536046028137, "num_tokens": 460391672.0, "step": 12066 }, { "epoch": 1.5350464317516854, "ewc_loss": 0.05868663638830185, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027192497509531677, "grad_norm": 7.031952857971191, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.857517421245575, "num_tokens": 460434120.0, "step": 12067 }, { "epoch": 1.535173642030276, "ewc_loss": 0.05855347961187363, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027059338754042983, "grad_norm": 7.026093006134033, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8660545945167542, "num_tokens": 460474214.0, "step": 12068 }, { "epoch": 1.5353008523088665, "ewc_loss": 0.05855651572346687, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002706237428355962, "grad_norm": 6.9687819480896, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8702487945556641, "num_tokens": 460507549.0, "step": 12069 }, { "epoch": 1.535428062587457, "ewc_loss": 0.058655764907598495, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027161624166183174, "grad_norm": 7.089359760284424, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8581190705299377, "num_tokens": 460546391.0, "step": 12070 }, { "epoch": 1.5355552728660475, "ewc_loss": 0.05848977714776993, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002699563920032233, "grad_norm": 6.94687032699585, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8537343740463257, "num_tokens": 460585232.0, "step": 12071 }, { "epoch": 1.535682483144638, "ewc_loss": 0.058798909187316895, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002730476844590157, "grad_norm": 7.069918155670166, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8718288540840149, "num_tokens": 460624398.0, "step": 12072 }, { "epoch": 1.5358096934232286, "ewc_loss": 0.05850697681307793, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027012836653739214, "grad_norm": 6.900386810302734, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8568065166473389, "num_tokens": 460667193.0, "step": 12073 }, { "epoch": 1.5359369037018191, "ewc_loss": 0.05881398916244507, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002731985005084425, "grad_norm": 7.075528144836426, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8408406972885132, "num_tokens": 460710614.0, "step": 12074 }, { "epoch": 1.5360641139804097, "ewc_loss": 0.05852610990405083, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002703196951188147, "grad_norm": 6.955540657043457, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8606166839599609, "num_tokens": 460751079.0, "step": 12075 }, { "epoch": 1.5361913242590002, "ewc_loss": 0.05875573307275772, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002726159291341901, "grad_norm": 7.032193660736084, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8656497001647949, "num_tokens": 460786768.0, "step": 12076 }, { "epoch": 1.5363185345375907, "ewc_loss": 0.05868270993232727, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002718856849242002, "grad_norm": 6.997818946838379, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8540799021720886, "num_tokens": 460824757.0, "step": 12077 }, { "epoch": 1.5364457448161812, "ewc_loss": 0.0586487278342247, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027154586859978735, "grad_norm": 6.996790409088135, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.857407808303833, "num_tokens": 460865024.0, "step": 12078 }, { "epoch": 1.5365729550947718, "ewc_loss": 0.05867315083742142, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027179010794498026, "grad_norm": 7.079866886138916, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8542419672012329, "num_tokens": 460900895.0, "step": 12079 }, { "epoch": 1.5367001653733623, "ewc_loss": 0.05857158452272415, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027077444246970117, "grad_norm": 6.953771114349365, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8697954416275024, "num_tokens": 460936876.0, "step": 12080 }, { "epoch": 1.5368273756519528, "ewc_loss": 0.05874297767877579, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002724883670452982, "grad_norm": 7.00374698638916, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8489775657653809, "num_tokens": 460975297.0, "step": 12081 }, { "epoch": 1.5369545859305433, "ewc_loss": 0.05859534442424774, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027101204614154994, "grad_norm": 6.979894161224365, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8529406785964966, "num_tokens": 461008603.0, "step": 12082 }, { "epoch": 1.5370817962091337, "ewc_loss": 0.058758415281772614, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027264273376204073, "grad_norm": 7.024142265319824, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8558175563812256, "num_tokens": 461047770.0, "step": 12083 }, { "epoch": 1.5372090064877242, "ewc_loss": 0.05870307236909866, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002720892953220755, "grad_norm": 7.015473365783691, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8513697385787964, "num_tokens": 461088554.0, "step": 12084 }, { "epoch": 1.5373362167663147, "ewc_loss": 0.05876901373267174, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027274872991256416, "grad_norm": 7.03713846206665, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8421943187713623, "num_tokens": 461126070.0, "step": 12085 }, { "epoch": 1.5374634270449052, "ewc_loss": 0.05865244194865227, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027158300508745015, "grad_norm": 7.049005031585693, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8614331483840942, "num_tokens": 461166394.0, "step": 12086 }, { "epoch": 1.5375906373234958, "ewc_loss": 0.058680325746536255, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002718618488870561, "grad_norm": 6.964309215545654, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8562475442886353, "num_tokens": 461208263.0, "step": 12087 }, { "epoch": 1.5377178476020863, "ewc_loss": 0.05871523916721344, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002722109784372151, "grad_norm": 7.020803928375244, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8745098114013672, "num_tokens": 461245054.0, "step": 12088 }, { "epoch": 1.5378450578806766, "ewc_loss": 0.058689624071121216, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002719548356253654, "grad_norm": 7.058094501495361, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8544054627418518, "num_tokens": 461281584.0, "step": 12089 }, { "epoch": 1.5379722681592671, "ewc_loss": 0.058709871023893356, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027215731097385287, "grad_norm": 6.981339454650879, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8728110194206238, "num_tokens": 461322247.0, "step": 12090 }, { "epoch": 1.5380994784378577, "ewc_loss": 0.05868940055370331, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027195262373425066, "grad_norm": 7.072755813598633, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8520016670227051, "num_tokens": 461358371.0, "step": 12091 }, { "epoch": 1.5382266887164482, "ewc_loss": 0.0585869662463665, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000270928256213665, "grad_norm": 6.948157787322998, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8714938759803772, "num_tokens": 461394666.0, "step": 12092 }, { "epoch": 1.5383538989950387, "ewc_loss": 0.05873638391494751, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027242241776548326, "grad_norm": 7.012322425842285, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.876657247543335, "num_tokens": 461430875.0, "step": 12093 }, { "epoch": 1.5384811092736292, "ewc_loss": 0.058549925684928894, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027055785176344216, "grad_norm": 7.004730701446533, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8597632050514221, "num_tokens": 461467412.0, "step": 12094 }, { "epoch": 1.5386083195522198, "ewc_loss": 0.058673568069934845, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027179429889656603, "grad_norm": 7.047763347625732, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8438597917556763, "num_tokens": 461501970.0, "step": 12095 }, { "epoch": 1.5387355298308103, "ewc_loss": 0.058608122169971466, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000271139811957255, "grad_norm": 7.00139045715332, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8517806529998779, "num_tokens": 461542735.0, "step": 12096 }, { "epoch": 1.5388627401094008, "ewc_loss": 0.05870269238948822, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027208554092794657, "grad_norm": 7.032754421234131, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.86563640832901, "num_tokens": 461583883.0, "step": 12097 }, { "epoch": 1.5389899503879914, "ewc_loss": 0.05863627418875694, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000271421333309263, "grad_norm": 6.997464179992676, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8560752868652344, "num_tokens": 461625124.0, "step": 12098 }, { "epoch": 1.5391171606665819, "ewc_loss": 0.05871044844388962, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027216310263611376, "grad_norm": 7.033327102661133, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8545217514038086, "num_tokens": 461660273.0, "step": 12099 }, { "epoch": 1.5392443709451724, "ewc_loss": 0.05862296372652054, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002712882123887539, "grad_norm": 7.0599684715271, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8590336441993713, "num_tokens": 461698890.0, "step": 12100 }, { "epoch": 1.539371581223763, "ewc_loss": 0.05863308161497116, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002713894355110824, "grad_norm": 7.008937358856201, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8629058003425598, "num_tokens": 461736927.0, "step": 12101 }, { "epoch": 1.5394987915023535, "ewc_loss": 0.05866080895066261, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027166667860001326, "grad_norm": 7.0746870040893555, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8606503009796143, "num_tokens": 461777796.0, "step": 12102 }, { "epoch": 1.539626001780944, "ewc_loss": 0.05859558656811714, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027101446175947785, "grad_norm": 6.980742931365967, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8722039461135864, "num_tokens": 461814656.0, "step": 12103 }, { "epoch": 1.5397532120595345, "ewc_loss": 0.05864211171865463, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002714797155931592, "grad_norm": 7.020992279052734, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8457515835762024, "num_tokens": 461852186.0, "step": 12104 }, { "epoch": 1.539880422338125, "ewc_loss": 0.058579400181770325, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002708525862544775, "grad_norm": 7.0072340965271, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8775144815444946, "num_tokens": 461893724.0, "step": 12105 }, { "epoch": 1.5400076326167156, "ewc_loss": 0.05867976322770119, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027185623184777796, "grad_norm": 7.006619453430176, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8507777452468872, "num_tokens": 461935380.0, "step": 12106 }, { "epoch": 1.5401348428953059, "ewc_loss": 0.0585249699652195, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027030828641727567, "grad_norm": 7.01968240737915, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8702006340026855, "num_tokens": 461971463.0, "step": 12107 }, { "epoch": 1.5402620531738964, "ewc_loss": 0.05861244350671768, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002711830020416528, "grad_norm": 7.034590721130371, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8701813817024231, "num_tokens": 462007514.0, "step": 12108 }, { "epoch": 1.540389263452487, "ewc_loss": 0.05851074308156967, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027016602689400315, "grad_norm": 6.949415683746338, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.876878559589386, "num_tokens": 462046022.0, "step": 12109 }, { "epoch": 1.5405164737310775, "ewc_loss": 0.058652572333812714, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002715843147598207, "grad_norm": 6.978534698486328, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8684650659561157, "num_tokens": 462089367.0, "step": 12110 }, { "epoch": 1.540643684009668, "ewc_loss": 0.0586138591170311, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027119717560708523, "grad_norm": 6.951698303222656, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8693071603775024, "num_tokens": 462128558.0, "step": 12111 }, { "epoch": 1.5407708942882585, "ewc_loss": 0.058656156063079834, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002716201706789434, "grad_norm": 7.067413330078125, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.871239185333252, "num_tokens": 462162639.0, "step": 12112 }, { "epoch": 1.5408981045668488, "ewc_loss": 0.05850902572274208, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002701488556340337, "grad_norm": 6.975846767425537, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8639863133430481, "num_tokens": 462199303.0, "step": 12113 }, { "epoch": 1.5410253148454394, "ewc_loss": 0.05870974436402321, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002721560304053128, "grad_norm": 7.03487491607666, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8601909875869751, "num_tokens": 462246095.0, "step": 12114 }, { "epoch": 1.54115252512403, "ewc_loss": 0.0585169792175293, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027022839640267193, "grad_norm": 6.999274253845215, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8606463670730591, "num_tokens": 462286653.0, "step": 12115 }, { "epoch": 1.5412797354026204, "ewc_loss": 0.05870194733142853, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027207809034734964, "grad_norm": 7.0561747550964355, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8612033128738403, "num_tokens": 462328295.0, "step": 12116 }, { "epoch": 1.541406945681211, "ewc_loss": 0.05857507139444351, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027080930885858834, "grad_norm": 6.988114833831787, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8646236062049866, "num_tokens": 462365175.0, "step": 12117 }, { "epoch": 1.5415341559598015, "ewc_loss": 0.05864632874727249, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000271521887043491, "grad_norm": 7.046122074127197, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8568322658538818, "num_tokens": 462407044.0, "step": 12118 }, { "epoch": 1.541661366238392, "ewc_loss": 0.05854656919836998, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027052429504692554, "grad_norm": 7.0173773765563965, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8570538759231567, "num_tokens": 462443819.0, "step": 12119 }, { "epoch": 1.5417885765169825, "ewc_loss": 0.05868715047836304, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027193009736947715, "grad_norm": 7.039451599121094, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8734264969825745, "num_tokens": 462487319.0, "step": 12120 }, { "epoch": 1.541915786795573, "ewc_loss": 0.058655016124248505, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027160876197740436, "grad_norm": 7.0141425132751465, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8658433556556702, "num_tokens": 462527026.0, "step": 12121 }, { "epoch": 1.5420429970741636, "ewc_loss": 0.058620110154151917, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027125971973873675, "grad_norm": 7.138936996459961, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.856245756149292, "num_tokens": 462557552.0, "step": 12122 }, { "epoch": 1.5421702073527541, "ewc_loss": 0.05849341303110123, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002699927135836333, "grad_norm": 6.95233154296875, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8662806749343872, "num_tokens": 462595243.0, "step": 12123 }, { "epoch": 1.5422974176313446, "ewc_loss": 0.05869952589273453, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002720538468565792, "grad_norm": 7.104682922363281, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8669157028198242, "num_tokens": 462636077.0, "step": 12124 }, { "epoch": 1.5424246279099352, "ewc_loss": 0.05855289846658707, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002705875667743385, "grad_norm": 6.972868919372559, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8755465149879456, "num_tokens": 462668407.0, "step": 12125 }, { "epoch": 1.5425518381885257, "ewc_loss": 0.058757126331329346, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027262986986897886, "grad_norm": 7.088717460632324, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8641258478164673, "num_tokens": 462707678.0, "step": 12126 }, { "epoch": 1.5426790484671162, "ewc_loss": 0.05853830277919769, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002704416401684284, "grad_norm": 6.997500419616699, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.856637716293335, "num_tokens": 462739722.0, "step": 12127 }, { "epoch": 1.5428062587457068, "ewc_loss": 0.05877431109547615, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002728016988839954, "grad_norm": 7.100863933563232, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8711709380149841, "num_tokens": 462770369.0, "step": 12128 }, { "epoch": 1.5429334690242973, "ewc_loss": 0.05857418477535248, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002708004612941295, "grad_norm": 7.043955326080322, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.870107889175415, "num_tokens": 462806126.0, "step": 12129 }, { "epoch": 1.5430606793028878, "ewc_loss": 0.058656930923461914, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002716279122978449, "grad_norm": 7.046375274658203, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8662390112876892, "num_tokens": 462846585.0, "step": 12130 }, { "epoch": 1.5431878895814783, "ewc_loss": 0.0586528480052948, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027158705051988363, "grad_norm": 7.051869869232178, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8598646521568298, "num_tokens": 462889892.0, "step": 12131 }, { "epoch": 1.5433150998600687, "ewc_loss": 0.058633144944906235, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000271390046691522, "grad_norm": 7.043831825256348, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8641974329948425, "num_tokens": 462927612.0, "step": 12132 }, { "epoch": 1.5434423101386592, "ewc_loss": 0.05862654373049736, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027132403920404613, "grad_norm": 7.025102615356445, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8676071166992188, "num_tokens": 462971661.0, "step": 12133 }, { "epoch": 1.5435695204172497, "ewc_loss": 0.058619920164346695, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002712577988859266, "grad_norm": 6.990262031555176, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8682147264480591, "num_tokens": 463011260.0, "step": 12134 }, { "epoch": 1.5436967306958402, "ewc_loss": 0.0587114542722702, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027217314345762134, "grad_norm": 7.120641708374023, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8695418238639832, "num_tokens": 463045395.0, "step": 12135 }, { "epoch": 1.5438239409744308, "ewc_loss": 0.05856578052043915, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027071640943177044, "grad_norm": 7.021388053894043, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8537241816520691, "num_tokens": 463081015.0, "step": 12136 }, { "epoch": 1.5439511512530213, "ewc_loss": 0.058803267776966095, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027309125289320946, "grad_norm": 7.206563472747803, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8630645871162415, "num_tokens": 463115658.0, "step": 12137 }, { "epoch": 1.5440783615316116, "ewc_loss": 0.058493275195360184, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026999134570360184, "grad_norm": 6.9418721199035645, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8752520084381104, "num_tokens": 463152084.0, "step": 12138 }, { "epoch": 1.5442055718102021, "ewc_loss": 0.058958500623703, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002746435930021107, "grad_norm": 7.139250755310059, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8553693294525146, "num_tokens": 463190137.0, "step": 12139 }, { "epoch": 1.5443327820887927, "ewc_loss": 0.05855283886194229, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027058698469772935, "grad_norm": 7.029160976409912, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8634724617004395, "num_tokens": 463227626.0, "step": 12140 }, { "epoch": 1.5444599923673832, "ewc_loss": 0.05884835124015808, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002735421003308147, "grad_norm": 7.110515594482422, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8650937080383301, "num_tokens": 463266451.0, "step": 12141 }, { "epoch": 1.5445872026459737, "ewc_loss": 0.058596447110176086, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002710230473894626, "grad_norm": 7.030591011047363, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8512164354324341, "num_tokens": 463305327.0, "step": 12142 }, { "epoch": 1.5447144129245642, "ewc_loss": 0.05873802304267883, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002724388032220304, "grad_norm": 7.129264831542969, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8608270287513733, "num_tokens": 463345890.0, "step": 12143 }, { "epoch": 1.5448416232031548, "ewc_loss": 0.058567486703395844, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027073343517258763, "grad_norm": 7.011202812194824, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8688954710960388, "num_tokens": 463381583.0, "step": 12144 }, { "epoch": 1.5449688334817453, "ewc_loss": 0.058638229966163635, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002714408910833299, "grad_norm": 7.087563991546631, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8729137182235718, "num_tokens": 463414442.0, "step": 12145 }, { "epoch": 1.5450960437603358, "ewc_loss": 0.05854853242635727, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027054394013248384, "grad_norm": 7.063345909118652, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.869953453540802, "num_tokens": 463449517.0, "step": 12146 }, { "epoch": 1.5452232540389264, "ewc_loss": 0.058651238679885864, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002715709852054715, "grad_norm": 7.0169291496276855, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8713430166244507, "num_tokens": 463487952.0, "step": 12147 }, { "epoch": 1.5453504643175169, "ewc_loss": 0.05854463577270508, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002705049701035023, "grad_norm": 7.1583147048950195, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8651750087738037, "num_tokens": 463520724.0, "step": 12148 }, { "epoch": 1.5454776745961074, "ewc_loss": 0.05849793553352356, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002700379700399935, "grad_norm": 6.96760368347168, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8606473207473755, "num_tokens": 463564691.0, "step": 12149 }, { "epoch": 1.545604884874698, "ewc_loss": 0.05870163440704346, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027207491802982986, "grad_norm": 7.203112602233887, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8598787188529968, "num_tokens": 463599037.0, "step": 12150 }, { "epoch": 1.5457320951532885, "ewc_loss": 0.058497462421655655, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002700332261156291, "grad_norm": 7.1512770652771, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.850060224533081, "num_tokens": 463630744.0, "step": 12151 }, { "epoch": 1.545859305431879, "ewc_loss": 0.05843573808670044, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002694159629754722, "grad_norm": 7.003905296325684, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.861103892326355, "num_tokens": 463663551.0, "step": 12152 }, { "epoch": 1.5459865157104695, "ewc_loss": 0.058612145483493805, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027118006255477667, "grad_norm": 7.124000549316406, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8753108382225037, "num_tokens": 463702446.0, "step": 12153 }, { "epoch": 1.54611372598906, "ewc_loss": 0.05834149196743965, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00026847352273762226, "grad_norm": 7.030094623565674, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.854485809803009, "num_tokens": 463739324.0, "step": 12154 }, { "epoch": 1.5462409362676506, "ewc_loss": 0.05854703485965729, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002705289516597986, "grad_norm": 7.043741703033447, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8445521593093872, "num_tokens": 463780765.0, "step": 12155 }, { "epoch": 1.5463681465462409, "ewc_loss": 0.05844621732831001, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002695207658689469, "grad_norm": 7.036332130432129, "learning_rate": 1e-06, "loss": 0.5401, "mean_token_accuracy": 0.8430615663528442, "num_tokens": 463817727.0, "step": 12156 }, { "epoch": 1.5464953568248314, "ewc_loss": 0.058627840131521225, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002713369904085994, "grad_norm": 7.007315635681152, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8790527582168579, "num_tokens": 463865793.0, "step": 12157 }, { "epoch": 1.546622567103422, "ewc_loss": 0.0586051344871521, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027110992232337594, "grad_norm": 6.991202354431152, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8649761080741882, "num_tokens": 463908299.0, "step": 12158 }, { "epoch": 1.5467497773820125, "ewc_loss": 0.05860960856080055, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002711546840146184, "grad_norm": 7.080389976501465, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8600406646728516, "num_tokens": 463944513.0, "step": 12159 }, { "epoch": 1.546876987660603, "ewc_loss": 0.058535851538181305, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027041713474318385, "grad_norm": 7.049936294555664, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8565429449081421, "num_tokens": 463982862.0, "step": 12160 }, { "epoch": 1.5470041979391935, "ewc_loss": 0.05850948020815849, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027015339583158493, "grad_norm": 6.970065116882324, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.869024395942688, "num_tokens": 464023357.0, "step": 12161 }, { "epoch": 1.5471314082177838, "ewc_loss": 0.05864359438419342, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027149455854669213, "grad_norm": 7.124114990234375, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8633642196655273, "num_tokens": 464071889.0, "step": 12162 }, { "epoch": 1.5472586184963744, "ewc_loss": 0.05856788158416748, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002707374223973602, "grad_norm": 7.033517360687256, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8510823249816895, "num_tokens": 464112334.0, "step": 12163 }, { "epoch": 1.5473858287749649, "ewc_loss": 0.058615878224372864, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027121740276925266, "grad_norm": 7.068882942199707, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8584082126617432, "num_tokens": 464156833.0, "step": 12164 }, { "epoch": 1.5475130390535554, "ewc_loss": 0.05828698351979256, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027036984101869166, "grad_norm": 7.163625240325928, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8658028244972229, "num_tokens": 464192989.0, "step": 12165 }, { "epoch": 1.547640249332146, "ewc_loss": 0.05828625336289406, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000270362535957247, "grad_norm": 7.016097545623779, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8686277866363525, "num_tokens": 464232855.0, "step": 12166 }, { "epoch": 1.5477674596107365, "ewc_loss": 0.05865509808063507, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002716095477808267, "grad_norm": 7.0582075119018555, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8479371666908264, "num_tokens": 464271860.0, "step": 12167 }, { "epoch": 1.547894669889327, "ewc_loss": 0.05830003693699837, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002705003716982901, "grad_norm": 7.082056522369385, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.875647783279419, "num_tokens": 464312925.0, "step": 12168 }, { "epoch": 1.5480218801679175, "ewc_loss": 0.058514781296253204, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027020639390684664, "grad_norm": 7.039062023162842, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8583422303199768, "num_tokens": 464349190.0, "step": 12169 }, { "epoch": 1.548149090446508, "ewc_loss": 0.05868903547525406, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027194892754778266, "grad_norm": 7.054986476898193, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.859434962272644, "num_tokens": 464392031.0, "step": 12170 }, { "epoch": 1.5482763007250986, "ewc_loss": 0.05856472998857498, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002707058738451451, "grad_norm": 7.127670764923096, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8596771955490112, "num_tokens": 464424884.0, "step": 12171 }, { "epoch": 1.5484035110036891, "ewc_loss": 0.058501243591308594, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002700710319913924, "grad_norm": 7.03037166595459, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8585992455482483, "num_tokens": 464466047.0, "step": 12172 }, { "epoch": 1.5485307212822796, "ewc_loss": 0.05864168703556061, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002714754664339125, "grad_norm": 7.118074893951416, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8628516793251038, "num_tokens": 464497293.0, "step": 12173 }, { "epoch": 1.5486579315608702, "ewc_loss": 0.05849307030439377, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002699892793316394, "grad_norm": 7.0866312980651855, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8490300178527832, "num_tokens": 464541873.0, "step": 12174 }, { "epoch": 1.5487851418394607, "ewc_loss": 0.05865854397416115, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002716440358199179, "grad_norm": 7.060683250427246, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8631219863891602, "num_tokens": 464579712.0, "step": 12175 }, { "epoch": 1.5489123521180512, "ewc_loss": 0.058555882424116135, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002706174273043871, "grad_norm": 7.075364112854004, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.861677885055542, "num_tokens": 464612277.0, "step": 12176 }, { "epoch": 1.5490395623966418, "ewc_loss": 0.05866974964737892, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027175608556717634, "grad_norm": 7.075331211090088, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8593264818191528, "num_tokens": 464647062.0, "step": 12177 }, { "epoch": 1.5491667726752323, "ewc_loss": 0.058540016412734985, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000270458753220737, "grad_norm": 7.022741317749023, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8661705255508423, "num_tokens": 464677998.0, "step": 12178 }, { "epoch": 1.5492939829538228, "ewc_loss": 0.058737583458423615, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002724344376474619, "grad_norm": 7.056934356689453, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8559465408325195, "num_tokens": 464719205.0, "step": 12179 }, { "epoch": 1.5494211932324133, "ewc_loss": 0.05859873443841934, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002710459230002016, "grad_norm": 7.007903575897217, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8681822419166565, "num_tokens": 464760758.0, "step": 12180 }, { "epoch": 1.5495484035110036, "ewc_loss": 0.05877768248319626, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002728354011196643, "grad_norm": 7.0513596534729, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8666404485702515, "num_tokens": 464797422.0, "step": 12181 }, { "epoch": 1.5496756137895942, "ewc_loss": 0.05875331908464432, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000272591772954911, "grad_norm": 7.043200969696045, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8642280101776123, "num_tokens": 464832889.0, "step": 12182 }, { "epoch": 1.5498028240681847, "ewc_loss": 0.05881745368242264, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027323313406668603, "grad_norm": 7.078127861022949, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8565870523452759, "num_tokens": 464867743.0, "step": 12183 }, { "epoch": 1.5499300343467752, "ewc_loss": 0.05883046239614487, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002733631990849972, "grad_norm": 7.027724742889404, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8513243794441223, "num_tokens": 464904666.0, "step": 12184 }, { "epoch": 1.5500572446253658, "ewc_loss": 0.058950334787368774, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002745619567576796, "grad_norm": 7.0671515464782715, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8678441047668457, "num_tokens": 464942886.0, "step": 12185 }, { "epoch": 1.5501844549039563, "ewc_loss": 0.0588814802467823, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002738733892329037, "grad_norm": 7.030674934387207, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8637425899505615, "num_tokens": 464977463.0, "step": 12186 }, { "epoch": 1.5503116651825466, "ewc_loss": 0.058986928313970566, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002749278792180121, "grad_norm": 7.049569129943848, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8726136088371277, "num_tokens": 465015732.0, "step": 12187 }, { "epoch": 1.5504388754611371, "ewc_loss": 0.058909665793180466, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002741552598308772, "grad_norm": 7.025768756866455, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8732101321220398, "num_tokens": 465060789.0, "step": 12188 }, { "epoch": 1.5505660857397277, "ewc_loss": 0.058949925005435944, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002745578240137547, "grad_norm": 7.103598117828369, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8603456616401672, "num_tokens": 465097907.0, "step": 12189 }, { "epoch": 1.5506932960183182, "ewc_loss": 0.0588192380964756, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000273250974714756, "grad_norm": 7.021373748779297, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8718580007553101, "num_tokens": 465139401.0, "step": 12190 }, { "epoch": 1.5508205062969087, "ewc_loss": 0.058883629739284515, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027389489696361125, "grad_norm": 7.064472675323486, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8419941663742065, "num_tokens": 465176531.0, "step": 12191 }, { "epoch": 1.5509477165754992, "ewc_loss": 0.05873776227235794, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002724362420849502, "grad_norm": 7.014195442199707, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8726046681404114, "num_tokens": 465216074.0, "step": 12192 }, { "epoch": 1.5510749268540898, "ewc_loss": 0.05908169969916344, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758755872491747, "grad_norm": 7.112344264984131, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8779053092002869, "num_tokens": 465257986.0, "step": 12193 }, { "epoch": 1.5512021371326803, "ewc_loss": 0.058739908039569855, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027245766250416636, "grad_norm": 7.002674102783203, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8665323257446289, "num_tokens": 465296038.0, "step": 12194 }, { "epoch": 1.5513293474112708, "ewc_loss": 0.059003956615924835, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000275098136626184, "grad_norm": 7.117857456207275, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8401033282279968, "num_tokens": 465334215.0, "step": 12195 }, { "epoch": 1.5514565576898613, "ewc_loss": 0.0587700679898262, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027275929460301995, "grad_norm": 7.0586838722229, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8620181679725647, "num_tokens": 465373362.0, "step": 12196 }, { "epoch": 1.5515837679684519, "ewc_loss": 0.058875374495983124, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027381235850043595, "grad_norm": 7.055138111114502, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.857915997505188, "num_tokens": 465410181.0, "step": 12197 }, { "epoch": 1.5517109782470424, "ewc_loss": 0.05890018492937088, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002740604686550796, "grad_norm": 7.0926947593688965, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8700363636016846, "num_tokens": 465443056.0, "step": 12198 }, { "epoch": 1.551838188525633, "ewc_loss": 0.058880794793367386, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002738665498327464, "grad_norm": 7.043426036834717, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8667905330657959, "num_tokens": 465484509.0, "step": 12199 }, { "epoch": 1.5519653988042235, "ewc_loss": 0.05887128412723541, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002737714385148138, "grad_norm": 7.078723907470703, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8619840741157532, "num_tokens": 465520017.0, "step": 12200 }, { "epoch": 1.552092609082814, "ewc_loss": 0.05880480632185936, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027310664881952107, "grad_norm": 7.039158821105957, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8797101974487305, "num_tokens": 465557221.0, "step": 12201 }, { "epoch": 1.5522198193614045, "ewc_loss": 0.058887992054224014, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002739385236054659, "grad_norm": 7.083398342132568, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8723697662353516, "num_tokens": 465595455.0, "step": 12202 }, { "epoch": 1.552347029639995, "ewc_loss": 0.058782532811164856, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002728839172050357, "grad_norm": 7.012844562530518, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8646633625030518, "num_tokens": 465636436.0, "step": 12203 }, { "epoch": 1.5524742399185856, "ewc_loss": 0.05888114124536514, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002738699840847403, "grad_norm": 7.120123863220215, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8598644733428955, "num_tokens": 465668956.0, "step": 12204 }, { "epoch": 1.5526014501971759, "ewc_loss": 0.05877970904111862, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027285568648949265, "grad_norm": 7.03183650970459, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8451499342918396, "num_tokens": 465708812.0, "step": 12205 }, { "epoch": 1.5527286604757664, "ewc_loss": 0.058896083384752274, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002740194322541356, "grad_norm": 7.172367572784424, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.852660059928894, "num_tokens": 465742610.0, "step": 12206 }, { "epoch": 1.552855870754357, "ewc_loss": 0.05871658772230148, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027222445351071656, "grad_norm": 7.061492443084717, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8630818128585815, "num_tokens": 465776468.0, "step": 12207 }, { "epoch": 1.5529830810329475, "ewc_loss": 0.05883283168077469, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002733869187068194, "grad_norm": 7.159667015075684, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8794312477111816, "num_tokens": 465809300.0, "step": 12208 }, { "epoch": 1.553110291311538, "ewc_loss": 0.05867759883403778, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027183457859791815, "grad_norm": 7.00860071182251, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8650798797607422, "num_tokens": 465849847.0, "step": 12209 }, { "epoch": 1.5532375015901285, "ewc_loss": 0.058879584074020386, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002738544426392764, "grad_norm": 7.1145734786987305, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.873562753200531, "num_tokens": 465884986.0, "step": 12210 }, { "epoch": 1.5533647118687188, "ewc_loss": 0.05871860682964325, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000272244680672884, "grad_norm": 7.028820514678955, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8722617626190186, "num_tokens": 465919710.0, "step": 12211 }, { "epoch": 1.5534919221473094, "ewc_loss": 0.05880989506840706, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002731575514189899, "grad_norm": 7.063186168670654, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8558658361434937, "num_tokens": 465956041.0, "step": 12212 }, { "epoch": 1.5536191324258999, "ewc_loss": 0.05874885246157646, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027254712767899036, "grad_norm": 7.018032073974609, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8743975162506104, "num_tokens": 465994352.0, "step": 12213 }, { "epoch": 1.5537463427044904, "ewc_loss": 0.058878667652606964, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027384524582885206, "grad_norm": 7.049208164215088, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8656314611434937, "num_tokens": 466032493.0, "step": 12214 }, { "epoch": 1.553873552983081, "ewc_loss": 0.058812215924263, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002731807471718639, "grad_norm": 7.11141300201416, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8654234409332275, "num_tokens": 466067048.0, "step": 12215 }, { "epoch": 1.5540007632616715, "ewc_loss": 0.058806613087654114, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027312475140206516, "grad_norm": 7.042299747467041, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8416659235954285, "num_tokens": 466106895.0, "step": 12216 }, { "epoch": 1.554127973540262, "ewc_loss": 0.058910124003887177, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002741598291322589, "grad_norm": 7.065390586853027, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8496979475021362, "num_tokens": 466146740.0, "step": 12217 }, { "epoch": 1.5542551838188525, "ewc_loss": 0.05882399156689644, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027329850126989186, "grad_norm": 10.118599891662598, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8702421188354492, "num_tokens": 466187097.0, "step": 12218 }, { "epoch": 1.554382394097443, "ewc_loss": 0.06167084723711014, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00030176708241924644, "grad_norm": 7.335900783538818, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8537930250167847, "num_tokens": 466223969.0, "step": 12219 }, { "epoch": 1.5545096043760336, "ewc_loss": 0.05895698815584183, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002746284590102732, "grad_norm": 7.1892170906066895, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8749791383743286, "num_tokens": 466260127.0, "step": 12220 }, { "epoch": 1.554636814654624, "ewc_loss": 0.05898987501859665, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002749573322944343, "grad_norm": 7.083024024963379, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8696063756942749, "num_tokens": 466297219.0, "step": 12221 }, { "epoch": 1.5547640249332146, "ewc_loss": 0.059490494430065155, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000279963540378958, "grad_norm": 7.269490718841553, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8442773818969727, "num_tokens": 466329474.0, "step": 12222 }, { "epoch": 1.5548912352118052, "ewc_loss": 0.058865033090114594, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002737089234869927, "grad_norm": 7.061765670776367, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8622546792030334, "num_tokens": 466372801.0, "step": 12223 }, { "epoch": 1.5550184454903957, "ewc_loss": 0.05905412882566452, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002755998575594276, "grad_norm": 7.090253829956055, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.870598554611206, "num_tokens": 466412950.0, "step": 12224 }, { "epoch": 1.5551456557689862, "ewc_loss": 0.05907947197556496, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758533228188753, "grad_norm": 7.17697811126709, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8697555065155029, "num_tokens": 466448830.0, "step": 12225 }, { "epoch": 1.5552728660475768, "ewc_loss": 0.05895985662937164, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000274657184490934, "grad_norm": 7.116109848022461, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8533816337585449, "num_tokens": 466489749.0, "step": 12226 }, { "epoch": 1.5554000763261673, "ewc_loss": 0.058905214071273804, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002741107309702784, "grad_norm": 7.104821681976318, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8458161354064941, "num_tokens": 466531320.0, "step": 12227 }, { "epoch": 1.5555272866047578, "ewc_loss": 0.058904748409986496, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002741060743574053, "grad_norm": 7.171530723571777, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.8559879064559937, "num_tokens": 466570001.0, "step": 12228 }, { "epoch": 1.5556544968833483, "ewc_loss": 0.058786049485206604, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002729191037360579, "grad_norm": 7.036818027496338, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8668087124824524, "num_tokens": 466607186.0, "step": 12229 }, { "epoch": 1.5557817071619386, "ewc_loss": 0.05862174183130264, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002737174218054861, "grad_norm": 7.076127529144287, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8544653654098511, "num_tokens": 466649480.0, "step": 12230 }, { "epoch": 1.5559089174405292, "ewc_loss": 0.058573685586452484, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027323683025315404, "grad_norm": 7.057183265686035, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8701231479644775, "num_tokens": 466688861.0, "step": 12231 }, { "epoch": 1.5560361277191197, "ewc_loss": 0.05892620608210564, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002743206568993628, "grad_norm": 10.14582347869873, "learning_rate": 1e-06, "loss": 0.5185, "mean_token_accuracy": 0.8439429998397827, "num_tokens": 466723200.0, "step": 12232 }, { "epoch": 1.5561633379977102, "ewc_loss": 0.06179984658956528, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00030305705149658024, "grad_norm": 7.340607643127441, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8492241501808167, "num_tokens": 466759639.0, "step": 12233 }, { "epoch": 1.5562905482763008, "ewc_loss": 0.05908992886543274, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027595789288170636, "grad_norm": 7.193925380706787, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8575750589370728, "num_tokens": 466795748.0, "step": 12234 }, { "epoch": 1.5564177585548913, "ewc_loss": 0.05904625728726387, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002755211608018726, "grad_norm": 7.0838775634765625, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8741577863693237, "num_tokens": 466832938.0, "step": 12235 }, { "epoch": 1.5565449688334816, "ewc_loss": 0.059501759707927704, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00028007617220282555, "grad_norm": 7.172752857208252, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8554965853691101, "num_tokens": 466868710.0, "step": 12236 }, { "epoch": 1.5566721791120721, "ewc_loss": 0.058978017419576645, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002748387632891536, "grad_norm": 7.1151442527771, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8612606525421143, "num_tokens": 466903558.0, "step": 12237 }, { "epoch": 1.5567993893906626, "ewc_loss": 0.05910157412290573, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027607433730736375, "grad_norm": 7.147636413574219, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8535356521606445, "num_tokens": 466946785.0, "step": 12238 }, { "epoch": 1.5569265996692532, "ewc_loss": 0.05898289382457733, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027488754130899906, "grad_norm": 7.09197473526001, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8562374114990234, "num_tokens": 466988563.0, "step": 12239 }, { "epoch": 1.5570538099478437, "ewc_loss": 0.05905596539378166, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002756182511802763, "grad_norm": 7.127427101135254, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8564685583114624, "num_tokens": 467025447.0, "step": 12240 }, { "epoch": 1.5571810202264342, "ewc_loss": 0.058926649391651154, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002743251097854227, "grad_norm": 7.053565979003906, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8718962669372559, "num_tokens": 467060751.0, "step": 12241 }, { "epoch": 1.5573082305050248, "ewc_loss": 0.06036529690027237, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0002740631462074816, "grad_norm": 35.94782257080078, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8627477884292603, "num_tokens": 467101889.0, "step": 12242 }, { "epoch": 1.5574354407836153, "ewc_loss": 0.08851076662540436, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0005677248700521886, "grad_norm": 10.831714630126953, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8654565811157227, "num_tokens": 467136782.0, "step": 12243 }, { "epoch": 1.5575626510622058, "ewc_loss": 0.058671727776527405, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00026933447225019336, "grad_norm": 6.038950443267822, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8713988065719604, "num_tokens": 467174137.0, "step": 12244 }, { "epoch": 1.5576898613407963, "ewc_loss": 0.07502199709415436, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00043283714330755174, "grad_norm": 9.943371772766113, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8585591316223145, "num_tokens": 467213919.0, "step": 12245 }, { "epoch": 1.5578170716193869, "ewc_loss": 0.07903826236724854, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00047299975994974375, "grad_norm": 9.667149543762207, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8709160089492798, "num_tokens": 467254860.0, "step": 12246 }, { "epoch": 1.5579442818979774, "ewc_loss": 0.06496702134609222, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00033228739630430937, "grad_norm": 7.159026145935059, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8742016553878784, "num_tokens": 467296806.0, "step": 12247 }, { "epoch": 1.558071492176568, "ewc_loss": 0.06732389330863953, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00035585614386945963, "grad_norm": 8.673855781555176, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8700879812240601, "num_tokens": 467332882.0, "step": 12248 }, { "epoch": 1.5581987024551585, "ewc_loss": 0.0689132809638977, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0003741913824342191, "grad_norm": 8.090176582336426, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8784623742103577, "num_tokens": 467375088.0, "step": 12249 }, { "epoch": 1.558325912733749, "ewc_loss": 0.0633668527007103, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00031872710678726435, "grad_norm": 7.583580017089844, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8623047471046448, "num_tokens": 467413000.0, "step": 12250 }, { "epoch": 1.5584531230123395, "ewc_loss": 0.06416802108287811, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0003267387510277331, "grad_norm": 7.895610809326172, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8779429793357849, "num_tokens": 467450947.0, "step": 12251 }, { "epoch": 1.55858033329093, "ewc_loss": 0.06323845684528351, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00031744319130666554, "grad_norm": 7.494870662689209, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8539049029350281, "num_tokens": 467493770.0, "step": 12252 }, { "epoch": 1.5587075435695206, "ewc_loss": 0.06218408793210983, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0003068994847126305, "grad_norm": 7.568142414093018, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.860714316368103, "num_tokens": 467531882.0, "step": 12253 }, { "epoch": 1.5588347538481109, "ewc_loss": 0.061608318239450455, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0003035831905435771, "grad_norm": 7.413950443267822, "learning_rate": 1e-06, "loss": 0.3683, "mean_token_accuracy": 0.8898535370826721, "num_tokens": 467566059.0, "step": 12254 }, { "epoch": 1.5589619641267014, "ewc_loss": 0.06102663278579712, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.000297766353469342, "grad_norm": 7.433948993682861, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8784362077713013, "num_tokens": 467603734.0, "step": 12255 }, { "epoch": 1.559089174405292, "ewc_loss": 0.060452207922935486, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00029202207224443555, "grad_norm": 7.295873641967773, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8722280859947205, "num_tokens": 467641224.0, "step": 12256 }, { "epoch": 1.5592163846838825, "ewc_loss": 0.06061025708913803, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002911611518356949, "grad_norm": 7.316410064697266, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.861436128616333, "num_tokens": 467677576.0, "step": 12257 }, { "epoch": 1.559343594962473, "ewc_loss": 0.059869859367609024, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00028619859949685633, "grad_norm": 7.220784664154053, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.870819091796875, "num_tokens": 467717526.0, "step": 12258 }, { "epoch": 1.5594708052410635, "ewc_loss": 0.05983807519078255, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00028588075656443834, "grad_norm": 7.209900379180908, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8586429357528687, "num_tokens": 467757591.0, "step": 12259 }, { "epoch": 1.5595980155196538, "ewc_loss": 0.05951564759016037, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002826564887072891, "grad_norm": 7.148111820220947, "learning_rate": 1e-06, "loss": 0.5048, "mean_token_accuracy": 0.850384533405304, "num_tokens": 467792794.0, "step": 12260 }, { "epoch": 1.5597252257982444, "ewc_loss": 0.0595916211605072, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002834162150975317, "grad_norm": 7.183194160461426, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8590507507324219, "num_tokens": 467829554.0, "step": 12261 }, { "epoch": 1.5598524360768349, "ewc_loss": 0.059279315173625946, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002802931412588805, "grad_norm": 7.072306156158447, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8480244874954224, "num_tokens": 467871403.0, "step": 12262 }, { "epoch": 1.5599796463554254, "ewc_loss": 0.05938424542546272, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002813424507621676, "grad_norm": 7.17509651184082, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8430224061012268, "num_tokens": 467906498.0, "step": 12263 }, { "epoch": 1.560106856634016, "ewc_loss": 0.059136077761650085, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002788607671391219, "grad_norm": 7.067502498626709, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8620395660400391, "num_tokens": 467946426.0, "step": 12264 }, { "epoch": 1.5602340669126065, "ewc_loss": 0.059299733489751816, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00028049733373336494, "grad_norm": 7.174798965454102, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8659169673919678, "num_tokens": 467979930.0, "step": 12265 }, { "epoch": 1.560361277191197, "ewc_loss": 0.0589861199259758, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002773611922748387, "grad_norm": 7.073917388916016, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8575500249862671, "num_tokens": 468016347.0, "step": 12266 }, { "epoch": 1.5604884874697875, "ewc_loss": 0.059126198291778564, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002787619596347213, "grad_norm": 7.121945858001709, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8706527948379517, "num_tokens": 468051446.0, "step": 12267 }, { "epoch": 1.560615697748378, "ewc_loss": 0.058917708694934845, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002766770776361227, "grad_norm": 7.060969829559326, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8793991208076477, "num_tokens": 468085713.0, "step": 12268 }, { "epoch": 1.5607429080269686, "ewc_loss": 0.05925736576318741, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002776322653517127, "grad_norm": 7.115149021148682, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8760250806808472, "num_tokens": 468117863.0, "step": 12269 }, { "epoch": 1.560870118305559, "ewc_loss": 0.05915353074669838, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027659389888867736, "grad_norm": 7.043745517730713, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.860435962677002, "num_tokens": 468153041.0, "step": 12270 }, { "epoch": 1.5609973285841496, "ewc_loss": 0.05929401516914368, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027799876988865435, "grad_norm": 7.098538875579834, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8610122203826904, "num_tokens": 468193624.0, "step": 12271 }, { "epoch": 1.5611245388627402, "ewc_loss": 0.05913608521223068, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002764194505289197, "grad_norm": 7.060968399047852, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8793454766273499, "num_tokens": 468227091.0, "step": 12272 }, { "epoch": 1.5612517491413307, "ewc_loss": 0.05894097685813904, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027690979186445475, "grad_norm": 7.040040969848633, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8591285347938538, "num_tokens": 468265079.0, "step": 12273 }, { "epoch": 1.5613789594199212, "ewc_loss": 0.059002239257097244, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027752239839173853, "grad_norm": 7.055473804473877, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8609799146652222, "num_tokens": 468307055.0, "step": 12274 }, { "epoch": 1.5615061696985117, "ewc_loss": 0.05919858068227768, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027704439708031714, "grad_norm": 7.081727027893066, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8582258224487305, "num_tokens": 468344429.0, "step": 12275 }, { "epoch": 1.5616333799771023, "ewc_loss": 0.058993905782699585, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.0002774390741251409, "grad_norm": 7.108592987060547, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8631651401519775, "num_tokens": 468379448.0, "step": 12276 }, { "epoch": 1.5617605902556928, "ewc_loss": 0.0589480884373188, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027698089252226055, "grad_norm": 7.094420909881592, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8666776418685913, "num_tokens": 468423344.0, "step": 12277 }, { "epoch": 1.5618878005342833, "ewc_loss": 0.05896233022212982, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027712329756468534, "grad_norm": 7.125226974487305, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8694488406181335, "num_tokens": 468461657.0, "step": 12278 }, { "epoch": 1.5620150108128736, "ewc_loss": 0.05884849280118942, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027598493034020066, "grad_norm": 7.065621376037598, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8781613707542419, "num_tokens": 468500428.0, "step": 12279 }, { "epoch": 1.5621422210914642, "ewc_loss": 0.05905085802078247, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027800860698334873, "grad_norm": 7.150533199310303, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.852645754814148, "num_tokens": 468536310.0, "step": 12280 }, { "epoch": 1.5622694313700547, "ewc_loss": 0.05904313176870346, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027548993239179254, "grad_norm": 7.079437255859375, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8611948490142822, "num_tokens": 468577954.0, "step": 12281 }, { "epoch": 1.5623966416486452, "ewc_loss": 0.05904075503349304, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027790755848400295, "grad_norm": 7.139768123626709, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8557820320129395, "num_tokens": 468612131.0, "step": 12282 }, { "epoch": 1.5625238519272358, "ewc_loss": 0.05875653401017189, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027506533660925925, "grad_norm": 6.966127872467041, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8589717745780945, "num_tokens": 468654022.0, "step": 12283 }, { "epoch": 1.5626510622058263, "ewc_loss": 0.05931098759174347, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002781684452202171, "grad_norm": 7.124204158782959, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8537310361862183, "num_tokens": 468693120.0, "step": 12284 }, { "epoch": 1.5627782724844166, "ewc_loss": 0.059138331562280655, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002764419186860323, "grad_norm": 7.058996200561523, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8627285361289978, "num_tokens": 468727695.0, "step": 12285 }, { "epoch": 1.5629054827630071, "ewc_loss": 0.059293992817401886, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027799850795418024, "grad_norm": 7.075627326965332, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8594286441802979, "num_tokens": 468766705.0, "step": 12286 }, { "epoch": 1.5630326930415976, "ewc_loss": 0.05895959585905075, "ewc_loss_diag": 3.123283386230469e-05, "ewc_loss_parallel": 0.00027709596906788647, "grad_norm": 7.1204729080200195, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8538457751274109, "num_tokens": 468804555.0, "step": 12287 }, { "epoch": 1.5631599033201882, "ewc_loss": 0.059133097529411316, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002763895899988711, "grad_norm": 7.06717586517334, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8592221736907959, "num_tokens": 468842495.0, "step": 12288 }, { "epoch": 1.5632871135987787, "ewc_loss": 0.059246450662612915, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027752312598749995, "grad_norm": 7.135435581207275, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8612518310546875, "num_tokens": 468877574.0, "step": 12289 }, { "epoch": 1.5634143238773692, "ewc_loss": 0.05902831256389618, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027534173568710685, "grad_norm": 7.091434955596924, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8736117482185364, "num_tokens": 468914573.0, "step": 12290 }, { "epoch": 1.5635415341559598, "ewc_loss": 0.05914808064699173, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027653941651806235, "grad_norm": 7.132228851318359, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8742371797561646, "num_tokens": 468950943.0, "step": 12291 }, { "epoch": 1.5636687444345503, "ewc_loss": 0.059055402874946594, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027561260503716767, "grad_norm": 7.037728309631348, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8782567977905273, "num_tokens": 468990007.0, "step": 12292 }, { "epoch": 1.5637959547131408, "ewc_loss": 0.05912435054779053, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027630210388451815, "grad_norm": 7.0760321617126465, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8655444383621216, "num_tokens": 469022461.0, "step": 12293 }, { "epoch": 1.5639231649917313, "ewc_loss": 0.059027571231126785, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002753343142103404, "grad_norm": 7.048490047454834, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.869371771812439, "num_tokens": 469056396.0, "step": 12294 }, { "epoch": 1.5640503752703219, "ewc_loss": 0.059121839702129364, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000276276987278834, "grad_norm": 7.05025577545166, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8723329305648804, "num_tokens": 469099464.0, "step": 12295 }, { "epoch": 1.5641775855489124, "ewc_loss": 0.05903201550245285, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027537872665561736, "grad_norm": 7.040717601776123, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8713754415512085, "num_tokens": 469139830.0, "step": 12296 }, { "epoch": 1.564304795827503, "ewc_loss": 0.05913323909044266, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000276390986982733, "grad_norm": 7.024327754974365, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8627374768257141, "num_tokens": 469177838.0, "step": 12297 }, { "epoch": 1.5644320061060935, "ewc_loss": 0.05906488746404648, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002757074835244566, "grad_norm": 6.9934468269348145, "learning_rate": 1e-06, "loss": 0.3797, "mean_token_accuracy": 0.88880455493927, "num_tokens": 469218727.0, "step": 12298 }, { "epoch": 1.564559216384684, "ewc_loss": 0.059186819940805435, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002769267885014415, "grad_norm": 7.077549934387207, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8416293859481812, "num_tokens": 469261186.0, "step": 12299 }, { "epoch": 1.5646864266632745, "ewc_loss": 0.05903242155909538, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002753828011918813, "grad_norm": 7.029727935791016, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8713330626487732, "num_tokens": 469301191.0, "step": 12300 }, { "epoch": 1.564813636941865, "ewc_loss": 0.059234701097011566, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027740560472011566, "grad_norm": 7.123787879943848, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8691346645355225, "num_tokens": 469340132.0, "step": 12301 }, { "epoch": 1.5649408472204556, "ewc_loss": 0.058961544185876846, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027467403560876846, "grad_norm": 7.034633636474609, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8825926184654236, "num_tokens": 469376211.0, "step": 12302 }, { "epoch": 1.5650680574990459, "ewc_loss": 0.05910411849617958, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027609977405518293, "grad_norm": 7.0779194831848145, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8532007932662964, "num_tokens": 469413961.0, "step": 12303 }, { "epoch": 1.5651952677776364, "ewc_loss": 0.059025924652814865, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027531784144230187, "grad_norm": 7.023590087890625, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8625857830047607, "num_tokens": 469449507.0, "step": 12304 }, { "epoch": 1.565322478056227, "ewc_loss": 0.059111278504133224, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002761713694781065, "grad_norm": 7.097927570343018, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8493796586990356, "num_tokens": 469486324.0, "step": 12305 }, { "epoch": 1.5654496883348175, "ewc_loss": 0.05898872762918472, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002749458944890648, "grad_norm": 6.99609899520874, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8581186532974243, "num_tokens": 469523799.0, "step": 12306 }, { "epoch": 1.565576898613408, "ewc_loss": 0.05916663259267807, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002767249243333936, "grad_norm": 7.11215353012085, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8493185043334961, "num_tokens": 469558661.0, "step": 12307 }, { "epoch": 1.5657041088919985, "ewc_loss": 0.05902255326509476, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027528413920663297, "grad_norm": 6.982194423675537, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8625011444091797, "num_tokens": 469603087.0, "step": 12308 }, { "epoch": 1.5658313191705888, "ewc_loss": 0.059239596128463745, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002774545573629439, "grad_norm": 7.066471576690674, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.86342853307724, "num_tokens": 469637344.0, "step": 12309 }, { "epoch": 1.5659585294491793, "ewc_loss": 0.05906599760055542, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027571857208386064, "grad_norm": 7.031985282897949, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8730055093765259, "num_tokens": 469681127.0, "step": 12310 }, { "epoch": 1.5660857397277699, "ewc_loss": 0.059187132865190506, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002769299317151308, "grad_norm": 7.046989440917969, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8618694543838501, "num_tokens": 469724238.0, "step": 12311 }, { "epoch": 1.5662129500063604, "ewc_loss": 0.05914796143770218, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027653819415718317, "grad_norm": 7.0434064865112305, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.85648512840271, "num_tokens": 469765967.0, "step": 12312 }, { "epoch": 1.566340160284951, "ewc_loss": 0.05922725796699524, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002773311862256378, "grad_norm": 7.063178539276123, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.864834725856781, "num_tokens": 469801387.0, "step": 12313 }, { "epoch": 1.5664673705635415, "ewc_loss": 0.059113819152116776, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027619677712209523, "grad_norm": 7.063849449157715, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8645107746124268, "num_tokens": 469835448.0, "step": 12314 }, { "epoch": 1.566594580842132, "ewc_loss": 0.059268951416015625, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027774812770076096, "grad_norm": 7.049804210662842, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8536853790283203, "num_tokens": 469876647.0, "step": 12315 }, { "epoch": 1.5667217911207225, "ewc_loss": 0.0592317134141922, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002773757150862366, "grad_norm": 7.048610687255859, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8768059611320496, "num_tokens": 469918158.0, "step": 12316 }, { "epoch": 1.566849001399313, "ewc_loss": 0.0591767318546772, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027682591462507844, "grad_norm": 7.1275105476379395, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.848355770111084, "num_tokens": 469953691.0, "step": 12317 }, { "epoch": 1.5669762116779036, "ewc_loss": 0.05907899886369705, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758486079983413, "grad_norm": 6.9908246994018555, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8667598366737366, "num_tokens": 469996174.0, "step": 12318 }, { "epoch": 1.567103421956494, "ewc_loss": 0.05933958292007446, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000278454419458285, "grad_norm": 7.153931140899658, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8498212099075317, "num_tokens": 470031371.0, "step": 12319 }, { "epoch": 1.5672306322350846, "ewc_loss": 0.059116482734680176, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002762234362307936, "grad_norm": 7.0136542320251465, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.865721583366394, "num_tokens": 470068783.0, "step": 12320 }, { "epoch": 1.5673578425136752, "ewc_loss": 0.05932719260454178, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002783304953482002, "grad_norm": 7.11704683303833, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8463655710220337, "num_tokens": 470103490.0, "step": 12321 }, { "epoch": 1.5674850527922657, "ewc_loss": 0.059251151978969574, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002775700995698571, "grad_norm": 7.075544834136963, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8604276180267334, "num_tokens": 470137213.0, "step": 12322 }, { "epoch": 1.5676122630708562, "ewc_loss": 0.05926425755023956, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027770118322223425, "grad_norm": 7.106156826019287, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8679020404815674, "num_tokens": 470171816.0, "step": 12323 }, { "epoch": 1.5677394733494467, "ewc_loss": 0.059229038655757904, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027734896866604686, "grad_norm": 7.01753044128418, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8395660519599915, "num_tokens": 470210517.0, "step": 12324 }, { "epoch": 1.5678666836280373, "ewc_loss": 0.05948817729949951, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027749896980822086, "grad_norm": 7.05711030960083, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8515201210975647, "num_tokens": 470247085.0, "step": 12325 }, { "epoch": 1.5679938939066278, "ewc_loss": 0.05950191244482994, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002776363107841462, "grad_norm": 7.002439975738525, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8632982969284058, "num_tokens": 470288483.0, "step": 12326 }, { "epoch": 1.5681211041852183, "ewc_loss": 0.05955244228243828, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000278141611488536, "grad_norm": 7.116184711456299, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8641582131385803, "num_tokens": 470322173.0, "step": 12327 }, { "epoch": 1.5682483144638086, "ewc_loss": 0.05939458683133125, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002765630488283932, "grad_norm": 6.997700214385986, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8637281656265259, "num_tokens": 470356617.0, "step": 12328 }, { "epoch": 1.5683755247423992, "ewc_loss": 0.05950571596622467, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027767434949055314, "grad_norm": 7.104453086853027, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8739829063415527, "num_tokens": 470393095.0, "step": 12329 }, { "epoch": 1.5685027350209897, "ewc_loss": 0.05941781401634216, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002767953264992684, "grad_norm": 7.027843475341797, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8748286962509155, "num_tokens": 470427785.0, "step": 12330 }, { "epoch": 1.5686299452995802, "ewc_loss": 0.05948007106781006, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002774178865365684, "grad_norm": 7.011861324310303, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8577722311019897, "num_tokens": 470469971.0, "step": 12331 }, { "epoch": 1.5687571555781707, "ewc_loss": 0.05943894386291504, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002770066203083843, "grad_norm": 7.111942768096924, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8546035885810852, "num_tokens": 470503935.0, "step": 12332 }, { "epoch": 1.5688843658567613, "ewc_loss": 0.05939631909132004, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000276580365607515, "grad_norm": 7.005946159362793, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8645762801170349, "num_tokens": 470536014.0, "step": 12333 }, { "epoch": 1.5690115761353516, "ewc_loss": 0.05954141914844513, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002780313661787659, "grad_norm": 7.0054240226745605, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8768274188041687, "num_tokens": 470575632.0, "step": 12334 }, { "epoch": 1.569138786413942, "ewc_loss": 0.05945097282528877, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000277126906439662, "grad_norm": 7.050804138183594, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8741843104362488, "num_tokens": 470611295.0, "step": 12335 }, { "epoch": 1.5692659966925326, "ewc_loss": 0.05947510898113251, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027736829360947013, "grad_norm": 7.068918704986572, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8624817728996277, "num_tokens": 470646993.0, "step": 12336 }, { "epoch": 1.5693932069711232, "ewc_loss": 0.05941639840602875, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027678118203766644, "grad_norm": 7.087823390960693, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8671963810920715, "num_tokens": 470680253.0, "step": 12337 }, { "epoch": 1.5695204172497137, "ewc_loss": 0.05948454141616821, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002774626191239804, "grad_norm": 7.077531814575195, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8615244626998901, "num_tokens": 470716169.0, "step": 12338 }, { "epoch": 1.5696476275283042, "ewc_loss": 0.05929884687066078, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002756056492216885, "grad_norm": 7.0228657722473145, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8769424557685852, "num_tokens": 470754208.0, "step": 12339 }, { "epoch": 1.5697748378068948, "ewc_loss": 0.05936384201049805, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002762555959634483, "grad_norm": 6.978024959564209, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.864326000213623, "num_tokens": 470801329.0, "step": 12340 }, { "epoch": 1.5699020480854853, "ewc_loss": 0.05940037965774536, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002766209654510021, "grad_norm": 7.132382869720459, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8585909008979797, "num_tokens": 470838408.0, "step": 12341 }, { "epoch": 1.5700292583640758, "ewc_loss": 0.05920384079217911, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002746555837802589, "grad_norm": 7.035617828369141, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8637683987617493, "num_tokens": 470874001.0, "step": 12342 }, { "epoch": 1.5701564686426663, "ewc_loss": 0.059415534138679504, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002767725382000208, "grad_norm": 7.068747043609619, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.859290599822998, "num_tokens": 470916870.0, "step": 12343 }, { "epoch": 1.5702836789212569, "ewc_loss": 0.059270963072776794, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000275326834525913, "grad_norm": 7.082169532775879, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8709474802017212, "num_tokens": 470953862.0, "step": 12344 }, { "epoch": 1.5704108891998474, "ewc_loss": 0.059286005795001984, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002754772431217134, "grad_norm": 7.098822116851807, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.874413788318634, "num_tokens": 470991507.0, "step": 12345 }, { "epoch": 1.570538099478438, "ewc_loss": 0.05923407897353172, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027495797257870436, "grad_norm": 7.032774448394775, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8668540716171265, "num_tokens": 471030109.0, "step": 12346 }, { "epoch": 1.5706653097570284, "ewc_loss": 0.059307437390089035, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002756915637291968, "grad_norm": 7.077640533447266, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8774228096008301, "num_tokens": 471063829.0, "step": 12347 }, { "epoch": 1.570792520035619, "ewc_loss": 0.05922514945268631, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002748686820268631, "grad_norm": 7.0904388427734375, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8535490036010742, "num_tokens": 471107259.0, "step": 12348 }, { "epoch": 1.5709197303142095, "ewc_loss": 0.05926203355193138, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002752375148702413, "grad_norm": 7.12142276763916, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8667851686477661, "num_tokens": 471141516.0, "step": 12349 }, { "epoch": 1.5710469405928, "ewc_loss": 0.05915999412536621, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002742171345744282, "grad_norm": 7.046420097351074, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8648455142974854, "num_tokens": 471178655.0, "step": 12350 }, { "epoch": 1.5711741508713906, "ewc_loss": 0.05928037315607071, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027542089810594916, "grad_norm": 7.1173272132873535, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8761007785797119, "num_tokens": 471209437.0, "step": 12351 }, { "epoch": 1.5713013611499809, "ewc_loss": 0.05917557328939438, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002743729273788631, "grad_norm": 6.989302158355713, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8793612718582153, "num_tokens": 471249559.0, "step": 12352 }, { "epoch": 1.5714285714285714, "ewc_loss": 0.059055086225271225, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027560946182347834, "grad_norm": 7.093982696533203, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.869257926940918, "num_tokens": 471287179.0, "step": 12353 }, { "epoch": 1.571555781707162, "ewc_loss": 0.059134289622306824, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027396008954383433, "grad_norm": 7.041166305541992, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.866621732711792, "num_tokens": 471328052.0, "step": 12354 }, { "epoch": 1.5716829919857525, "ewc_loss": 0.05923087149858475, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027492590015754104, "grad_norm": 7.0923895835876465, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8619351387023926, "num_tokens": 471362414.0, "step": 12355 }, { "epoch": 1.571810202264343, "ewc_loss": 0.05893712863326073, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002744298835750669, "grad_norm": 7.07628870010376, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8601483106613159, "num_tokens": 471397910.0, "step": 12356 }, { "epoch": 1.5719374125429335, "ewc_loss": 0.05916520953178406, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002742692595347762, "grad_norm": 7.045251846313477, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8779432773590088, "num_tokens": 471436951.0, "step": 12357 }, { "epoch": 1.5720646228215238, "ewc_loss": 0.05909284949302673, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002735456801019609, "grad_norm": 7.047146320343018, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8685566186904907, "num_tokens": 471470139.0, "step": 12358 }, { "epoch": 1.5721918331001143, "ewc_loss": 0.05913291871547699, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002739463816396892, "grad_norm": 7.010775089263916, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8708153963088989, "num_tokens": 471508591.0, "step": 12359 }, { "epoch": 1.5723190433787049, "ewc_loss": 0.059259142726659775, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027520861476659775, "grad_norm": 7.068813800811768, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8649417757987976, "num_tokens": 471549165.0, "step": 12360 }, { "epoch": 1.5724462536572954, "ewc_loss": 0.059108421206474304, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002737013855949044, "grad_norm": 7.059067249298096, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8579206466674805, "num_tokens": 471589021.0, "step": 12361 }, { "epoch": 1.572573463935886, "ewc_loss": 0.058929018676280975, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027434880030341446, "grad_norm": 7.066525936126709, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8585558533668518, "num_tokens": 471624263.0, "step": 12362 }, { "epoch": 1.5727006742144765, "ewc_loss": 0.058842018246650696, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002734787994995713, "grad_norm": 7.015246868133545, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8586161732673645, "num_tokens": 471663678.0, "step": 12363 }, { "epoch": 1.572827884493067, "ewc_loss": 0.05890418961644173, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002741004864219576, "grad_norm": 7.064337253570557, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.862940788269043, "num_tokens": 471705887.0, "step": 12364 }, { "epoch": 1.5729550947716575, "ewc_loss": 0.05893886834383011, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027444728766568005, "grad_norm": 7.057262420654297, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8716761469841003, "num_tokens": 471739796.0, "step": 12365 }, { "epoch": 1.573082305050248, "ewc_loss": 0.05910022556781769, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027361942920833826, "grad_norm": 7.086834907531738, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8616079688072205, "num_tokens": 471774723.0, "step": 12366 }, { "epoch": 1.5732095153288386, "ewc_loss": 0.059116944670677185, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002737866307143122, "grad_norm": 7.0575270652771, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8575113415718079, "num_tokens": 471814169.0, "step": 12367 }, { "epoch": 1.573336725607429, "ewc_loss": 0.05919933691620827, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002746105601545423, "grad_norm": 7.068602085113525, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8654136657714844, "num_tokens": 471852367.0, "step": 12368 }, { "epoch": 1.5734639358860196, "ewc_loss": 0.059093400835990906, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002735511807259172, "grad_norm": 7.0578694343566895, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8706845045089722, "num_tokens": 471883438.0, "step": 12369 }, { "epoch": 1.5735911461646102, "ewc_loss": 0.05920051783323288, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002746223472058773, "grad_norm": 7.043424129486084, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8612183928489685, "num_tokens": 471921051.0, "step": 12370 }, { "epoch": 1.5737183564432007, "ewc_loss": 0.05915149301290512, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002741321222856641, "grad_norm": 7.002681732177734, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.867800235748291, "num_tokens": 471959685.0, "step": 12371 }, { "epoch": 1.5738455667217912, "ewc_loss": 0.05923187732696533, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002749359409790486, "grad_norm": 7.070717811584473, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8638101816177368, "num_tokens": 471997345.0, "step": 12372 }, { "epoch": 1.5739727770003817, "ewc_loss": 0.059226155281066895, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027487872284837067, "grad_norm": 7.053833484649658, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8695948123931885, "num_tokens": 472034455.0, "step": 12373 }, { "epoch": 1.5740999872789723, "ewc_loss": 0.059216588735580444, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002747830585576594, "grad_norm": 7.087329864501953, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8633849620819092, "num_tokens": 472073630.0, "step": 12374 }, { "epoch": 1.5742271975575628, "ewc_loss": 0.05917346477508545, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002743518562056124, "grad_norm": 7.0674943923950195, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8624467849731445, "num_tokens": 472111556.0, "step": 12375 }, { "epoch": 1.5743544078361533, "ewc_loss": 0.058887965977191925, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002739382325671613, "grad_norm": 7.025501251220703, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.871459424495697, "num_tokens": 472148276.0, "step": 12376 }, { "epoch": 1.5744816181147436, "ewc_loss": 0.05919422209262848, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027455942472442985, "grad_norm": 7.110278606414795, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8645591735839844, "num_tokens": 472181665.0, "step": 12377 }, { "epoch": 1.5746088283933342, "ewc_loss": 0.05891843140125275, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002742428914643824, "grad_norm": 7.025021076202393, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8573079109191895, "num_tokens": 472222759.0, "step": 12378 }, { "epoch": 1.5747360386719247, "ewc_loss": 0.05930650979280472, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002756822796072811, "grad_norm": 7.125111103057861, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8594743609428406, "num_tokens": 472260867.0, "step": 12379 }, { "epoch": 1.5748632489505152, "ewc_loss": 0.058858372271060944, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002736423339229077, "grad_norm": 7.019911766052246, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8529530763626099, "num_tokens": 472300068.0, "step": 12380 }, { "epoch": 1.5749904592291057, "ewc_loss": 0.059076789766550064, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758264890871942, "grad_norm": 7.146523952484131, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8752304315567017, "num_tokens": 472336459.0, "step": 12381 }, { "epoch": 1.5751176695076963, "ewc_loss": 0.05890354514122009, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002740940544754267, "grad_norm": 7.032035827636719, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8688350915908813, "num_tokens": 472376696.0, "step": 12382 }, { "epoch": 1.5752448797862866, "ewc_loss": 0.05898019298911095, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002748605329543352, "grad_norm": 7.04994010925293, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8765993118286133, "num_tokens": 472414491.0, "step": 12383 }, { "epoch": 1.575372090064877, "ewc_loss": 0.058955129235982895, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002746098907664418, "grad_norm": 7.0708489418029785, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8636098504066467, "num_tokens": 472452188.0, "step": 12384 }, { "epoch": 1.5754993003434676, "ewc_loss": 0.05894634127616882, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027452202630229294, "grad_norm": 7.069075107574463, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.863293468952179, "num_tokens": 472489690.0, "step": 12385 }, { "epoch": 1.5756265106220582, "ewc_loss": 0.05894538015127182, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002745124220382422, "grad_norm": 7.0127739906311035, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8632832169532776, "num_tokens": 472529912.0, "step": 12386 }, { "epoch": 1.5757537209006487, "ewc_loss": 0.059213705360889435, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027475421666167676, "grad_norm": 10.127082824707031, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8623719215393066, "num_tokens": 472562534.0, "step": 12387 }, { "epoch": 1.5758809311792392, "ewc_loss": 0.06190917640924454, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00030415033688768744, "grad_norm": 7.286388874053955, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8623201847076416, "num_tokens": 472598955.0, "step": 12388 }, { "epoch": 1.5760081414578297, "ewc_loss": 0.059366337954998016, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002787220000755042, "grad_norm": 7.191577434539795, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.867760956287384, "num_tokens": 472638064.0, "step": 12389 }, { "epoch": 1.5761353517364203, "ewc_loss": 0.059199146926403046, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002770500723272562, "grad_norm": 7.150725841522217, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8680649399757385, "num_tokens": 472676823.0, "step": 12390 }, { "epoch": 1.5762625620150108, "ewc_loss": 0.059697799384593964, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00028203657711856067, "grad_norm": 8.082877159118652, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.863266110420227, "num_tokens": 472718528.0, "step": 12391 }, { "epoch": 1.5763897722936013, "ewc_loss": 0.059221051633358, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027726913685910404, "grad_norm": 7.053837299346924, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8706204891204834, "num_tokens": 472755634.0, "step": 12392 }, { "epoch": 1.5765169825721919, "ewc_loss": 0.05952860414981842, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002803446550387889, "grad_norm": 7.208643436431885, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8764129877090454, "num_tokens": 472798060.0, "step": 12393 }, { "epoch": 1.5766441928507824, "ewc_loss": 0.059050798416137695, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002755665627773851, "grad_norm": 7.065917491912842, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8694090247154236, "num_tokens": 472837628.0, "step": 12394 }, { "epoch": 1.576771403129373, "ewc_loss": 0.05951007455587387, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00028015932184644043, "grad_norm": 7.203241348266602, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8655797243118286, "num_tokens": 472877446.0, "step": 12395 }, { "epoch": 1.5768986134079634, "ewc_loss": 0.059090547263622284, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027596409199759364, "grad_norm": 7.0699782371521, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8586704730987549, "num_tokens": 472918437.0, "step": 12396 }, { "epoch": 1.577025823686554, "ewc_loss": 0.059372372925281525, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027878230321221054, "grad_norm": 7.165630340576172, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8728475570678711, "num_tokens": 472956242.0, "step": 12397 }, { "epoch": 1.5771530339651445, "ewc_loss": 0.05905482545495033, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027560684247873724, "grad_norm": 7.097897529602051, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8748452067375183, "num_tokens": 472985620.0, "step": 12398 }, { "epoch": 1.577280244243735, "ewc_loss": 0.05924622714519501, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002775208849925548, "grad_norm": 7.152122974395752, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8691384792327881, "num_tokens": 473019517.0, "step": 12399 }, { "epoch": 1.5774074545223256, "ewc_loss": 0.05902766436338425, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027533521642908454, "grad_norm": 7.066403388977051, "learning_rate": 1e-06, "loss": 0.3867, "mean_token_accuracy": 0.8841757774353027, "num_tokens": 473054624.0, "step": 12400 }, { "epoch": 1.5775346648009159, "ewc_loss": 0.05913861095905304, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002764446835499257, "grad_norm": 7.114193439483643, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8547895550727844, "num_tokens": 473094140.0, "step": 12401 }, { "epoch": 1.5776618750795064, "ewc_loss": 0.05899454653263092, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002750040730461478, "grad_norm": 7.072360038757324, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8472741842269897, "num_tokens": 473136886.0, "step": 12402 }, { "epoch": 1.577789085358097, "ewc_loss": 0.05907788127660751, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758374030236155, "grad_norm": 7.089433193206787, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8522487282752991, "num_tokens": 473173267.0, "step": 12403 }, { "epoch": 1.5779162956366874, "ewc_loss": 0.05904814228415489, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002755400200840086, "grad_norm": 7.059943675994873, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8439943790435791, "num_tokens": 473216477.0, "step": 12404 }, { "epoch": 1.578043505915278, "ewc_loss": 0.059116244316101074, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027622104971669614, "grad_norm": 7.1061015129089355, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8569666147232056, "num_tokens": 473256986.0, "step": 12405 }, { "epoch": 1.5781707161938685, "ewc_loss": 0.059085018932819366, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002759087656158954, "grad_norm": 7.07790470123291, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8721976280212402, "num_tokens": 473297967.0, "step": 12406 }, { "epoch": 1.5782979264724588, "ewc_loss": 0.05909967049956322, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027605530340224504, "grad_norm": 7.1323370933532715, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8576146364212036, "num_tokens": 473333580.0, "step": 12407 }, { "epoch": 1.5784251367510493, "ewc_loss": 0.05905747041106224, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002756332978606224, "grad_norm": 7.036510467529297, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8698586821556091, "num_tokens": 473370553.0, "step": 12408 }, { "epoch": 1.5785523470296399, "ewc_loss": 0.05918041616678238, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002768627309706062, "grad_norm": 7.115292549133301, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8472390174865723, "num_tokens": 473410705.0, "step": 12409 }, { "epoch": 1.5786795573082304, "ewc_loss": 0.05910911783576012, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002761497744359076, "grad_norm": 7.168832302093506, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8681422472000122, "num_tokens": 473443315.0, "step": 12410 }, { "epoch": 1.578806767586821, "ewc_loss": 0.059073738753795624, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002757959591690451, "grad_norm": 7.033622741699219, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8569101095199585, "num_tokens": 473488767.0, "step": 12411 }, { "epoch": 1.5789339778654115, "ewc_loss": 0.05920649692416191, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027712355949915946, "grad_norm": 7.12314510345459, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8795902132987976, "num_tokens": 473525360.0, "step": 12412 }, { "epoch": 1.579061188144002, "ewc_loss": 0.05912603810429573, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027631898410618305, "grad_norm": 7.109899520874023, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8559459447860718, "num_tokens": 473560625.0, "step": 12413 }, { "epoch": 1.5791883984225925, "ewc_loss": 0.05907505005598068, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027580911410041153, "grad_norm": 7.052874565124512, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8825184106826782, "num_tokens": 473595484.0, "step": 12414 }, { "epoch": 1.579315608701183, "ewc_loss": 0.05917920172214508, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027685059467330575, "grad_norm": 7.354432582855225, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8622652292251587, "num_tokens": 473627269.0, "step": 12415 }, { "epoch": 1.5794428189797736, "ewc_loss": 0.058929212391376495, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027435069205239415, "grad_norm": 6.949826717376709, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8492454886436462, "num_tokens": 473669159.0, "step": 12416 }, { "epoch": 1.579570029258364, "ewc_loss": 0.059341300278902054, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027847159071825445, "grad_norm": 7.142215251922607, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8748648166656494, "num_tokens": 473707739.0, "step": 12417 }, { "epoch": 1.5796972395369546, "ewc_loss": 0.058905184268951416, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027411041082814336, "grad_norm": 7.006307601928711, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.865182101726532, "num_tokens": 473746908.0, "step": 12418 }, { "epoch": 1.5798244498155452, "ewc_loss": 0.059318624436855316, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027824484277516603, "grad_norm": 7.110863208770752, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8627667427062988, "num_tokens": 473781480.0, "step": 12419 }, { "epoch": 1.5799516600941357, "ewc_loss": 0.05906103551387787, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002756689500529319, "grad_norm": 7.043555736541748, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8641635775566101, "num_tokens": 473814676.0, "step": 12420 }, { "epoch": 1.5800788703727262, "ewc_loss": 0.059272367507219315, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002777822664938867, "grad_norm": 7.139419078826904, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8595266938209534, "num_tokens": 473855487.0, "step": 12421 }, { "epoch": 1.5802060806513167, "ewc_loss": 0.05907951295375824, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758537302725017, "grad_norm": 7.078462600708008, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8767322897911072, "num_tokens": 473885648.0, "step": 12422 }, { "epoch": 1.5803332909299073, "ewc_loss": 0.05915490537881851, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027660763589665294, "grad_norm": 7.076350212097168, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8599690198898315, "num_tokens": 473923862.0, "step": 12423 }, { "epoch": 1.5804605012084978, "ewc_loss": 0.059145741164684296, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002765159879345447, "grad_norm": 7.116827487945557, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.867979109287262, "num_tokens": 473957725.0, "step": 12424 }, { "epoch": 1.5805877114870883, "ewc_loss": 0.05933260917663574, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002759432536549866, "grad_norm": 7.072157382965088, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8642003536224365, "num_tokens": 473994939.0, "step": 12425 }, { "epoch": 1.5807149217656786, "ewc_loss": 0.05913541465997696, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002764127275440842, "grad_norm": 7.099776744842529, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8602405786514282, "num_tokens": 474035996.0, "step": 12426 }, { "epoch": 1.5808421320442692, "ewc_loss": 0.05902866646647453, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002753452572505921, "grad_norm": 7.086324691772461, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8731695413589478, "num_tokens": 474071693.0, "step": 12427 }, { "epoch": 1.5809693423228597, "ewc_loss": 0.05903550982475281, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027541370945982635, "grad_norm": 7.074496269226074, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8718934059143066, "num_tokens": 474114415.0, "step": 12428 }, { "epoch": 1.5810965526014502, "ewc_loss": 0.05906028673052788, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002756614703685045, "grad_norm": 7.1294264793396, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8588199615478516, "num_tokens": 474152060.0, "step": 12429 }, { "epoch": 1.5812237628800407, "ewc_loss": 0.05908283591270447, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027588693774305284, "grad_norm": 7.115880966186523, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8675386309623718, "num_tokens": 474190605.0, "step": 12430 }, { "epoch": 1.5813509731586313, "ewc_loss": 0.05902263894677162, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002752849832177162, "grad_norm": 7.069327354431152, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8605630397796631, "num_tokens": 474229447.0, "step": 12431 }, { "epoch": 1.5814781834372216, "ewc_loss": 0.05908636003732681, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002759222115855664, "grad_norm": 7.088925838470459, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8718863725662231, "num_tokens": 474268088.0, "step": 12432 }, { "epoch": 1.581605393715812, "ewc_loss": 0.059335775673389435, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000275974947726354, "grad_norm": 7.08634090423584, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8595646619796753, "num_tokens": 474304856.0, "step": 12433 }, { "epoch": 1.5817326039944026, "ewc_loss": 0.059103675186634064, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027609532116912305, "grad_norm": 7.114805698394775, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8679782152175903, "num_tokens": 474349248.0, "step": 12434 }, { "epoch": 1.5818598142729932, "ewc_loss": 0.05907086282968521, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002757672336883843, "grad_norm": 8.055329322814941, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8763636946678162, "num_tokens": 474384432.0, "step": 12435 }, { "epoch": 1.5819870245515837, "ewc_loss": 0.05880610644817352, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027311965823173523, "grad_norm": 6.958199977874756, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8622273802757263, "num_tokens": 474415881.0, "step": 12436 }, { "epoch": 1.5821142348301742, "ewc_loss": 0.05940627306699753, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027912130462937057, "grad_norm": 7.327369689941406, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8636783361434937, "num_tokens": 474452755.0, "step": 12437 }, { "epoch": 1.5822414451087647, "ewc_loss": 0.058644793927669525, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027150652022100985, "grad_norm": 6.924747943878174, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8602274656295776, "num_tokens": 474493937.0, "step": 12438 }, { "epoch": 1.5823686553873553, "ewc_loss": 0.05950839817523956, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002801425871439278, "grad_norm": 7.229574203491211, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8698933720588684, "num_tokens": 474534072.0, "step": 12439 }, { "epoch": 1.5824958656659458, "ewc_loss": 0.05876201391220093, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002726787352003157, "grad_norm": 7.027167797088623, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8558598756790161, "num_tokens": 474569343.0, "step": 12440 }, { "epoch": 1.5826230759445363, "ewc_loss": 0.0592249259352684, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002773078449536115, "grad_norm": 7.170551300048828, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8557115793228149, "num_tokens": 474606610.0, "step": 12441 }, { "epoch": 1.5827502862231269, "ewc_loss": 0.05890164524316788, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002740750496741384, "grad_norm": 7.093522071838379, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8548831939697266, "num_tokens": 474645071.0, "step": 12442 }, { "epoch": 1.5828774965017174, "ewc_loss": 0.059036895632743835, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027542756288312376, "grad_norm": 7.088942050933838, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.852951169013977, "num_tokens": 474683985.0, "step": 12443 }, { "epoch": 1.583004706780308, "ewc_loss": 0.05897650867700577, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000274823687504977, "grad_norm": 7.151617527008057, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8693958520889282, "num_tokens": 474726102.0, "step": 12444 }, { "epoch": 1.5831319170588984, "ewc_loss": 0.058905601501464844, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002741146308835596, "grad_norm": 7.05159854888916, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8671587705612183, "num_tokens": 474766430.0, "step": 12445 }, { "epoch": 1.583259127337489, "ewc_loss": 0.05899178236722946, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027497639530338347, "grad_norm": 7.094254970550537, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8634762763977051, "num_tokens": 474800648.0, "step": 12446 }, { "epoch": 1.5833863376160795, "ewc_loss": 0.05888606607913971, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002739192277658731, "grad_norm": 7.062131404876709, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8689157962799072, "num_tokens": 474845540.0, "step": 12447 }, { "epoch": 1.58351354789467, "ewc_loss": 0.0590503066778183, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027556164423003793, "grad_norm": 7.126796722412109, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8619126677513123, "num_tokens": 474884296.0, "step": 12448 }, { "epoch": 1.5836407581732606, "ewc_loss": 0.05894066393375397, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027446524472907186, "grad_norm": 7.0521240234375, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8694639205932617, "num_tokens": 474929344.0, "step": 12449 }, { "epoch": 1.5837679684518509, "ewc_loss": 0.05900003761053085, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002750589628703892, "grad_norm": 7.118535041809082, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8694157600402832, "num_tokens": 474968035.0, "step": 12450 }, { "epoch": 1.5838951787304414, "ewc_loss": 0.05896422266960144, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002747008402366191, "grad_norm": 7.039492130279541, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8583656549453735, "num_tokens": 475006189.0, "step": 12451 }, { "epoch": 1.584022389009032, "ewc_loss": 0.05910356342792511, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027609424432739615, "grad_norm": 7.136080741882324, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8597177267074585, "num_tokens": 475039768.0, "step": 12452 }, { "epoch": 1.5841495992876224, "ewc_loss": 0.059088777750730515, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002759463677648455, "grad_norm": 7.148159027099609, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8699356317520142, "num_tokens": 475079938.0, "step": 12453 }, { "epoch": 1.584276809566213, "ewc_loss": 0.05899152159690857, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002749738050624728, "grad_norm": 7.067603588104248, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8520915508270264, "num_tokens": 475122405.0, "step": 12454 }, { "epoch": 1.5844040198448035, "ewc_loss": 0.05910256505012512, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027608423260971904, "grad_norm": 7.103788375854492, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8574622869491577, "num_tokens": 475158237.0, "step": 12455 }, { "epoch": 1.5845312301233938, "ewc_loss": 0.05903128534555435, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027537145069800317, "grad_norm": 7.073155403137207, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8654600977897644, "num_tokens": 475194966.0, "step": 12456 }, { "epoch": 1.5846584404019843, "ewc_loss": 0.05909635126590729, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002760220959316939, "grad_norm": 7.161527633666992, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8638854622840881, "num_tokens": 475234483.0, "step": 12457 }, { "epoch": 1.5847856506805749, "ewc_loss": 0.05902621150016785, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027532069361768663, "grad_norm": 7.069057464599609, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8597373366355896, "num_tokens": 475275172.0, "step": 12458 }, { "epoch": 1.5849128609591654, "ewc_loss": 0.05910984054207802, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027615699218586087, "grad_norm": 7.103667736053467, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8687556385993958, "num_tokens": 475313550.0, "step": 12459 }, { "epoch": 1.585040071237756, "ewc_loss": 0.059026654809713364, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002753251465037465, "grad_norm": 7.00770902633667, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8719344139099121, "num_tokens": 475357019.0, "step": 12460 }, { "epoch": 1.5851672815163464, "ewc_loss": 0.05927548184990883, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002778134075924754, "grad_norm": 7.108235836029053, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8686877489089966, "num_tokens": 475394788.0, "step": 12461 }, { "epoch": 1.585294491794937, "ewc_loss": 0.059075843542814255, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027581703034229577, "grad_norm": 7.088198661804199, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.861653208732605, "num_tokens": 475432354.0, "step": 12462 }, { "epoch": 1.5854217020735275, "ewc_loss": 0.05931747704744339, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002782333758659661, "grad_norm": 7.099915504455566, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8676004409790039, "num_tokens": 475467284.0, "step": 12463 }, { "epoch": 1.585548912352118, "ewc_loss": 0.05918949842453003, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027695359312929213, "grad_norm": 7.126615524291992, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8562837839126587, "num_tokens": 475505946.0, "step": 12464 }, { "epoch": 1.5856761226307086, "ewc_loss": 0.05922897905111313, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027734835748560727, "grad_norm": 7.125073432922363, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8508766889572144, "num_tokens": 475542799.0, "step": 12465 }, { "epoch": 1.585803332909299, "ewc_loss": 0.05908088758587837, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758674672804773, "grad_norm": 7.101008415222168, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8636184334754944, "num_tokens": 475582064.0, "step": 12466 }, { "epoch": 1.5859305431878896, "ewc_loss": 0.05914226919412613, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027648129616864026, "grad_norm": 7.142932891845703, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8602001667022705, "num_tokens": 475615521.0, "step": 12467 }, { "epoch": 1.5860577534664801, "ewc_loss": 0.05902648717164993, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002753234875854105, "grad_norm": 7.068914890289307, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8723932504653931, "num_tokens": 475650500.0, "step": 12468 }, { "epoch": 1.5861849637450707, "ewc_loss": 0.059145476669073105, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002765133685898036, "grad_norm": 7.098196506500244, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8648990392684937, "num_tokens": 475695824.0, "step": 12469 }, { "epoch": 1.5863121740236612, "ewc_loss": 0.05912007391452789, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027625932125374675, "grad_norm": 7.136335849761963, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.8422282934188843, "num_tokens": 475734015.0, "step": 12470 }, { "epoch": 1.5864393843022517, "ewc_loss": 0.059028349816799164, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002753421140369028, "grad_norm": 7.086616516113281, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.861675500869751, "num_tokens": 475768961.0, "step": 12471 }, { "epoch": 1.5865665945808423, "ewc_loss": 0.05911768972873688, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002762354852166027, "grad_norm": 7.089077472686768, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.8796583414077759, "num_tokens": 475806551.0, "step": 12472 }, { "epoch": 1.5866938048594328, "ewc_loss": 0.05910017713904381, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002760603674687445, "grad_norm": 7.122721195220947, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8583629131317139, "num_tokens": 475844066.0, "step": 12473 }, { "epoch": 1.5868210151380233, "ewc_loss": 0.059106819331645966, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002761267824098468, "grad_norm": 7.146543025970459, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8688621520996094, "num_tokens": 475874554.0, "step": 12474 }, { "epoch": 1.5869482254166136, "ewc_loss": 0.05910786613821983, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002761372597888112, "grad_norm": 7.059039115905762, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8710530996322632, "num_tokens": 475915840.0, "step": 12475 }, { "epoch": 1.5870754356952042, "ewc_loss": 0.0591861754655838, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002769203274510801, "grad_norm": 7.11652135848999, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8626481294631958, "num_tokens": 475951516.0, "step": 12476 }, { "epoch": 1.5872026459737947, "ewc_loss": 0.059098146855831146, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002760400820989162, "grad_norm": 7.091518878936768, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8674228191375732, "num_tokens": 475984224.0, "step": 12477 }, { "epoch": 1.5873298562523852, "ewc_loss": 0.059186894446611404, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027692754520103335, "grad_norm": 7.1669111251831055, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.843459963798523, "num_tokens": 476015758.0, "step": 12478 }, { "epoch": 1.5874570665309757, "ewc_loss": 0.05904529243707657, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000275511498330161, "grad_norm": 7.070331573486328, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8605529069900513, "num_tokens": 476051835.0, "step": 12479 }, { "epoch": 1.5875842768095663, "ewc_loss": 0.05912266671657562, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002762852527666837, "grad_norm": 7.097822666168213, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8603972792625427, "num_tokens": 476090667.0, "step": 12480 }, { "epoch": 1.5877114870881566, "ewc_loss": 0.0590888187289238, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002759467752184719, "grad_norm": 7.082107067108154, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8620874881744385, "num_tokens": 476130448.0, "step": 12481 }, { "epoch": 1.587838697366747, "ewc_loss": 0.05921854078769684, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002772440202534199, "grad_norm": 7.072676658630371, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8536967039108276, "num_tokens": 476174495.0, "step": 12482 }, { "epoch": 1.5879659076453376, "ewc_loss": 0.05930142104625702, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027563137700781226, "grad_norm": 7.042751312255859, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8727788329124451, "num_tokens": 476213412.0, "step": 12483 }, { "epoch": 1.5880931179239282, "ewc_loss": 0.05946999043226242, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027731709997169673, "grad_norm": 7.111155033111572, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8779147863388062, "num_tokens": 476250441.0, "step": 12484 }, { "epoch": 1.5882203282025187, "ewc_loss": 0.059372447431087494, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027634165599010885, "grad_norm": 7.058226585388184, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8774625062942505, "num_tokens": 476284466.0, "step": 12485 }, { "epoch": 1.5883475384811092, "ewc_loss": 0.05942751467227936, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027689235867001116, "grad_norm": 7.069493770599365, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8622408509254456, "num_tokens": 476327760.0, "step": 12486 }, { "epoch": 1.5884747487596997, "ewc_loss": 0.05916929244995117, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027675149613060057, "grad_norm": 7.090010166168213, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.859856367111206, "num_tokens": 476369259.0, "step": 12487 }, { "epoch": 1.5886019590382903, "ewc_loss": 0.05920923501253128, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027715094620361924, "grad_norm": 7.102200508117676, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8658292293548584, "num_tokens": 476403915.0, "step": 12488 }, { "epoch": 1.5887291693168808, "ewc_loss": 0.05913955718278885, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002764541422948241, "grad_norm": 7.111321449279785, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8617130517959595, "num_tokens": 476440837.0, "step": 12489 }, { "epoch": 1.5888563795954713, "ewc_loss": 0.05911078304052353, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002761664509307593, "grad_norm": 7.0508623123168945, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.862369179725647, "num_tokens": 476478478.0, "step": 12490 }, { "epoch": 1.5889835898740619, "ewc_loss": 0.059161778539419174, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027667637914419174, "grad_norm": 7.090637683868408, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8724820613861084, "num_tokens": 476514675.0, "step": 12491 }, { "epoch": 1.5891108001526524, "ewc_loss": 0.059424348175525665, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002768606645986438, "grad_norm": 7.0185136795043945, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8607032299041748, "num_tokens": 476559015.0, "step": 12492 }, { "epoch": 1.589238010431243, "ewc_loss": 0.05952320247888565, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002778492053039372, "grad_norm": 7.134083271026611, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8489266633987427, "num_tokens": 476601848.0, "step": 12493 }, { "epoch": 1.5893652207098334, "ewc_loss": 0.0594247467815876, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027686465182341635, "grad_norm": 7.056009769439697, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8724241256713867, "num_tokens": 476636794.0, "step": 12494 }, { "epoch": 1.589492430988424, "ewc_loss": 0.059282176196575165, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027788037550635636, "grad_norm": 7.1635050773620605, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8579031229019165, "num_tokens": 476671593.0, "step": 12495 }, { "epoch": 1.5896196412670145, "ewc_loss": 0.05922140181064606, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027727262931875885, "grad_norm": 7.07846212387085, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8740589022636414, "num_tokens": 476706776.0, "step": 12496 }, { "epoch": 1.589746851545605, "ewc_loss": 0.05921090021729469, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002771675935946405, "grad_norm": 7.126774787902832, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8628462553024292, "num_tokens": 476746700.0, "step": 12497 }, { "epoch": 1.5898740618241956, "ewc_loss": 0.059218890964984894, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027724748360924423, "grad_norm": 7.086514472961426, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8696103096008301, "num_tokens": 476785837.0, "step": 12498 }, { "epoch": 1.5900012721027859, "ewc_loss": 0.05951434373855591, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002777606132440269, "grad_norm": 7.146470069885254, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8696037530899048, "num_tokens": 476818894.0, "step": 12499 }, { "epoch": 1.5901284823813764, "ewc_loss": 0.059418998658657074, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002768071717582643, "grad_norm": 7.047428131103516, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8462055921554565, "num_tokens": 476861773.0, "step": 12500 }, { "epoch": 1.590255692659967, "ewc_loss": 0.05953732132911682, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027799041708931327, "grad_norm": 7.118365287780762, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8689835667610168, "num_tokens": 476899522.0, "step": 12501 }, { "epoch": 1.5903829029385574, "ewc_loss": 0.059403181076049805, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002766490215435624, "grad_norm": 7.102361679077148, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8570191264152527, "num_tokens": 476939923.0, "step": 12502 }, { "epoch": 1.590510113217148, "ewc_loss": 0.059204258024692535, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002771011786535382, "grad_norm": 7.123013496398926, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8640203475952148, "num_tokens": 476979867.0, "step": 12503 }, { "epoch": 1.5906373234957385, "ewc_loss": 0.059207476675510406, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002771333674900234, "grad_norm": 7.08890438079834, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8721950054168701, "num_tokens": 477016068.0, "step": 12504 }, { "epoch": 1.5907645337743288, "ewc_loss": 0.05921562388539314, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027721482911147177, "grad_norm": 7.154813766479492, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8603991270065308, "num_tokens": 477054526.0, "step": 12505 }, { "epoch": 1.5908917440529193, "ewc_loss": 0.05914050713181496, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002764636592473835, "grad_norm": 7.111695766448975, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.858498215675354, "num_tokens": 477098029.0, "step": 12506 }, { "epoch": 1.5910189543315099, "ewc_loss": 0.05914933234453201, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027655193116515875, "grad_norm": 7.100205898284912, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8697859048843384, "num_tokens": 477137070.0, "step": 12507 }, { "epoch": 1.5911461646101004, "ewc_loss": 0.059111177921295166, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027617037994787097, "grad_norm": 7.130825519561768, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8628934621810913, "num_tokens": 477175640.0, "step": 12508 }, { "epoch": 1.591273374888691, "ewc_loss": 0.058998677879571915, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027504537138156593, "grad_norm": 7.081722736358643, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8652409315109253, "num_tokens": 477210935.0, "step": 12509 }, { "epoch": 1.5914005851672814, "ewc_loss": 0.059169091284275055, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002767495170701295, "grad_norm": 7.131394386291504, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8708527684211731, "num_tokens": 477251567.0, "step": 12510 }, { "epoch": 1.591527795445872, "ewc_loss": 0.05893046781420708, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027436326490715146, "grad_norm": 7.037312030792236, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.866517961025238, "num_tokens": 477287964.0, "step": 12511 }, { "epoch": 1.5916550057244625, "ewc_loss": 0.059198036789894104, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027703895466402173, "grad_norm": 7.1738152503967285, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8392865061759949, "num_tokens": 477328660.0, "step": 12512 }, { "epoch": 1.591782216003053, "ewc_loss": 0.059042736887931824, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027548594516701996, "grad_norm": 7.079819679260254, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8681040406227112, "num_tokens": 477364111.0, "step": 12513 }, { "epoch": 1.5919094262816436, "ewc_loss": 0.05918954312801361, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000276954029686749, "grad_norm": 7.133893966674805, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8708083629608154, "num_tokens": 477402691.0, "step": 12514 }, { "epoch": 1.592036636560234, "ewc_loss": 0.05916115641593933, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000276670150924474, "grad_norm": 7.127553462982178, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8777452707290649, "num_tokens": 477437526.0, "step": 12515 }, { "epoch": 1.5921638468388246, "ewc_loss": 0.05919088423252106, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027696744655258954, "grad_norm": 7.104547023773193, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8623546361923218, "num_tokens": 477479175.0, "step": 12516 }, { "epoch": 1.5922910571174151, "ewc_loss": 0.05915677547454834, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002766263496596366, "grad_norm": 7.13745641708374, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8664470314979553, "num_tokens": 477516896.0, "step": 12517 }, { "epoch": 1.5924182673960057, "ewc_loss": 0.0591779425740242, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000276837992714718, "grad_norm": 7.1256794929504395, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8545918464660645, "num_tokens": 477555543.0, "step": 12518 }, { "epoch": 1.5925454776745962, "ewc_loss": 0.0591595433652401, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027665402740240097, "grad_norm": 7.147592544555664, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8758995532989502, "num_tokens": 477592251.0, "step": 12519 }, { "epoch": 1.5926726879531867, "ewc_loss": 0.05907934531569481, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758520422503352, "grad_norm": 7.13090705871582, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8506733179092407, "num_tokens": 477630039.0, "step": 12520 }, { "epoch": 1.5927998982317773, "ewc_loss": 0.05918095260858536, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027686814428307116, "grad_norm": 7.151073455810547, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.861473560333252, "num_tokens": 477661028.0, "step": 12521 }, { "epoch": 1.5929271085103678, "ewc_loss": 0.05913867801427841, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002764453529380262, "grad_norm": 7.146000862121582, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8778074979782104, "num_tokens": 477699612.0, "step": 12522 }, { "epoch": 1.5930543187889583, "ewc_loss": 0.05910222977399826, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027608091477304697, "grad_norm": 7.10231351852417, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8596404790878296, "num_tokens": 477737300.0, "step": 12523 }, { "epoch": 1.5931815290675486, "ewc_loss": 0.059141356498003006, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027647215756587684, "grad_norm": 7.124577522277832, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8520287275314331, "num_tokens": 477776031.0, "step": 12524 }, { "epoch": 1.5933087393461391, "ewc_loss": 0.059184517711400986, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002769037673715502, "grad_norm": 7.0892534255981445, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8655675649642944, "num_tokens": 477813640.0, "step": 12525 }, { "epoch": 1.5934359496247297, "ewc_loss": 0.059130214154720306, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027636074810288846, "grad_norm": 7.11326789855957, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.860550582408905, "num_tokens": 477848652.0, "step": 12526 }, { "epoch": 1.5935631599033202, "ewc_loss": 0.05917419120669365, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002768005069810897, "grad_norm": 7.0949602127075195, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8607710599899292, "num_tokens": 477883710.0, "step": 12527 }, { "epoch": 1.5936903701819107, "ewc_loss": 0.05923108756542206, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002773694577626884, "grad_norm": 7.109654903411865, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8576615452766418, "num_tokens": 477920316.0, "step": 12528 }, { "epoch": 1.5938175804605013, "ewc_loss": 0.059248603880405426, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002775446337182075, "grad_norm": 7.089495658874512, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8727136850357056, "num_tokens": 477955523.0, "step": 12529 }, { "epoch": 1.5939447907390916, "ewc_loss": 0.059259526431560516, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027765383129008114, "grad_norm": 7.1038126945495605, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8556911945343018, "num_tokens": 477995768.0, "step": 12530 }, { "epoch": 1.594072001017682, "ewc_loss": 0.059273384511470795, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027779245283454657, "grad_norm": 7.116377830505371, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8639228940010071, "num_tokens": 478037718.0, "step": 12531 }, { "epoch": 1.5941992112962726, "ewc_loss": 0.05920421704649925, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027710077119991183, "grad_norm": 7.083685874938965, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8519288301467896, "num_tokens": 478077388.0, "step": 12532 }, { "epoch": 1.5943264215748632, "ewc_loss": 0.05925098806619644, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027756846975535154, "grad_norm": 7.08892297744751, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8782464265823364, "num_tokens": 478109494.0, "step": 12533 }, { "epoch": 1.5944536318534537, "ewc_loss": 0.059511274099349976, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002777299378067255, "grad_norm": 7.1469407081604, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8588727712631226, "num_tokens": 478142420.0, "step": 12534 }, { "epoch": 1.5945808421320442, "ewc_loss": 0.059494197368621826, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002775591565296054, "grad_norm": 7.098850250244141, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8678364157676697, "num_tokens": 478183848.0, "step": 12535 }, { "epoch": 1.5947080524106347, "ewc_loss": 0.059279002249240875, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027784862322732806, "grad_norm": 7.133154392242432, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8726162910461426, "num_tokens": 478225242.0, "step": 12536 }, { "epoch": 1.5948352626892253, "ewc_loss": 0.059480682015419006, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002774240274447948, "grad_norm": 7.094261646270752, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8699511885643005, "num_tokens": 478263167.0, "step": 12537 }, { "epoch": 1.5949624729678158, "ewc_loss": 0.05923536792397499, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027741226949729025, "grad_norm": 7.178643703460693, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8651667833328247, "num_tokens": 478297752.0, "step": 12538 }, { "epoch": 1.5950896832464063, "ewc_loss": 0.05905719846487045, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027563059120438993, "grad_norm": 7.066020965576172, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8663036823272705, "num_tokens": 478332955.0, "step": 12539 }, { "epoch": 1.5952168935249968, "ewc_loss": 0.059339992702007294, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002784585230983794, "grad_norm": 7.188548564910889, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8427679538726807, "num_tokens": 478367298.0, "step": 12540 }, { "epoch": 1.5953441038035874, "ewc_loss": 0.059093136340379715, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002759899653028697, "grad_norm": 7.019859790802002, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8686187267303467, "num_tokens": 478411183.0, "step": 12541 }, { "epoch": 1.595471314082178, "ewc_loss": 0.059437185525894165, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027943047462031245, "grad_norm": 7.192136764526367, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8603889346122742, "num_tokens": 478451184.0, "step": 12542 }, { "epoch": 1.5955985243607684, "ewc_loss": 0.0591595433652401, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027665402740240097, "grad_norm": 7.049432277679443, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8591166734695435, "num_tokens": 478490913.0, "step": 12543 }, { "epoch": 1.595725734639359, "ewc_loss": 0.059335291385650635, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002784114913083613, "grad_norm": 7.125877380371094, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8649380803108215, "num_tokens": 478529769.0, "step": 12544 }, { "epoch": 1.5958529449179495, "ewc_loss": 0.05908362567424774, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758948248811066, "grad_norm": 7.086706638336182, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8601373434066772, "num_tokens": 478566665.0, "step": 12545 }, { "epoch": 1.59598015519654, "ewc_loss": 0.059276193380355835, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002778205380309373, "grad_norm": 7.143495559692383, "learning_rate": 1e-06, "loss": 0.3717, "mean_token_accuracy": 0.8908329606056213, "num_tokens": 478600814.0, "step": 12546 }, { "epoch": 1.5961073654751305, "ewc_loss": 0.05917989835143089, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027685757959261537, "grad_norm": 7.139926433563232, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8685439229011536, "num_tokens": 478641610.0, "step": 12547 }, { "epoch": 1.5962345757537209, "ewc_loss": 0.059124402701854706, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027630262775346637, "grad_norm": 7.137785911560059, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.87216717004776, "num_tokens": 478678183.0, "step": 12548 }, { "epoch": 1.5963617860323114, "ewc_loss": 0.05921385437250137, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002771971339825541, "grad_norm": 7.176589012145996, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8651418685913086, "num_tokens": 478710881.0, "step": 12549 }, { "epoch": 1.596488996310902, "ewc_loss": 0.05907868221402168, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758454065769911, "grad_norm": 7.089565753936768, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8776017427444458, "num_tokens": 478748550.0, "step": 12550 }, { "epoch": 1.5966162065894924, "ewc_loss": 0.05917955935001373, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002768542035482824, "grad_norm": 7.1601881980896, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8528238534927368, "num_tokens": 478790417.0, "step": 12551 }, { "epoch": 1.596743416868083, "ewc_loss": 0.05912186950445175, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027627727831713855, "grad_norm": 7.056962490081787, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.870718777179718, "num_tokens": 478828856.0, "step": 12552 }, { "epoch": 1.5968706271466735, "ewc_loss": 0.05921248346567154, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002771834551822394, "grad_norm": 7.1315717697143555, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8620964288711548, "num_tokens": 478870240.0, "step": 12553 }, { "epoch": 1.5969978374252638, "ewc_loss": 0.05911264196038246, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002761850191745907, "grad_norm": 7.069060325622559, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8548504114151001, "num_tokens": 478911920.0, "step": 12554 }, { "epoch": 1.5971250477038543, "ewc_loss": 0.0592941977083683, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027800057432614267, "grad_norm": 7.222714900970459, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8571223020553589, "num_tokens": 478949083.0, "step": 12555 }, { "epoch": 1.5972522579824449, "ewc_loss": 0.059080496430397034, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758635382633656, "grad_norm": 7.108259677886963, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.862326979637146, "num_tokens": 478984087.0, "step": 12556 }, { "epoch": 1.5973794682610354, "ewc_loss": 0.05918511748313904, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002769097627606243, "grad_norm": 7.160892009735107, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.844659686088562, "num_tokens": 479019364.0, "step": 12557 }, { "epoch": 1.597506678539626, "ewc_loss": 0.05909072235226631, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002759658091235906, "grad_norm": 7.10376501083374, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8666539192199707, "num_tokens": 479059002.0, "step": 12558 }, { "epoch": 1.5976338888182164, "ewc_loss": 0.0590762123465538, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002758206974249333, "grad_norm": 7.133732795715332, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8636696338653564, "num_tokens": 479092676.0, "step": 12559 }, { "epoch": 1.597761099096807, "ewc_loss": 0.059047020971775055, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002755287860054523, "grad_norm": 7.13614559173584, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8677984476089478, "num_tokens": 479125951.0, "step": 12560 }, { "epoch": 1.5978883093753975, "ewc_loss": 0.05915513634681702, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000276609935099259, "grad_norm": 7.0725555419921875, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8802583813667297, "num_tokens": 479166230.0, "step": 12561 }, { "epoch": 1.598015519653988, "ewc_loss": 0.0594823956489563, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002774411696009338, "grad_norm": 11.426950454711914, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8601434230804443, "num_tokens": 479205813.0, "step": 12562 }, { "epoch": 1.5981427299325786, "ewc_loss": 0.06456255912780762, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0003306842118036002, "grad_norm": 7.706435680389404, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8606557846069336, "num_tokens": 479244960.0, "step": 12563 }, { "epoch": 1.598269940211169, "ewc_loss": 0.058923542499542236, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027429399779066443, "grad_norm": 7.1159539222717285, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8706791400909424, "num_tokens": 479289617.0, "step": 12564 }, { "epoch": 1.5983971504897596, "ewc_loss": 0.06016945838928223, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028431176906451583, "grad_norm": 7.3490095138549805, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8639532327651978, "num_tokens": 479326377.0, "step": 12565 }, { "epoch": 1.5985243607683501, "ewc_loss": 0.059728436172008514, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00028234298224560916, "grad_norm": 7.227302074432373, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8628224730491638, "num_tokens": 479360563.0, "step": 12566 }, { "epoch": 1.5986515710469407, "ewc_loss": 0.05954258516430855, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002804844407364726, "grad_norm": 7.22028923034668, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8750371932983398, "num_tokens": 479403042.0, "step": 12567 }, { "epoch": 1.5987787813255312, "ewc_loss": 0.059408023953437805, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002791388251353055, "grad_norm": 7.119504928588867, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8699238300323486, "num_tokens": 479437099.0, "step": 12568 }, { "epoch": 1.5989059916041217, "ewc_loss": 0.05952989310026169, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002803575189318508, "grad_norm": 7.2381463050842285, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8622654676437378, "num_tokens": 479479326.0, "step": 12569 }, { "epoch": 1.5990332018827123, "ewc_loss": 0.05940241366624832, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002790827420540154, "grad_norm": 7.180325031280518, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8454675674438477, "num_tokens": 479515320.0, "step": 12570 }, { "epoch": 1.5991604121613028, "ewc_loss": 0.05942753702402115, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002793339663185179, "grad_norm": 7.1614990234375, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8531543612480164, "num_tokens": 479559960.0, "step": 12571 }, { "epoch": 1.5992876224398933, "ewc_loss": 0.059353530406951904, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002785939141176641, "grad_norm": 7.17002010345459, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8643394708633423, "num_tokens": 479599115.0, "step": 12572 }, { "epoch": 1.5994148327184836, "ewc_loss": 0.059334151446819305, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027840008260682225, "grad_norm": 7.149155139923096, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8724817633628845, "num_tokens": 479636116.0, "step": 12573 }, { "epoch": 1.5995420429970741, "ewc_loss": 0.05933757498860359, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027843433781526983, "grad_norm": 7.163092136383057, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8550532460212708, "num_tokens": 479676880.0, "step": 12574 }, { "epoch": 1.5996692532756647, "ewc_loss": 0.05936963111162186, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002787548874039203, "grad_norm": 7.2093353271484375, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8656591773033142, "num_tokens": 479717424.0, "step": 12575 }, { "epoch": 1.5997964635542552, "ewc_loss": 0.0592719241976738, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002777778427116573, "grad_norm": 7.176672458648682, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8629242181777954, "num_tokens": 479756240.0, "step": 12576 }, { "epoch": 1.5999236738328457, "ewc_loss": 0.05940008908510208, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027905945898965, "grad_norm": 7.191638469696045, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8500873446464539, "num_tokens": 479791863.0, "step": 12577 }, { "epoch": 1.6000508841114363, "ewc_loss": 0.05915975198149681, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027665612287819386, "grad_norm": 7.1714701652526855, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8743420839309692, "num_tokens": 479827252.0, "step": 12578 }, { "epoch": 1.6001780943900266, "ewc_loss": 0.05921194702386856, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002771780709736049, "grad_norm": 7.162635326385498, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8613050580024719, "num_tokens": 479863100.0, "step": 12579 }, { "epoch": 1.600305304668617, "ewc_loss": 0.059195324778556824, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027701182989403605, "grad_norm": 7.2056145668029785, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8531062602996826, "num_tokens": 479899684.0, "step": 12580 }, { "epoch": 1.6004325149472076, "ewc_loss": 0.059173598885536194, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027679456979967654, "grad_norm": 7.212258815765381, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.865654468536377, "num_tokens": 479937656.0, "step": 12581 }, { "epoch": 1.6005597252257981, "ewc_loss": 0.05915757268667221, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027663432410918176, "grad_norm": 7.179372310638428, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8580795526504517, "num_tokens": 479973357.0, "step": 12582 }, { "epoch": 1.6006869355043887, "ewc_loss": 0.05914713442325592, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027652992866933346, "grad_norm": 7.14000129699707, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8647633790969849, "num_tokens": 480017862.0, "step": 12583 }, { "epoch": 1.6008141457829792, "ewc_loss": 0.05923096835613251, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027736826450563967, "grad_norm": 7.192224979400635, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8689785003662109, "num_tokens": 480057039.0, "step": 12584 }, { "epoch": 1.6009413560615697, "ewc_loss": 0.05906892940402031, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027574787964113057, "grad_norm": 7.093148708343506, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8674345016479492, "num_tokens": 480093396.0, "step": 12585 }, { "epoch": 1.6010685663401603, "ewc_loss": 0.0593084990978241, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027814359054900706, "grad_norm": 7.191239356994629, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8606857657432556, "num_tokens": 480125200.0, "step": 12586 }, { "epoch": 1.6011957766187508, "ewc_loss": 0.05913309007883072, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002763895026873797, "grad_norm": 7.194627285003662, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8598968982696533, "num_tokens": 480158085.0, "step": 12587 }, { "epoch": 1.6013229868973413, "ewc_loss": 0.0592615082859993, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027767368010245264, "grad_norm": 7.124186038970947, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8792808055877686, "num_tokens": 480192979.0, "step": 12588 }, { "epoch": 1.6014501971759318, "ewc_loss": 0.059237897396087646, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002774375898297876, "grad_norm": 7.120118141174316, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8749663829803467, "num_tokens": 480227984.0, "step": 12589 }, { "epoch": 1.6015774074545224, "ewc_loss": 0.05925743281841278, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002776329347398132, "grad_norm": 7.134249210357666, "learning_rate": 1e-06, "loss": 0.4009, "mean_token_accuracy": 0.8812434077262878, "num_tokens": 480264280.0, "step": 12590 }, { "epoch": 1.601704617733113, "ewc_loss": 0.05928283929824829, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002778870111797005, "grad_norm": 7.144376754760742, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8493179082870483, "num_tokens": 480300670.0, "step": 12591 }, { "epoch": 1.6018318280117034, "ewc_loss": 0.059227488934993744, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002773334563244134, "grad_norm": 7.1538004875183105, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8629181385040283, "num_tokens": 480334460.0, "step": 12592 }, { "epoch": 1.601959038290294, "ewc_loss": 0.05929500609636307, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002780086360871792, "grad_norm": 7.0827107429504395, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8734709024429321, "num_tokens": 480376066.0, "step": 12593 }, { "epoch": 1.6020862485688845, "ewc_loss": 0.05935864895582199, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002786451077554375, "grad_norm": 7.124596118927002, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.870677649974823, "num_tokens": 480417575.0, "step": 12594 }, { "epoch": 1.602213458847475, "ewc_loss": 0.05933205783367157, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002783791860565543, "grad_norm": 7.135998249053955, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8552550077438354, "num_tokens": 480454106.0, "step": 12595 }, { "epoch": 1.6023406691260655, "ewc_loss": 0.059339310973882675, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002784517128020525, "grad_norm": 7.141581058502197, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8515850305557251, "num_tokens": 480493031.0, "step": 12596 }, { "epoch": 1.6024678794046558, "ewc_loss": 0.059230536222457886, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002773639571387321, "grad_norm": 7.102654457092285, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.864395260810852, "num_tokens": 480528326.0, "step": 12597 }, { "epoch": 1.6025950896832464, "ewc_loss": 0.059383343905210495, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002788920246530324, "grad_norm": 7.135222434997559, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8710961937904358, "num_tokens": 480565399.0, "step": 12598 }, { "epoch": 1.602722299961837, "ewc_loss": 0.0592927448451519, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027798605151474476, "grad_norm": 7.104979991912842, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8795337677001953, "num_tokens": 480600028.0, "step": 12599 }, { "epoch": 1.6028495102404274, "ewc_loss": 0.059335313737392426, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027841172413900495, "grad_norm": 7.117722511291504, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8643490076065063, "num_tokens": 480635759.0, "step": 12600 }, { "epoch": 1.602976720519018, "ewc_loss": 0.05951908603310585, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002778080524876714, "grad_norm": 11.489481925964355, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8701065182685852, "num_tokens": 480669528.0, "step": 12601 }, { "epoch": 1.6031039307976085, "ewc_loss": 0.06474722176790237, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0003300894168205559, "grad_norm": 7.685711860656738, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8619623184204102, "num_tokens": 480709425.0, "step": 12602 }, { "epoch": 1.6032311410761988, "ewc_loss": 0.05923838168382645, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027500101714394987, "grad_norm": 7.191010475158691, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8495651483535767, "num_tokens": 480745010.0, "step": 12603 }, { "epoch": 1.6033583513547893, "ewc_loss": 0.05998138338327408, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002848724543582648, "grad_norm": 7.281471252441406, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8710982799530029, "num_tokens": 480780320.0, "step": 12604 }, { "epoch": 1.6034855616333799, "ewc_loss": 0.06022056192159653, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002848228323273361, "grad_norm": 7.157053470611572, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8713140487670898, "num_tokens": 480819335.0, "step": 12605 }, { "epoch": 1.6036127719119704, "ewc_loss": 0.06048733741044998, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028260776889510453, "grad_norm": 7.300893306732178, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.876607358455658, "num_tokens": 480852702.0, "step": 12606 }, { "epoch": 1.603739982190561, "ewc_loss": 0.05954549461603165, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002805135154630989, "grad_norm": 7.167355060577393, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8585600852966309, "num_tokens": 480890258.0, "step": 12607 }, { "epoch": 1.6038671924691514, "ewc_loss": 0.05968587473034859, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00028191733872517943, "grad_norm": 7.220295429229736, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8642842173576355, "num_tokens": 480929555.0, "step": 12608 }, { "epoch": 1.603994402747742, "ewc_loss": 0.05950348451733589, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002800934307742864, "grad_norm": 7.100635528564453, "learning_rate": 1e-06, "loss": 0.3882, "mean_token_accuracy": 0.8873140811920166, "num_tokens": 480970295.0, "step": 12609 }, { "epoch": 1.6041216130263325, "ewc_loss": 0.05962586775422096, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002813172759488225, "grad_norm": 7.1710662841796875, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8663232326507568, "num_tokens": 481011213.0, "step": 12610 }, { "epoch": 1.604248823304923, "ewc_loss": 0.059442222118377686, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002794808242470026, "grad_norm": 7.227705478668213, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8587352633476257, "num_tokens": 481045273.0, "step": 12611 }, { "epoch": 1.6043760335835135, "ewc_loss": 0.059446729719638824, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002795259060803801, "grad_norm": 7.122000694274902, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8440044522285461, "num_tokens": 481084567.0, "step": 12612 }, { "epoch": 1.604503243862104, "ewc_loss": 0.05942978337407112, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002793564344756305, "grad_norm": 7.1755523681640625, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8660100698471069, "num_tokens": 481119455.0, "step": 12613 }, { "epoch": 1.6046304541406946, "ewc_loss": 0.059418484568595886, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000279243424301967, "grad_norm": 7.2109222412109375, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8494723439216614, "num_tokens": 481154718.0, "step": 12614 }, { "epoch": 1.6047576644192851, "ewc_loss": 0.05937571078538895, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002788156853057444, "grad_norm": 7.129491806030273, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8582721948623657, "num_tokens": 481193716.0, "step": 12615 }, { "epoch": 1.6048848746978757, "ewc_loss": 0.05937263369560242, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027878492255695164, "grad_norm": 7.1839823722839355, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8405292630195618, "num_tokens": 481231378.0, "step": 12616 }, { "epoch": 1.6050120849764662, "ewc_loss": 0.059349752962589264, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002785561082419008, "grad_norm": 7.12504243850708, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8619363307952881, "num_tokens": 481267907.0, "step": 12617 }, { "epoch": 1.6051392952550567, "ewc_loss": 0.05940229818224907, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027908157790079713, "grad_norm": 7.138832092285156, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8654419183731079, "num_tokens": 481309899.0, "step": 12618 }, { "epoch": 1.6052665055336472, "ewc_loss": 0.05939587205648422, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002790172875393182, "grad_norm": 7.148496150970459, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8741738796234131, "num_tokens": 481348845.0, "step": 12619 }, { "epoch": 1.6053937158122378, "ewc_loss": 0.05942115932703018, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002792701998259872, "grad_norm": 7.078307151794434, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8556803464889526, "num_tokens": 481384999.0, "step": 12620 }, { "epoch": 1.6055209260908283, "ewc_loss": 0.05939618870615959, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027902048896066844, "grad_norm": 7.157527446746826, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8671848773956299, "num_tokens": 481417287.0, "step": 12621 }, { "epoch": 1.6056481363694186, "ewc_loss": 0.05936775356531143, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027873614453710616, "grad_norm": 7.119797706604004, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8682183623313904, "num_tokens": 481455943.0, "step": 12622 }, { "epoch": 1.6057753466480091, "ewc_loss": 0.0594705268740654, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002797638881020248, "grad_norm": 7.138509750366211, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8799821734428406, "num_tokens": 481492017.0, "step": 12623 }, { "epoch": 1.6059025569265997, "ewc_loss": 0.05966565012931824, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027927366318181157, "grad_norm": 7.105171203613281, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8572980761528015, "num_tokens": 481535115.0, "step": 12624 }, { "epoch": 1.6060297672051902, "ewc_loss": 0.05935804545879364, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002786390541587025, "grad_norm": 7.08589506149292, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8487634658813477, "num_tokens": 481580048.0, "step": 12625 }, { "epoch": 1.6061569774837807, "ewc_loss": 0.05949530005455017, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00028001159080304205, "grad_norm": 7.1287841796875, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.875052809715271, "num_tokens": 481616748.0, "step": 12626 }, { "epoch": 1.6062841877623713, "ewc_loss": 0.059297528117895126, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027803386910818517, "grad_norm": 7.096983432769775, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8353931903839111, "num_tokens": 481659123.0, "step": 12627 }, { "epoch": 1.6064113980409616, "ewc_loss": 0.059495359659194946, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002800121728796512, "grad_norm": 7.13645076751709, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8550013303756714, "num_tokens": 481704791.0, "step": 12628 }, { "epoch": 1.606538608319552, "ewc_loss": 0.05940311402082443, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000279089726973325, "grad_norm": 7.146872043609619, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.8513796329498291, "num_tokens": 481742913.0, "step": 12629 }, { "epoch": 1.6066658185981426, "ewc_loss": 0.05962805077433586, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002788976998999715, "grad_norm": 7.093120574951172, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8515074849128723, "num_tokens": 481783549.0, "step": 12630 }, { "epoch": 1.6067930288767331, "ewc_loss": 0.05942773446440697, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000279335945378989, "grad_norm": 7.175965309143066, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8692110180854797, "num_tokens": 481818486.0, "step": 12631 }, { "epoch": 1.6069202391553237, "ewc_loss": 0.0593143068253994, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027820165269076824, "grad_norm": 7.13899564743042, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8647314310073853, "num_tokens": 481849969.0, "step": 12632 }, { "epoch": 1.6070474494339142, "ewc_loss": 0.05941620469093323, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002792206360027194, "grad_norm": 7.152711868286133, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.847549319267273, "num_tokens": 481886108.0, "step": 12633 }, { "epoch": 1.6071746597125047, "ewc_loss": 0.059318892657756805, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027824752032756805, "grad_norm": 7.092204570770264, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8753150701522827, "num_tokens": 481919389.0, "step": 12634 }, { "epoch": 1.6073018699910953, "ewc_loss": 0.05935373157262802, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027859589317813516, "grad_norm": 7.1258015632629395, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8772108554840088, "num_tokens": 481958531.0, "step": 12635 }, { "epoch": 1.6074290802696858, "ewc_loss": 0.05939562991261482, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027901490102522075, "grad_norm": 7.121945858001709, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8593478202819824, "num_tokens": 481999864.0, "step": 12636 }, { "epoch": 1.6075562905482763, "ewc_loss": 0.059599410742521286, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027861128910444677, "grad_norm": 7.131323337554932, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8874896764755249, "num_tokens": 482033890.0, "step": 12637 }, { "epoch": 1.6076835008268668, "ewc_loss": 0.05936764180660248, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002787350385915488, "grad_norm": 7.148211479187012, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8686602711677551, "num_tokens": 482071186.0, "step": 12638 }, { "epoch": 1.6078107111054574, "ewc_loss": 0.05929344892501831, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027799306553788483, "grad_norm": 7.126926422119141, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8628052473068237, "num_tokens": 482107218.0, "step": 12639 }, { "epoch": 1.607937921384048, "ewc_loss": 0.05959713086485863, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027858850080519915, "grad_norm": 7.083436489105225, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8600735664367676, "num_tokens": 482146101.0, "step": 12640 }, { "epoch": 1.6080651316626384, "ewc_loss": 0.059390589594841, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002789644931908697, "grad_norm": 7.150096416473389, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8777315616607666, "num_tokens": 482178531.0, "step": 12641 }, { "epoch": 1.608192341941229, "ewc_loss": 0.059321947395801544, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027827805024571717, "grad_norm": 7.060461521148682, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8732536435127258, "num_tokens": 482217019.0, "step": 12642 }, { "epoch": 1.6083195522198195, "ewc_loss": 0.059680063277482986, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002794178144540638, "grad_norm": 7.163964748382568, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8654295206069946, "num_tokens": 482250547.0, "step": 12643 }, { "epoch": 1.60844676249841, "ewc_loss": 0.059326086193323135, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002783194649964571, "grad_norm": 7.1183695793151855, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8595948219299316, "num_tokens": 482287009.0, "step": 12644 }, { "epoch": 1.6085739727770005, "ewc_loss": 0.059482473880052567, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027988333022221923, "grad_norm": 7.113437652587891, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8516687154769897, "num_tokens": 482327373.0, "step": 12645 }, { "epoch": 1.6087011830555908, "ewc_loss": 0.05938739702105522, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027893256628885865, "grad_norm": 7.166317939758301, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8533334136009216, "num_tokens": 482363409.0, "step": 12646 }, { "epoch": 1.6088283933341814, "ewc_loss": 0.05934898555278778, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002785484539344907, "grad_norm": 7.144289970397949, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.853357195854187, "num_tokens": 482396499.0, "step": 12647 }, { "epoch": 1.608955603612772, "ewc_loss": 0.05934188514947891, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027847744058817625, "grad_norm": 7.083057880401611, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8662752509117126, "num_tokens": 482435059.0, "step": 12648 }, { "epoch": 1.6090828138913624, "ewc_loss": 0.05981205403804779, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00027829629834741354, "grad_norm": 7.131087303161621, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8561704158782959, "num_tokens": 482472793.0, "step": 12649 }, { "epoch": 1.609210024169953, "ewc_loss": 0.05954882502555847, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002781054354272783, "grad_norm": 7.087835788726807, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8651769161224365, "num_tokens": 482508857.0, "step": 12650 }, { "epoch": 1.6093372344485435, "ewc_loss": 0.059679724276065826, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002794144384097308, "grad_norm": 7.101749420166016, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8491827249526978, "num_tokens": 482548526.0, "step": 12651 }, { "epoch": 1.6094644447271338, "ewc_loss": 0.0598805733025074, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002789815189316869, "grad_norm": 7.161347389221191, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8561801910400391, "num_tokens": 482586461.0, "step": 12652 }, { "epoch": 1.6095916550057243, "ewc_loss": 0.059894222766160965, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00027911801589652896, "grad_norm": 7.115376949310303, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8563617467880249, "num_tokens": 482624642.0, "step": 12653 }, { "epoch": 1.6097188652843148, "ewc_loss": 0.059893928468227386, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00027911504730582237, "grad_norm": 7.078799247741699, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8626586198806763, "num_tokens": 482664096.0, "step": 12654 }, { "epoch": 1.6098460755629054, "ewc_loss": 0.059942830353975296, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00027960407896898687, "grad_norm": 7.1571197509765625, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8749921321868896, "num_tokens": 482701230.0, "step": 12655 }, { "epoch": 1.609973285841496, "ewc_loss": 0.05981016159057617, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002782774099614471, "grad_norm": 7.112244129180908, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8652553558349609, "num_tokens": 482741611.0, "step": 12656 }, { "epoch": 1.6101004961200864, "ewc_loss": 0.05965668335556984, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027918402338400483, "grad_norm": 7.103867530822754, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8745399713516235, "num_tokens": 482779482.0, "step": 12657 }, { "epoch": 1.610227706398677, "ewc_loss": 0.059534162282943726, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027795881032943726, "grad_norm": 7.108818531036377, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.87320476770401, "num_tokens": 482816279.0, "step": 12658 }, { "epoch": 1.6103549166772675, "ewc_loss": 0.05957305431365967, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002783477248158306, "grad_norm": 7.141263484954834, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8548743724822998, "num_tokens": 482859941.0, "step": 12659 }, { "epoch": 1.610482126955858, "ewc_loss": 0.05957246944308281, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002783418749459088, "grad_norm": 7.141402244567871, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8689686059951782, "num_tokens": 482894348.0, "step": 12660 }, { "epoch": 1.6106093372344485, "ewc_loss": 0.05982680991292, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00027844388387165964, "grad_norm": 7.170010089874268, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.858808696269989, "num_tokens": 482930882.0, "step": 12661 }, { "epoch": 1.610736547513039, "ewc_loss": 0.0595238134264946, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002778553171083331, "grad_norm": 7.1851019859313965, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.846501886844635, "num_tokens": 482969515.0, "step": 12662 }, { "epoch": 1.6108637577916296, "ewc_loss": 0.05959657207131386, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027858291286975145, "grad_norm": 7.373500823974609, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8606181740760803, "num_tokens": 483008074.0, "step": 12663 }, { "epoch": 1.6109909680702201, "ewc_loss": 0.059437818825244904, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027699535712599754, "grad_norm": 7.590918064117432, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8572958707809448, "num_tokens": 483038920.0, "step": 12664 }, { "epoch": 1.6111181783488107, "ewc_loss": 0.059179410338401794, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027441131533123553, "grad_norm": 7.096991062164307, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8737728595733643, "num_tokens": 483075559.0, "step": 12665 }, { "epoch": 1.6112453886274012, "ewc_loss": 0.05963815376162529, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00027655731537379324, "grad_norm": 7.219746112823486, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8674870133399963, "num_tokens": 483114378.0, "step": 12666 }, { "epoch": 1.6113725989059917, "ewc_loss": 0.05910521745681763, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027366934227757156, "grad_norm": 7.193412780761719, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8686310052871704, "num_tokens": 483147911.0, "step": 12667 }, { "epoch": 1.6114998091845822, "ewc_loss": 0.05930660665035248, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027568324003368616, "grad_norm": 7.040407657623291, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8784183859825134, "num_tokens": 483188275.0, "step": 12668 }, { "epoch": 1.6116270194631728, "ewc_loss": 0.059535615146160126, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002779733622446656, "grad_norm": 7.269162654876709, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8757451772689819, "num_tokens": 483230804.0, "step": 12669 }, { "epoch": 1.6117542297417633, "ewc_loss": 0.05918492376804352, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002744664379861206, "grad_norm": 7.045705795288086, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8677124977111816, "num_tokens": 483262368.0, "step": 12670 }, { "epoch": 1.6118814400203536, "ewc_loss": 0.05995183438062668, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00027969412622042, "grad_norm": 7.260982513427734, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.851909339427948, "num_tokens": 483300894.0, "step": 12671 }, { "epoch": 1.6120086502989441, "ewc_loss": 0.059230685234069824, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002749240375123918, "grad_norm": 7.02801513671875, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.870864987373352, "num_tokens": 483337377.0, "step": 12672 }, { "epoch": 1.6121358605775347, "ewc_loss": 0.059998441487550735, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028016019496135414, "grad_norm": 7.332520484924316, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8638334274291992, "num_tokens": 483376572.0, "step": 12673 }, { "epoch": 1.6122630708561252, "ewc_loss": 0.059349872171878815, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027611589757725596, "grad_norm": 7.0991363525390625, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.851766049861908, "num_tokens": 483415584.0, "step": 12674 }, { "epoch": 1.6123902811347157, "ewc_loss": 0.059686727821826935, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002794844622258097, "grad_norm": 7.302207946777344, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.86524498462677, "num_tokens": 483459584.0, "step": 12675 }, { "epoch": 1.6125174914133062, "ewc_loss": 0.05929381772875786, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002755553578026593, "grad_norm": 7.053177833557129, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8581346869468689, "num_tokens": 483504260.0, "step": 12676 }, { "epoch": 1.6126447016918966, "ewc_loss": 0.059568680822849274, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00028074541478417814, "grad_norm": 7.489920616149902, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8672851324081421, "num_tokens": 483547271.0, "step": 12677 }, { "epoch": 1.612771911970487, "ewc_loss": 0.059214454144239426, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002747617254499346, "grad_norm": 7.037493705749512, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8680024147033691, "num_tokens": 483582255.0, "step": 12678 }, { "epoch": 1.6128991222490776, "ewc_loss": 0.05995909124612808, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002822080859914422, "grad_norm": 7.650169849395752, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.859899640083313, "num_tokens": 483622290.0, "step": 12679 }, { "epoch": 1.6130263325276681, "ewc_loss": 0.05919507145881653, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027456789393909276, "grad_norm": 7.0771636962890625, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8762382864952087, "num_tokens": 483657253.0, "step": 12680 }, { "epoch": 1.6131535428062587, "ewc_loss": 0.06000635027885437, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002826807030942291, "grad_norm": 7.65302848815918, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8687264919281006, "num_tokens": 483693062.0, "step": 12681 }, { "epoch": 1.6132807530848492, "ewc_loss": 0.059283651411533356, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002754536981228739, "grad_norm": 6.995487213134766, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8590408563613892, "num_tokens": 483735845.0, "step": 12682 }, { "epoch": 1.6134079633634397, "ewc_loss": 0.06014065444469452, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000284023757558316, "grad_norm": 7.394376754760742, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8676819801330566, "num_tokens": 483783833.0, "step": 12683 }, { "epoch": 1.6135351736420303, "ewc_loss": 0.05931210517883301, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002757382462732494, "grad_norm": 7.155848503112793, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8637413382530212, "num_tokens": 483820175.0, "step": 12684 }, { "epoch": 1.6136623839206208, "ewc_loss": 0.059810787439346313, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002807250421028584, "grad_norm": 7.240612030029297, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8748177289962769, "num_tokens": 483863454.0, "step": 12685 }, { "epoch": 1.6137895941992113, "ewc_loss": 0.05947498604655266, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002773670421447605, "grad_norm": 7.24522590637207, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.884246826171875, "num_tokens": 483900657.0, "step": 12686 }, { "epoch": 1.6139168044778018, "ewc_loss": 0.059500597417354584, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027762315585277975, "grad_norm": 7.207243919372559, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8583812713623047, "num_tokens": 483935912.0, "step": 12687 }, { "epoch": 1.6140440147563924, "ewc_loss": 0.059280749410390854, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002778660855256021, "grad_norm": 7.241085529327393, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8655716776847839, "num_tokens": 483968594.0, "step": 12688 }, { "epoch": 1.614171225034983, "ewc_loss": 0.05940479040145874, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027666508685797453, "grad_norm": 7.103455543518066, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8732233047485352, "num_tokens": 484010866.0, "step": 12689 }, { "epoch": 1.6142984353135734, "ewc_loss": 0.05936852842569351, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027874388615600765, "grad_norm": 7.237245082855225, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8595532178878784, "num_tokens": 484051355.0, "step": 12690 }, { "epoch": 1.614425645592164, "ewc_loss": 0.05905938893556595, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002756524772848934, "grad_norm": 7.159134864807129, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8742607831954956, "num_tokens": 484087049.0, "step": 12691 }, { "epoch": 1.6145528558707545, "ewc_loss": 0.0592929869890213, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027798846713267267, "grad_norm": 7.227745056152344, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8539313673973083, "num_tokens": 484124914.0, "step": 12692 }, { "epoch": 1.614680066149345, "ewc_loss": 0.05911153554916382, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002761739306151867, "grad_norm": 7.185730457305908, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8811229467391968, "num_tokens": 484154524.0, "step": 12693 }, { "epoch": 1.6148072764279355, "ewc_loss": 0.0591927245259285, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002769858401734382, "grad_norm": 7.181896209716797, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8629088401794434, "num_tokens": 484195091.0, "step": 12694 }, { "epoch": 1.6149344867065258, "ewc_loss": 0.059188731014728546, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027694590971805155, "grad_norm": 7.148322105407715, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8664911985397339, "num_tokens": 484236787.0, "step": 12695 }, { "epoch": 1.6150616969851164, "ewc_loss": 0.05913986265659332, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027645722730085254, "grad_norm": 7.194573402404785, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8604992628097534, "num_tokens": 484281748.0, "step": 12696 }, { "epoch": 1.615188907263707, "ewc_loss": 0.05910911411046982, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027614974533207715, "grad_norm": 7.199051856994629, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8721561431884766, "num_tokens": 484321181.0, "step": 12697 }, { "epoch": 1.6153161175422974, "ewc_loss": 0.05945844203233719, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002772016159724444, "grad_norm": 7.272462844848633, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8620539903640747, "num_tokens": 484360476.0, "step": 12698 }, { "epoch": 1.615443327820888, "ewc_loss": 0.05901172012090683, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027517578564584255, "grad_norm": 7.2020583152771, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8652313947677612, "num_tokens": 484395065.0, "step": 12699 }, { "epoch": 1.6155705380994785, "ewc_loss": 0.0591660812497139, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027671942370943725, "grad_norm": 7.196395397186279, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8669716119766235, "num_tokens": 484439309.0, "step": 12700 }, { "epoch": 1.6156977483780688, "ewc_loss": 0.059107035398483276, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002761289360933006, "grad_norm": 7.235133647918701, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8639446496963501, "num_tokens": 484473018.0, "step": 12701 }, { "epoch": 1.6158249586566593, "ewc_loss": 0.05909312516450882, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027598984888754785, "grad_norm": 7.125329971313477, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8714589476585388, "num_tokens": 484518306.0, "step": 12702 }, { "epoch": 1.6159521689352498, "ewc_loss": 0.059147581458091736, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002765344106592238, "grad_norm": 7.1989970207214355, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8513383865356445, "num_tokens": 484556348.0, "step": 12703 }, { "epoch": 1.6160793792138404, "ewc_loss": 0.05910007655620575, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.000276059377938509, "grad_norm": 7.201141357421875, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8753440976142883, "num_tokens": 484593136.0, "step": 12704 }, { "epoch": 1.616206589492431, "ewc_loss": 0.059141606092453, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002764746604952961, "grad_norm": 7.163665771484375, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8742126226425171, "num_tokens": 484628025.0, "step": 12705 }, { "epoch": 1.6163337997710214, "ewc_loss": 0.05941574275493622, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002767746336758137, "grad_norm": 7.223433017730713, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8709859848022461, "num_tokens": 484666703.0, "step": 12706 }, { "epoch": 1.616461010049612, "ewc_loss": 0.05915161222219467, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027657471946440637, "grad_norm": 7.180592060089111, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8739198446273804, "num_tokens": 484703417.0, "step": 12707 }, { "epoch": 1.6165882203282025, "ewc_loss": 0.05920931696891785, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002771517902147025, "grad_norm": 7.130476951599121, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8788753747940063, "num_tokens": 484741830.0, "step": 12708 }, { "epoch": 1.616715430606793, "ewc_loss": 0.05925655737519264, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027762417448684573, "grad_norm": 7.205844879150391, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8645168542861938, "num_tokens": 484780140.0, "step": 12709 }, { "epoch": 1.6168426408853835, "ewc_loss": 0.05931156873703003, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027817426598630846, "grad_norm": 7.1827592849731445, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8622937202453613, "num_tokens": 484813856.0, "step": 12710 }, { "epoch": 1.616969851163974, "ewc_loss": 0.05927644670009613, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002778230409603566, "grad_norm": 7.192307949066162, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8743863105773926, "num_tokens": 484851383.0, "step": 12711 }, { "epoch": 1.6170970614425646, "ewc_loss": 0.059476807713508606, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002773852611426264, "grad_norm": 7.187043190002441, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8736493587493896, "num_tokens": 484884361.0, "step": 12712 }, { "epoch": 1.6172242717211551, "ewc_loss": 0.059286341071128845, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027792202308773994, "grad_norm": 7.2437424659729, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8448035717010498, "num_tokens": 484922106.0, "step": 12713 }, { "epoch": 1.6173514819997457, "ewc_loss": 0.059227705001831055, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027733566821552813, "grad_norm": 7.212435722351074, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8649533987045288, "num_tokens": 484963100.0, "step": 12714 }, { "epoch": 1.6174786922783362, "ewc_loss": 0.059171199798583984, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027677061734721065, "grad_norm": 7.193717956542969, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8757727146148682, "num_tokens": 485006896.0, "step": 12715 }, { "epoch": 1.6176059025569267, "ewc_loss": 0.05920453369617462, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027710391441360116, "grad_norm": 7.128777980804443, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8649832010269165, "num_tokens": 485045968.0, "step": 12716 }, { "epoch": 1.6177331128355172, "ewc_loss": 0.05926567316055298, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027771529858000576, "grad_norm": 7.206508159637451, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8631492853164673, "num_tokens": 485084736.0, "step": 12717 }, { "epoch": 1.6178603231141078, "ewc_loss": 0.0591539666056633, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027659826446324587, "grad_norm": 7.09158182144165, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8717995882034302, "num_tokens": 485123130.0, "step": 12718 }, { "epoch": 1.6179875333926983, "ewc_loss": 0.059515565633773804, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002777728659566492, "grad_norm": 7.151763916015625, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8631857633590698, "num_tokens": 485158072.0, "step": 12719 }, { "epoch": 1.6181147436712886, "ewc_loss": 0.05949266999959946, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027754390612244606, "grad_norm": 7.1581926345825195, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8740607500076294, "num_tokens": 485196783.0, "step": 12720 }, { "epoch": 1.6182419539498791, "ewc_loss": 0.05952734500169754, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027789062005467713, "grad_norm": 7.11940336227417, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8707006573677063, "num_tokens": 485231433.0, "step": 12721 }, { "epoch": 1.6183691642284697, "ewc_loss": 0.05937042832374573, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027876286185346544, "grad_norm": 7.1729021072387695, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8632392883300781, "num_tokens": 485269016.0, "step": 12722 }, { "epoch": 1.6184963745070602, "ewc_loss": 0.05947142839431763, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002773314481601119, "grad_norm": 7.09025239944458, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8539912700653076, "num_tokens": 485308293.0, "step": 12723 }, { "epoch": 1.6186235847856507, "ewc_loss": 0.05942340940237045, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002792926679830998, "grad_norm": 7.1621832847595215, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.87431401014328, "num_tokens": 485340758.0, "step": 12724 }, { "epoch": 1.6187507950642412, "ewc_loss": 0.059532567858695984, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027794286143034697, "grad_norm": 7.083628177642822, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8798103332519531, "num_tokens": 485384047.0, "step": 12725 }, { "epoch": 1.6188780053428315, "ewc_loss": 0.05952844396233559, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00028034302522428334, "grad_norm": 7.193820953369141, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8687753081321716, "num_tokens": 485416909.0, "step": 12726 }, { "epoch": 1.619005215621422, "ewc_loss": 0.059417299926280975, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027923157904297113, "grad_norm": 7.146081447601318, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.864580512046814, "num_tokens": 485449532.0, "step": 12727 }, { "epoch": 1.6191324259000126, "ewc_loss": 0.059644944965839386, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002790666476357728, "grad_norm": 7.131165981292725, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8752315044403076, "num_tokens": 485491171.0, "step": 12728 }, { "epoch": 1.6192596361786031, "ewc_loss": 0.05970287322998047, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027964593027718365, "grad_norm": 7.173046112060547, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8614076375961304, "num_tokens": 485525511.0, "step": 12729 }, { "epoch": 1.6193868464571937, "ewc_loss": 0.05972980707883835, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027991525712423027, "grad_norm": 7.148226261138916, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8620496988296509, "num_tokens": 485562517.0, "step": 12730 }, { "epoch": 1.6195140567357842, "ewc_loss": 0.05969619005918503, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000279579107882455, "grad_norm": 7.124396324157715, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8673000931739807, "num_tokens": 485601058.0, "step": 12731 }, { "epoch": 1.6196412670143747, "ewc_loss": 0.059705235064029694, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002796695625875145, "grad_norm": 7.152543067932129, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8782910108566284, "num_tokens": 485634737.0, "step": 12732 }, { "epoch": 1.6197684772929652, "ewc_loss": 0.05967766046524048, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027939380379393697, "grad_norm": 7.143717288970947, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8681879043579102, "num_tokens": 485671300.0, "step": 12733 }, { "epoch": 1.6198956875715558, "ewc_loss": 0.05968114733695984, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002794286410789937, "grad_norm": 7.119284152984619, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8732829689979553, "num_tokens": 485708712.0, "step": 12734 }, { "epoch": 1.6200228978501463, "ewc_loss": 0.059712573885917664, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002797429042402655, "grad_norm": 7.145205497741699, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.878124475479126, "num_tokens": 485746820.0, "step": 12735 }, { "epoch": 1.6201501081287368, "ewc_loss": 0.059686996042728424, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002794871397782117, "grad_norm": 7.158396244049072, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.866562008857727, "num_tokens": 485785921.0, "step": 12736 }, { "epoch": 1.6202773184073274, "ewc_loss": 0.05968337133526802, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002794509055092931, "grad_norm": 7.154972076416016, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8663797378540039, "num_tokens": 485825766.0, "step": 12737 }, { "epoch": 1.6204045286859179, "ewc_loss": 0.05971625819802284, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027977977879345417, "grad_norm": 7.147154808044434, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.859273374080658, "num_tokens": 485862288.0, "step": 12738 }, { "epoch": 1.6205317389645084, "ewc_loss": 0.05944327265024185, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002794913307297975, "grad_norm": 7.1611328125, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8596770763397217, "num_tokens": 485903582.0, "step": 12739 }, { "epoch": 1.620658949243099, "ewc_loss": 0.05966133624315262, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027923056040890515, "grad_norm": 7.078639984130859, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8623631000518799, "num_tokens": 485950931.0, "step": 12740 }, { "epoch": 1.6207861595216895, "ewc_loss": 0.05977483093738556, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002803655224852264, "grad_norm": 7.167786598205566, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8671326041221619, "num_tokens": 485984185.0, "step": 12741 }, { "epoch": 1.62091336980028, "ewc_loss": 0.05988212674856186, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002789970312733203, "grad_norm": 7.109872817993164, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8710826635360718, "num_tokens": 486018796.0, "step": 12742 }, { "epoch": 1.6210405800788705, "ewc_loss": 0.05981603264808655, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002807775163091719, "grad_norm": 7.222713470458984, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8530482053756714, "num_tokens": 486055328.0, "step": 12743 }, { "epoch": 1.6211677903574608, "ewc_loss": 0.05995743349194527, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00027975012199021876, "grad_norm": 7.154537677764893, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8593685626983643, "num_tokens": 486092340.0, "step": 12744 }, { "epoch": 1.6212950006360514, "ewc_loss": 0.0595264732837677, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00028032332193106413, "grad_norm": 7.182036399841309, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8532544374465942, "num_tokens": 486123195.0, "step": 12745 }, { "epoch": 1.621422210914642, "ewc_loss": 0.05965026468038559, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002791198203340173, "grad_norm": 7.277118682861328, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8758658766746521, "num_tokens": 486167145.0, "step": 12746 }, { "epoch": 1.6215494211932324, "ewc_loss": 0.05937137454748154, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.00027877234970219433, "grad_norm": 7.11686372756958, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8582336902618408, "num_tokens": 486207925.0, "step": 12747 }, { "epoch": 1.621676631471823, "ewc_loss": 0.059704020619392395, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027965736808255315, "grad_norm": 7.149359703063965, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.869670569896698, "num_tokens": 486245446.0, "step": 12748 }, { "epoch": 1.6218038417504135, "ewc_loss": 0.059599995613098145, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027861713897436857, "grad_norm": 7.107720851898193, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8658625483512878, "num_tokens": 486280461.0, "step": 12749 }, { "epoch": 1.6219310520290038, "ewc_loss": 0.06002341955900192, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002804099931381643, "grad_norm": 7.085165500640869, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.853323221206665, "num_tokens": 486323926.0, "step": 12750 }, { "epoch": 1.6220582623075943, "ewc_loss": 0.05981814116239548, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002807985874824226, "grad_norm": 7.179375648498535, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8581998348236084, "num_tokens": 486360016.0, "step": 12751 }, { "epoch": 1.6221854725861848, "ewc_loss": 0.05974014103412628, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002800186048261821, "grad_norm": 7.117020130157471, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.868109941482544, "num_tokens": 486394994.0, "step": 12752 }, { "epoch": 1.6223126828647754, "ewc_loss": 0.05995309352874756, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028214813210070133, "grad_norm": 7.127152919769287, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8642826080322266, "num_tokens": 486436909.0, "step": 12753 }, { "epoch": 1.622439893143366, "ewc_loss": 0.059999167919158936, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028016744181513786, "grad_norm": 7.162067413330078, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8717185258865356, "num_tokens": 486473201.0, "step": 12754 }, { "epoch": 1.6225671034219564, "ewc_loss": 0.05991702526807785, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002817874192260206, "grad_norm": 7.151710510253906, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.845575213432312, "num_tokens": 486510733.0, "step": 12755 }, { "epoch": 1.622694313700547, "ewc_loss": 0.059785809367895126, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028047527302987874, "grad_norm": 7.159759044647217, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8809636831283569, "num_tokens": 486546666.0, "step": 12756 }, { "epoch": 1.6228215239791375, "ewc_loss": 0.060074105858802795, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028091686544939876, "grad_norm": 7.139387130737305, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.846183180809021, "num_tokens": 486585639.0, "step": 12757 }, { "epoch": 1.622948734257728, "ewc_loss": 0.059808943420648575, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002807066193781793, "grad_norm": 7.138726234436035, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8695944547653198, "num_tokens": 486623028.0, "step": 12758 }, { "epoch": 1.6230759445363185, "ewc_loss": 0.05990859866142273, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002817031927406788, "grad_norm": 7.244288921356201, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8600308895111084, "num_tokens": 486655906.0, "step": 12759 }, { "epoch": 1.623203154814909, "ewc_loss": 0.0597514882683754, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028013208066113293, "grad_norm": 7.148909568786621, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8634628057479858, "num_tokens": 486689461.0, "step": 12760 }, { "epoch": 1.6233303650934996, "ewc_loss": 0.059963665902614594, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002822538372129202, "grad_norm": 7.18949556350708, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8485232591629028, "num_tokens": 486728373.0, "step": 12761 }, { "epoch": 1.6234575753720901, "ewc_loss": 0.059766367077827454, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028028085944242775, "grad_norm": 7.062533855438232, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8762140870094299, "num_tokens": 486769955.0, "step": 12762 }, { "epoch": 1.6235847856506807, "ewc_loss": 0.0599299855530262, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028191704768687487, "grad_norm": 7.22161865234375, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8692179322242737, "num_tokens": 486806468.0, "step": 12763 }, { "epoch": 1.6237119959292712, "ewc_loss": 0.05976354330778122, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002802526287268847, "grad_norm": 7.145755767822266, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8518714904785156, "num_tokens": 486846526.0, "step": 12764 }, { "epoch": 1.6238392062078617, "ewc_loss": 0.05984296277165413, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028104681405238807, "grad_norm": 7.172784328460693, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8821265697479248, "num_tokens": 486886323.0, "step": 12765 }, { "epoch": 1.6239664164864522, "ewc_loss": 0.05969785153865814, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002795957261696458, "grad_norm": 7.132202625274658, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8465072512626648, "num_tokens": 486918099.0, "step": 12766 }, { "epoch": 1.6240936267650428, "ewc_loss": 0.059855375438928604, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028117094188928604, "grad_norm": 7.142223834991455, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8788851499557495, "num_tokens": 486953789.0, "step": 12767 }, { "epoch": 1.6242208370436333, "ewc_loss": 0.06008176878094673, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002809934667311609, "grad_norm": 7.137924671173096, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8648250102996826, "num_tokens": 486996165.0, "step": 12768 }, { "epoch": 1.6243480473222236, "ewc_loss": 0.059876590967178345, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002813830797094852, "grad_norm": 7.183222770690918, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8766021132469177, "num_tokens": 487028249.0, "step": 12769 }, { "epoch": 1.6244752576008141, "ewc_loss": 0.059915512800216675, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028177231433801353, "grad_norm": 7.178238391876221, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8672551512718201, "num_tokens": 487070099.0, "step": 12770 }, { "epoch": 1.6246024678794047, "ewc_loss": 0.05977209657430649, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028033816488459706, "grad_norm": 7.127570629119873, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8689538240432739, "num_tokens": 487107117.0, "step": 12771 }, { "epoch": 1.6247296781579952, "ewc_loss": 0.060208141803741455, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028225721325725317, "grad_norm": 7.197865962982178, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8535416126251221, "num_tokens": 487146024.0, "step": 12772 }, { "epoch": 1.6248568884365857, "ewc_loss": 0.06007850915193558, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028096087044104934, "grad_norm": 7.137148380279541, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8517090678215027, "num_tokens": 487189864.0, "step": 12773 }, { "epoch": 1.6249840987151762, "ewc_loss": 0.05991160869598389, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028173328610137105, "grad_norm": 7.181905746459961, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8667498826980591, "num_tokens": 487230307.0, "step": 12774 }, { "epoch": 1.6251113089937665, "ewc_loss": 0.060150593519210815, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002816817432176322, "grad_norm": 7.217991828918457, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8691550493240356, "num_tokens": 487261387.0, "step": 12775 }, { "epoch": 1.625238519272357, "ewc_loss": 0.05989384278655052, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002815556072164327, "grad_norm": 7.18223237991333, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.854489266872406, "num_tokens": 487298611.0, "step": 12776 }, { "epoch": 1.6253657295509476, "ewc_loss": 0.060070913285017014, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028088490944355726, "grad_norm": 7.1346588134765625, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8642648458480835, "num_tokens": 487338172.0, "step": 12777 }, { "epoch": 1.6254929398295381, "ewc_loss": 0.05984998494386673, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028111704159528017, "grad_norm": 7.175037860870361, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8665561676025391, "num_tokens": 487375126.0, "step": 12778 }, { "epoch": 1.6256201501081287, "ewc_loss": 0.05982974171638489, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002809146244544536, "grad_norm": 7.1032891273498535, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8766629099845886, "num_tokens": 487416539.0, "step": 12779 }, { "epoch": 1.6257473603867192, "ewc_loss": 0.059900250285863876, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002816196938510984, "grad_norm": 7.213265419006348, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8664720058441162, "num_tokens": 487448429.0, "step": 12780 }, { "epoch": 1.6258745706653097, "ewc_loss": 0.059826090931892395, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002808780991472304, "grad_norm": 7.151320934295654, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8641977906227112, "num_tokens": 487486054.0, "step": 12781 }, { "epoch": 1.6260017809439002, "ewc_loss": 0.060163162648677826, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002818073844537139, "grad_norm": 7.127821922302246, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8712109327316284, "num_tokens": 487523771.0, "step": 12782 }, { "epoch": 1.6261289912224908, "ewc_loss": 0.05983521044254303, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002809693105518818, "grad_norm": 7.1671223640441895, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8717499375343323, "num_tokens": 487561334.0, "step": 12783 }, { "epoch": 1.6262562015010813, "ewc_loss": 0.05991673842072487, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002817845670506358, "grad_norm": 7.135337829589844, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8661984205245972, "num_tokens": 487606382.0, "step": 12784 }, { "epoch": 1.6263834117796718, "ewc_loss": 0.059879370033741, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028141087386757135, "grad_norm": 7.13952112197876, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8613108992576599, "num_tokens": 487647640.0, "step": 12785 }, { "epoch": 1.6265106220582624, "ewc_loss": 0.0598362535238266, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028097970061935484, "grad_norm": 7.175435543060303, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8623483777046204, "num_tokens": 487682604.0, "step": 12786 }, { "epoch": 1.6266378323368529, "ewc_loss": 0.05993729829788208, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028199018561281264, "grad_norm": 7.177002906799316, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8689255118370056, "num_tokens": 487718271.0, "step": 12787 }, { "epoch": 1.6267650426154434, "ewc_loss": 0.05982619896531105, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002808791759889573, "grad_norm": 7.14957332611084, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.852319598197937, "num_tokens": 487752872.0, "step": 12788 }, { "epoch": 1.626892252894034, "ewc_loss": 0.05983419716358185, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002809591533150524, "grad_norm": 7.167769908905029, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8552944660186768, "num_tokens": 487798309.0, "step": 12789 }, { "epoch": 1.6270194631726245, "ewc_loss": 0.05989689379930496, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002815861371345818, "grad_norm": 7.138609409332275, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8544284105300903, "num_tokens": 487834052.0, "step": 12790 }, { "epoch": 1.627146673451215, "ewc_loss": 0.0598347969353199, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002809651487041265, "grad_norm": 7.158790588378906, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8545833826065063, "num_tokens": 487868662.0, "step": 12791 }, { "epoch": 1.6272738837298055, "ewc_loss": 0.05988110974431038, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002814282779581845, "grad_norm": 7.212218284606934, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8619738221168518, "num_tokens": 487907425.0, "step": 12792 }, { "epoch": 1.6274010940083958, "ewc_loss": 0.05980768799781799, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000280694046523422, "grad_norm": 7.118392467498779, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8739063143730164, "num_tokens": 487942979.0, "step": 12793 }, { "epoch": 1.6275283042869864, "ewc_loss": 0.05985115468502045, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002811287413351238, "grad_norm": 7.163943290710449, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.870705783367157, "num_tokens": 487982402.0, "step": 12794 }, { "epoch": 1.6276555145655769, "ewc_loss": 0.05990743637084961, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002816915512084961, "grad_norm": 7.162107944488525, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8732203245162964, "num_tokens": 488020071.0, "step": 12795 }, { "epoch": 1.6277827248441674, "ewc_loss": 0.05979183688759804, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028053554706275463, "grad_norm": 7.082231521606445, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8511620759963989, "num_tokens": 488057457.0, "step": 12796 }, { "epoch": 1.627909935122758, "ewc_loss": 0.05989281088113785, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028154527535662055, "grad_norm": 7.153082847595215, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8624882698059082, "num_tokens": 488094002.0, "step": 12797 }, { "epoch": 1.6280371454013485, "ewc_loss": 0.059906117618083954, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002816783671732992, "grad_norm": 7.134332180023193, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8503738045692444, "num_tokens": 488134995.0, "step": 12798 }, { "epoch": 1.6281643556799388, "ewc_loss": 0.06022068113088608, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002823825925588608, "grad_norm": 7.149889945983887, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8679597973823547, "num_tokens": 488169942.0, "step": 12799 }, { "epoch": 1.6282915659585293, "ewc_loss": 0.05981418862938881, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028075906448066235, "grad_norm": 7.10369873046875, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8671139478683472, "num_tokens": 488210852.0, "step": 12800 }, { "epoch": 1.6284187762371198, "ewc_loss": 0.05999046936631203, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002825218834914267, "grad_norm": 7.129560470581055, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8684471845626831, "num_tokens": 488253067.0, "step": 12801 }, { "epoch": 1.6285459865157104, "ewc_loss": 0.05985119193792343, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002811291196849197, "grad_norm": 7.166375160217285, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8734467029571533, "num_tokens": 488293498.0, "step": 12802 }, { "epoch": 1.628673196794301, "ewc_loss": 0.059866514056921005, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028128232224844396, "grad_norm": 7.113409042358398, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8676191568374634, "num_tokens": 488333931.0, "step": 12803 }, { "epoch": 1.6288004070728914, "ewc_loss": 0.05996854603290558, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002823026734404266, "grad_norm": 7.188570499420166, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8699865341186523, "num_tokens": 488371724.0, "step": 12804 }, { "epoch": 1.628927617351482, "ewc_loss": 0.059844210743904114, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000281059299595654, "grad_norm": 7.110571384429932, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8601168394088745, "num_tokens": 488415171.0, "step": 12805 }, { "epoch": 1.6290548276300725, "ewc_loss": 0.06001761183142662, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002827933058142662, "grad_norm": 7.198338508605957, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.859781801700592, "num_tokens": 488451672.0, "step": 12806 }, { "epoch": 1.629182037908663, "ewc_loss": 0.05976346135139465, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028025181381963193, "grad_norm": 7.135792255401611, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8802156448364258, "num_tokens": 488492657.0, "step": 12807 }, { "epoch": 1.6293092481872535, "ewc_loss": 0.06026054918766022, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002827812859322876, "grad_norm": 7.191975116729736, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8723021745681763, "num_tokens": 488530681.0, "step": 12808 }, { "epoch": 1.629436458465844, "ewc_loss": 0.059936605393886566, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00027954185497947037, "grad_norm": 7.150580883026123, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8753959536552429, "num_tokens": 488560542.0, "step": 12809 }, { "epoch": 1.6295636687444346, "ewc_loss": 0.06018378585577011, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028201364330016077, "grad_norm": 7.207350730895996, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8673983812332153, "num_tokens": 488597491.0, "step": 12810 }, { "epoch": 1.6296908790230251, "ewc_loss": 0.05974727123975754, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002800899092108011, "grad_norm": 7.174050331115723, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8718901872634888, "num_tokens": 488634556.0, "step": 12811 }, { "epoch": 1.6298180893016156, "ewc_loss": 0.05980169028043747, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002806340635288507, "grad_norm": 7.159069061279297, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8620034456253052, "num_tokens": 488668174.0, "step": 12812 }, { "epoch": 1.6299452995802062, "ewc_loss": 0.05979763716459274, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028059358010068536, "grad_norm": 7.158400058746338, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8692979216575623, "num_tokens": 488705994.0, "step": 12813 }, { "epoch": 1.6300725098587967, "ewc_loss": 0.0598176047205925, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028079323237761855, "grad_norm": 7.265233993530273, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8561573028564453, "num_tokens": 488746908.0, "step": 12814 }, { "epoch": 1.6301997201373872, "ewc_loss": 0.05965689569711685, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002791861188597977, "grad_norm": 7.103670597076416, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8664513230323792, "num_tokens": 488785607.0, "step": 12815 }, { "epoch": 1.6303269304159778, "ewc_loss": 0.05984045937657356, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002810217847581953, "grad_norm": 7.188746452331543, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8636106252670288, "num_tokens": 488823831.0, "step": 12816 }, { "epoch": 1.630454140694568, "ewc_loss": 0.059679578989744186, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00027941298321820796, "grad_norm": 7.134383201599121, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.87176114320755, "num_tokens": 488859944.0, "step": 12817 }, { "epoch": 1.6305813509731586, "ewc_loss": 0.05982059985399246, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028082318021915853, "grad_norm": 7.149230003356934, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8503928184509277, "num_tokens": 488903764.0, "step": 12818 }, { "epoch": 1.6307085612517491, "ewc_loss": 0.05969824641942978, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000279599626082927, "grad_norm": 7.164154052734375, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8548265099525452, "num_tokens": 488946890.0, "step": 12819 }, { "epoch": 1.6308357715303397, "ewc_loss": 0.0597289577126503, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002799067588057369, "grad_norm": 7.236800670623779, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8629214763641357, "num_tokens": 488981901.0, "step": 12820 }, { "epoch": 1.6309629818089302, "ewc_loss": 0.05960367992520332, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002786539844237268, "grad_norm": 7.127984046936035, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8686966896057129, "num_tokens": 489024180.0, "step": 12821 }, { "epoch": 1.6310901920875207, "ewc_loss": 0.05975358188152313, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002801530063152313, "grad_norm": 7.172484397888184, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8680402040481567, "num_tokens": 489070971.0, "step": 12822 }, { "epoch": 1.6312174023661112, "ewc_loss": 0.059673331677913666, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002793505263980478, "grad_norm": 7.093868255615234, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8752979040145874, "num_tokens": 489111636.0, "step": 12823 }, { "epoch": 1.6313446126447015, "ewc_loss": 0.059873923659324646, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002813564206007868, "grad_norm": 7.215572357177734, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8631999492645264, "num_tokens": 489148677.0, "step": 12824 }, { "epoch": 1.631471822923292, "ewc_loss": 0.05959928780794144, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002786100667435676, "grad_norm": 7.126925468444824, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8534424901008606, "num_tokens": 489186222.0, "step": 12825 }, { "epoch": 1.6315990332018826, "ewc_loss": 0.059914182871580124, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002817590138874948, "grad_norm": 7.210207462310791, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.838127076625824, "num_tokens": 489223173.0, "step": 12826 }, { "epoch": 1.6317262434804731, "ewc_loss": 0.05967795476317406, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002793967432808131, "grad_norm": 7.156355381011963, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8652419447898865, "num_tokens": 489259236.0, "step": 12827 }, { "epoch": 1.6318534537590637, "ewc_loss": 0.059862423688173294, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028124143136665225, "grad_norm": 7.174887657165527, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8569468855857849, "num_tokens": 489298449.0, "step": 12828 }, { "epoch": 1.6319806640376542, "ewc_loss": 0.05975043773651123, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002801215450745076, "grad_norm": 7.19304084777832, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8444725275039673, "num_tokens": 489334800.0, "step": 12829 }, { "epoch": 1.6321078743162447, "ewc_loss": 0.05979862064123154, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028060341719537973, "grad_norm": 7.118050575256348, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8717577457427979, "num_tokens": 489378510.0, "step": 12830 }, { "epoch": 1.6322350845948352, "ewc_loss": 0.059950076043605804, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028211792232468724, "grad_norm": 7.17559814453125, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8529207706451416, "num_tokens": 489419267.0, "step": 12831 }, { "epoch": 1.6323622948734258, "ewc_loss": 0.05981931835412979, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002808103454299271, "grad_norm": 7.158566474914551, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8683282732963562, "num_tokens": 489454772.0, "step": 12832 }, { "epoch": 1.6324895051520163, "ewc_loss": 0.059881918132305145, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028143636882305145, "grad_norm": 7.193521499633789, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.858649492263794, "num_tokens": 489490146.0, "step": 12833 }, { "epoch": 1.6326167154306068, "ewc_loss": 0.05982634425163269, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002808806311804801, "grad_norm": 7.126297473907471, "learning_rate": 1e-06, "loss": 0.5505, "mean_token_accuracy": 0.8358381986618042, "num_tokens": 489532754.0, "step": 12834 }, { "epoch": 1.6327439257091974, "ewc_loss": 0.059964679181575775, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028226396534591913, "grad_norm": 7.220171928405762, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8614412546157837, "num_tokens": 489570378.0, "step": 12835 }, { "epoch": 1.6328711359877879, "ewc_loss": 0.05983980745077133, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028101526550017297, "grad_norm": 7.165043830871582, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8569684028625488, "num_tokens": 489604562.0, "step": 12836 }, { "epoch": 1.6329983462663784, "ewc_loss": 0.06019376963376999, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002821134985424578, "grad_norm": 7.1833271980285645, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8717166185379028, "num_tokens": 489644060.0, "step": 12837 }, { "epoch": 1.633125556544969, "ewc_loss": 0.060145579278469086, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002816315973177552, "grad_norm": 7.185791969299316, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8661459684371948, "num_tokens": 489682693.0, "step": 12838 }, { "epoch": 1.6332527668235595, "ewc_loss": 0.060093145817518234, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028110723360441625, "grad_norm": 7.132131576538086, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8729268312454224, "num_tokens": 489720786.0, "step": 12839 }, { "epoch": 1.63337997710215, "ewc_loss": 0.060146067291498184, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002816364576574415, "grad_norm": 7.161842346191406, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8708654642105103, "num_tokens": 489758773.0, "step": 12840 }, { "epoch": 1.6335071873807405, "ewc_loss": 0.06017248332500458, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002819006040226668, "grad_norm": 7.177295684814453, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8661201000213623, "num_tokens": 489793436.0, "step": 12841 }, { "epoch": 1.6336343976593308, "ewc_loss": 0.060139745473861694, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.000281573215033859, "grad_norm": 7.1999711990356445, "learning_rate": 1e-06, "loss": 0.392, "mean_token_accuracy": 0.8835877180099487, "num_tokens": 489828634.0, "step": 12842 }, { "epoch": 1.6337616079379214, "ewc_loss": 0.06017832085490227, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.000281958986306563, "grad_norm": 7.175210952758789, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8680405616760254, "num_tokens": 489866335.0, "step": 12843 }, { "epoch": 1.6338888182165119, "ewc_loss": 0.060139477252960205, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.000281570537481457, "grad_norm": 7.192718029022217, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8730276226997375, "num_tokens": 489903059.0, "step": 12844 }, { "epoch": 1.6340160284951024, "ewc_loss": 0.0600547268986702, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028072306304238737, "grad_norm": 7.120389938354492, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8718470335006714, "num_tokens": 489940186.0, "step": 12845 }, { "epoch": 1.634143238773693, "ewc_loss": 0.060182370245456696, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028199946973472834, "grad_norm": 7.144153594970703, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.876394510269165, "num_tokens": 489984732.0, "step": 12846 }, { "epoch": 1.6342704490522835, "ewc_loss": 0.05977185070514679, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028033569105900824, "grad_norm": 7.169553279876709, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8668504357337952, "num_tokens": 490016699.0, "step": 12847 }, { "epoch": 1.6343976593308738, "ewc_loss": 0.05986722558736801, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028128945268690586, "grad_norm": 7.193063735961914, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8592749834060669, "num_tokens": 490051351.0, "step": 12848 }, { "epoch": 1.6345248696094643, "ewc_loss": 0.059784941375255585, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028046660008840263, "grad_norm": 7.125275135040283, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8736770153045654, "num_tokens": 490089668.0, "step": 12849 }, { "epoch": 1.6346520798880548, "ewc_loss": 0.0599125400185585, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002817425993271172, "grad_norm": 7.183316707611084, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8663840293884277, "num_tokens": 490129651.0, "step": 12850 }, { "epoch": 1.6347792901666454, "ewc_loss": 0.059774160385131836, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028035877039656043, "grad_norm": 7.118018627166748, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8578505516052246, "num_tokens": 490168441.0, "step": 12851 }, { "epoch": 1.6349065004452359, "ewc_loss": 0.05989791452884674, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028159632347524166, "grad_norm": 7.167624473571777, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.87404865026474, "num_tokens": 490204754.0, "step": 12852 }, { "epoch": 1.6350337107238264, "ewc_loss": 0.05982401221990585, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002808572899084538, "grad_norm": 7.1532206535339355, "learning_rate": 1e-06, "loss": 0.4104, "mean_token_accuracy": 0.8793172240257263, "num_tokens": 490244178.0, "step": 12853 }, { "epoch": 1.635160921002417, "ewc_loss": 0.059914566576480865, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028176282648928463, "grad_norm": 7.184586048126221, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8617461323738098, "num_tokens": 490281161.0, "step": 12854 }, { "epoch": 1.6352881312810075, "ewc_loss": 0.05984670668840408, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028108424157835543, "grad_norm": 7.174508094787598, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8665817975997925, "num_tokens": 490314892.0, "step": 12855 }, { "epoch": 1.635415341559598, "ewc_loss": 0.059901684522628784, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028163401293568313, "grad_norm": 7.208032131195068, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8533966541290283, "num_tokens": 490352600.0, "step": 12856 }, { "epoch": 1.6355425518381885, "ewc_loss": 0.059523455798625946, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002802931412588805, "grad_norm": 7.157642841339111, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8764998912811279, "num_tokens": 490384487.0, "step": 12857 }, { "epoch": 1.635669762116779, "ewc_loss": 0.059654876589775085, "ewc_loss_diag": 3.147125244140625e-05, "ewc_loss_parallel": 0.0002816073829308152, "grad_norm": 7.193502426147461, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8446162939071655, "num_tokens": 490422199.0, "step": 12858 }, { "epoch": 1.6357969723953696, "ewc_loss": 0.059852778911590576, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028114498127251863, "grad_norm": 7.203217029571533, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8569918870925903, "num_tokens": 490458058.0, "step": 12859 }, { "epoch": 1.6359241826739601, "ewc_loss": 0.059843823313713074, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028105542878620327, "grad_norm": 7.1458516120910645, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8775969743728638, "num_tokens": 490498447.0, "step": 12860 }, { "epoch": 1.6360513929525506, "ewc_loss": 0.05988169088959694, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028143409872427583, "grad_norm": 7.150460243225098, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8517144322395325, "num_tokens": 490538502.0, "step": 12861 }, { "epoch": 1.6361786032311412, "ewc_loss": 0.05983160436153412, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028093322180211544, "grad_norm": 7.182435512542725, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.869744062423706, "num_tokens": 490575752.0, "step": 12862 }, { "epoch": 1.6363058135097317, "ewc_loss": 0.05985056608915329, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028112283325754106, "grad_norm": 7.150298118591309, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8649389743804932, "num_tokens": 490613502.0, "step": 12863 }, { "epoch": 1.6364330237883222, "ewc_loss": 0.059915486723184586, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002817720524035394, "grad_norm": 7.147685527801514, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8602337837219238, "num_tokens": 490653820.0, "step": 12864 }, { "epoch": 1.6365602340669128, "ewc_loss": 0.059906333684921265, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028168054996058345, "grad_norm": 7.144476413726807, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.850856363773346, "num_tokens": 490693332.0, "step": 12865 }, { "epoch": 1.636687444345503, "ewc_loss": 0.05988719314336777, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028148910496383905, "grad_norm": 7.163442134857178, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8631985187530518, "num_tokens": 490733135.0, "step": 12866 }, { "epoch": 1.6368146546240936, "ewc_loss": 0.05998951569199562, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002825123374350369, "grad_norm": 7.161391735076904, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8686502575874329, "num_tokens": 490775505.0, "step": 12867 }, { "epoch": 1.6369418649026841, "ewc_loss": 0.05992026627063751, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002818198408931494, "grad_norm": 7.15733003616333, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8649115562438965, "num_tokens": 490817783.0, "step": 12868 }, { "epoch": 1.6370690751812746, "ewc_loss": 0.060032643377780914, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028294362709857523, "grad_norm": 7.2249579429626465, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8588660359382629, "num_tokens": 490849464.0, "step": 12869 }, { "epoch": 1.6371962854598652, "ewc_loss": 0.05993073433637619, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002819245564751327, "grad_norm": 7.1241936683654785, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8585585355758667, "num_tokens": 490891925.0, "step": 12870 }, { "epoch": 1.6373234957384557, "ewc_loss": 0.06002642959356308, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028288146131671965, "grad_norm": 7.259515762329102, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8576992750167847, "num_tokens": 490926221.0, "step": 12871 }, { "epoch": 1.6374507060170462, "ewc_loss": 0.05986202880740166, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028123747324571013, "grad_norm": 7.114609241485596, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8570992946624756, "num_tokens": 490964141.0, "step": 12872 }, { "epoch": 1.6375779162956365, "ewc_loss": 0.060014188289642334, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002827590797096491, "grad_norm": 7.297248840332031, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.860242486000061, "num_tokens": 490997917.0, "step": 12873 }, { "epoch": 1.637705126574227, "ewc_loss": 0.05998650938272476, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028004086925648153, "grad_norm": 7.138683795928955, "learning_rate": 1e-06, "loss": 0.4064, "mean_token_accuracy": 0.8792141079902649, "num_tokens": 491029244.0, "step": 12874 }, { "epoch": 1.6378323368528176, "ewc_loss": 0.06002262979745865, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002828434808179736, "grad_norm": 7.17378044128418, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8647274374961853, "num_tokens": 491066591.0, "step": 12875 }, { "epoch": 1.6379595471314081, "ewc_loss": 0.0597858801484108, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028047600062564015, "grad_norm": 7.093104362487793, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8638441562652588, "num_tokens": 491106546.0, "step": 12876 }, { "epoch": 1.6380867574099987, "ewc_loss": 0.0600164569914341, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002827817515935749, "grad_norm": 7.195801734924316, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8576914072036743, "num_tokens": 491145512.0, "step": 12877 }, { "epoch": 1.6382139676885892, "ewc_loss": 0.05986496061086655, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028126678080298007, "grad_norm": 7.098221302032471, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8633235692977905, "num_tokens": 491185624.0, "step": 12878 }, { "epoch": 1.6383411779671797, "ewc_loss": 0.060274384915828705, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002829196455422789, "grad_norm": 7.149359703063965, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8584276437759399, "num_tokens": 491220982.0, "step": 12879 }, { "epoch": 1.6384683882457702, "ewc_loss": 0.06020936369895935, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028226940776221454, "grad_norm": 7.174631118774414, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8654554486274719, "num_tokens": 491262501.0, "step": 12880 }, { "epoch": 1.6385955985243608, "ewc_loss": 0.060244880616664886, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028262456180527806, "grad_norm": 7.168423652648926, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8758527040481567, "num_tokens": 491298078.0, "step": 12881 }, { "epoch": 1.6387228088029513, "ewc_loss": 0.059983935207128525, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028245654539205134, "grad_norm": 7.096896171569824, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8713223338127136, "num_tokens": 491336203.0, "step": 12882 }, { "epoch": 1.6388500190815418, "ewc_loss": 0.06006874889135361, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000283304660115391, "grad_norm": 7.195502758026123, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8532490730285645, "num_tokens": 491374723.0, "step": 12883 }, { "epoch": 1.6389772293601323, "ewc_loss": 0.05994809418916702, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028209813171997666, "grad_norm": 7.238629341125488, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8661079406738281, "num_tokens": 491410546.0, "step": 12884 }, { "epoch": 1.6391044396387229, "ewc_loss": 0.05992632359266281, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002818804350681603, "grad_norm": 7.1631927490234375, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8611800074577332, "num_tokens": 491450642.0, "step": 12885 }, { "epoch": 1.6392316499173134, "ewc_loss": 0.06027943640947342, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002829701406881213, "grad_norm": 7.2403645515441895, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8617347478866577, "num_tokens": 491489893.0, "step": 12886 }, { "epoch": 1.639358860195904, "ewc_loss": 0.060091495513916016, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028109076083637774, "grad_norm": 7.109470844268799, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8539695143699646, "num_tokens": 491535870.0, "step": 12887 }, { "epoch": 1.6394860704744945, "ewc_loss": 0.060257688164711, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028275264776311815, "grad_norm": 7.196826457977295, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8633556365966797, "num_tokens": 491575988.0, "step": 12888 }, { "epoch": 1.639613280753085, "ewc_loss": 0.05980205535888672, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002806377597153187, "grad_norm": 7.114938259124756, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8669521808624268, "num_tokens": 491616822.0, "step": 12889 }, { "epoch": 1.6397404910316755, "ewc_loss": 0.06027033179998398, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002828790748026222, "grad_norm": 7.174030303955078, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8843640685081482, "num_tokens": 491654132.0, "step": 12890 }, { "epoch": 1.6398677013102658, "ewc_loss": 0.0601382777094841, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002815585467033088, "grad_norm": 7.151003360748291, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8518241047859192, "num_tokens": 491697215.0, "step": 12891 }, { "epoch": 1.6399949115888564, "ewc_loss": 0.06005186587572098, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002831358287949115, "grad_norm": 7.223111152648926, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.843898355960846, "num_tokens": 491731338.0, "step": 12892 }, { "epoch": 1.6401221218674469, "ewc_loss": 0.060173407196998596, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002819098299369216, "grad_norm": 7.1377949714660645, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.862684965133667, "num_tokens": 491770808.0, "step": 12893 }, { "epoch": 1.6402493321460374, "ewc_loss": 0.05999280512332916, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000282545224763453, "grad_norm": 7.234046936035156, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8719758987426758, "num_tokens": 491804167.0, "step": 12894 }, { "epoch": 1.640376542424628, "ewc_loss": 0.05989949032664299, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002816120977513492, "grad_norm": 7.151793003082275, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8659244775772095, "num_tokens": 491840094.0, "step": 12895 }, { "epoch": 1.6405037527032185, "ewc_loss": 0.06026224046945572, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002827981661539525, "grad_norm": 7.183567047119141, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8507916927337646, "num_tokens": 491881643.0, "step": 12896 }, { "epoch": 1.6406309629818088, "ewc_loss": 0.06007564812898636, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028093226137571037, "grad_norm": 7.184045791625977, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8538217544555664, "num_tokens": 491920951.0, "step": 12897 }, { "epoch": 1.6407581732603993, "ewc_loss": 0.06025223061442375, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028269807808101177, "grad_norm": 7.209181785583496, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8598459362983704, "num_tokens": 491954599.0, "step": 12898 }, { "epoch": 1.6408853835389898, "ewc_loss": 0.06014039367437363, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002815797342918813, "grad_norm": 7.124640941619873, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8659805655479431, "num_tokens": 491995375.0, "step": 12899 }, { "epoch": 1.6410125938175804, "ewc_loss": 0.060273315757513046, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002829089353326708, "grad_norm": 7.191339492797852, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.859500527381897, "num_tokens": 492035297.0, "step": 12900 }, { "epoch": 1.6411398040961709, "ewc_loss": 0.06016824394464493, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002818582288455218, "grad_norm": 7.187699317932129, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8642215728759766, "num_tokens": 492071003.0, "step": 12901 }, { "epoch": 1.6412670143747614, "ewc_loss": 0.06016620993614197, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028183788526803255, "grad_norm": 7.154397487640381, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8629693388938904, "num_tokens": 492106046.0, "step": 12902 }, { "epoch": 1.641394224653352, "ewc_loss": 0.059993185102939606, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028254903736524284, "grad_norm": 7.163233280181885, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8759596347808838, "num_tokens": 492147252.0, "step": 12903 }, { "epoch": 1.6415214349319425, "ewc_loss": 0.059918344020843506, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028180063236504793, "grad_norm": 7.1400532722473145, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8547278642654419, "num_tokens": 492188185.0, "step": 12904 }, { "epoch": 1.641648645210533, "ewc_loss": 0.06006808578968048, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028329805354587734, "grad_norm": 7.197918891906738, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8692018985748291, "num_tokens": 492219630.0, "step": 12905 }, { "epoch": 1.6417758554891235, "ewc_loss": 0.05988456308841705, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028146279510110617, "grad_norm": 7.17198371887207, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8663234710693359, "num_tokens": 492251955.0, "step": 12906 }, { "epoch": 1.641903065767714, "ewc_loss": 0.06025177240371704, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028269350877963006, "grad_norm": 7.193150520324707, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8674827814102173, "num_tokens": 492293484.0, "step": 12907 }, { "epoch": 1.6420302760463046, "ewc_loss": 0.06013557314872742, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00028153153834864497, "grad_norm": 7.147552013397217, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.850638747215271, "num_tokens": 492331169.0, "step": 12908 }, { "epoch": 1.642157486324895, "ewc_loss": 0.06001847982406616, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002828019787557423, "grad_norm": 7.1936869621276855, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.870948076248169, "num_tokens": 492366071.0, "step": 12909 }, { "epoch": 1.6422846966034856, "ewc_loss": 0.05994807928800583, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002820979570969939, "grad_norm": 7.175231456756592, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8839574456214905, "num_tokens": 492396820.0, "step": 12910 }, { "epoch": 1.6424119068820762, "ewc_loss": 0.05996061861515045, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028222339460626245, "grad_norm": 7.191064834594727, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8715086579322815, "num_tokens": 492435282.0, "step": 12911 }, { "epoch": 1.6425391171606667, "ewc_loss": 0.06001822277903557, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002827994176186621, "grad_norm": 7.16223669052124, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.872153639793396, "num_tokens": 492470195.0, "step": 12912 }, { "epoch": 1.6426663274392572, "ewc_loss": 0.059968993067741394, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000282307097222656, "grad_norm": 7.1618170738220215, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.857609748840332, "num_tokens": 492514265.0, "step": 12913 }, { "epoch": 1.6427935377178478, "ewc_loss": 0.059979889541864395, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002824160910677165, "grad_norm": 7.15718936920166, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8656954765319824, "num_tokens": 492550348.0, "step": 12914 }, { "epoch": 1.642920747996438, "ewc_loss": 0.06001248210668564, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000282741995761171, "grad_norm": 7.1732177734375, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.86768639087677, "num_tokens": 492590648.0, "step": 12915 }, { "epoch": 1.6430479582750286, "ewc_loss": 0.06044166535139084, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028215101337991655, "grad_norm": 7.189186096191406, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8674167394638062, "num_tokens": 492629523.0, "step": 12916 }, { "epoch": 1.6431751685536191, "ewc_loss": 0.05997128039598465, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028233000193722546, "grad_norm": 7.153267860412598, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8597904443740845, "num_tokens": 492668392.0, "step": 12917 }, { "epoch": 1.6433023788322096, "ewc_loss": 0.060493648052215576, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028267083689570427, "grad_norm": 7.27107572555542, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8545267581939697, "num_tokens": 492696073.0, "step": 12918 }, { "epoch": 1.6434295891108002, "ewc_loss": 0.05983060225844383, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002809232100844383, "grad_norm": 7.152599811553955, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8666194081306458, "num_tokens": 492728590.0, "step": 12919 }, { "epoch": 1.6435567993893907, "ewc_loss": 0.05997317284345627, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002823488903231919, "grad_norm": 7.194288730621338, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8574799299240112, "num_tokens": 492767290.0, "step": 12920 }, { "epoch": 1.6436840096679812, "ewc_loss": 0.060359805822372437, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028133240994066, "grad_norm": 7.11447286605835, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8751635551452637, "num_tokens": 492801019.0, "step": 12921 }, { "epoch": 1.6438112199465715, "ewc_loss": 0.06047067791223526, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002824411785695702, "grad_norm": 7.187073230743408, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8633773326873779, "num_tokens": 492841220.0, "step": 12922 }, { "epoch": 1.643938430225162, "ewc_loss": 0.06039004027843475, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028163479873910546, "grad_norm": 7.063014030456543, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8536943197250366, "num_tokens": 492885076.0, "step": 12923 }, { "epoch": 1.6440656405037526, "ewc_loss": 0.060601379722356796, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002837481733877212, "grad_norm": 7.226531505584717, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8586249947547913, "num_tokens": 492918554.0, "step": 12924 }, { "epoch": 1.6441928507823431, "ewc_loss": 0.06041402369737625, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002818745851982385, "grad_norm": 7.120946884155273, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8805403709411621, "num_tokens": 492961890.0, "step": 12925 }, { "epoch": 1.6443200610609336, "ewc_loss": 0.06060800701379776, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002838144719135016, "grad_norm": 7.218885898590088, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8745471835136414, "num_tokens": 492997768.0, "step": 12926 }, { "epoch": 1.6444472713395242, "ewc_loss": 0.06040231138467789, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002817574713844806, "grad_norm": 7.120722770690918, "learning_rate": 1e-06, "loss": 0.381, "mean_token_accuracy": 0.8892907500267029, "num_tokens": 493037017.0, "step": 12927 }, { "epoch": 1.6445744816181147, "ewc_loss": 0.0605350136756897, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002830845187418163, "grad_norm": 7.180636405944824, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8745926022529602, "num_tokens": 493074579.0, "step": 12928 }, { "epoch": 1.6447016918967052, "ewc_loss": 0.06047014147043228, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028243576525710523, "grad_norm": 7.198663234710693, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8669766187667847, "num_tokens": 493111249.0, "step": 12929 }, { "epoch": 1.6448289021752958, "ewc_loss": 0.06044451892375946, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002821795642375946, "grad_norm": 7.171830654144287, "learning_rate": 1e-06, "loss": 0.5085, "mean_token_accuracy": 0.8480573892593384, "num_tokens": 493150723.0, "step": 12930 }, { "epoch": 1.6449561124538863, "ewc_loss": 0.06046430766582489, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002823774702847004, "grad_norm": 7.146139144897461, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8615499138832092, "num_tokens": 493189530.0, "step": 12931 }, { "epoch": 1.6450833227324768, "ewc_loss": 0.0605488196015358, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028322255820967257, "grad_norm": 7.159860134124756, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8718420267105103, "num_tokens": 493231369.0, "step": 12932 }, { "epoch": 1.6452105330110673, "ewc_loss": 0.06051444634795189, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028287884197197855, "grad_norm": 7.126819610595703, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8517512679100037, "num_tokens": 493278430.0, "step": 12933 }, { "epoch": 1.6453377432896579, "ewc_loss": 0.06062241271138191, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000283958506770432, "grad_norm": 7.219850540161133, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8699617385864258, "num_tokens": 493312258.0, "step": 12934 }, { "epoch": 1.6454649535682484, "ewc_loss": 0.06053999066352844, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028313425718806684, "grad_norm": 7.169832229614258, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8626868724822998, "num_tokens": 493347725.0, "step": 12935 }, { "epoch": 1.645592163846839, "ewc_loss": 0.0600743442773819, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028336065588518977, "grad_norm": 7.139476776123047, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8682441115379333, "num_tokens": 493388119.0, "step": 12936 }, { "epoch": 1.6457193741254295, "ewc_loss": 0.06005418673157692, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000283159053651616, "grad_norm": 7.205113410949707, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8656737804412842, "num_tokens": 493424607.0, "step": 12937 }, { "epoch": 1.64584658440402, "ewc_loss": 0.06040921062231064, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002818264765664935, "grad_norm": 7.166297912597656, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.842525839805603, "num_tokens": 493459546.0, "step": 12938 }, { "epoch": 1.6459737946826105, "ewc_loss": 0.06056174635887146, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002833518374245614, "grad_norm": 7.200393199920654, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8688737154006958, "num_tokens": 493494162.0, "step": 12939 }, { "epoch": 1.6461010049612008, "ewc_loss": 0.059884585440158844, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002814630279317498, "grad_norm": 7.136061191558838, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8620874881744385, "num_tokens": 493537523.0, "step": 12940 }, { "epoch": 1.6462282152397913, "ewc_loss": 0.06050831824541092, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002828175784088671, "grad_norm": 7.154095649719238, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8634138703346252, "num_tokens": 493580065.0, "step": 12941 }, { "epoch": 1.6463554255183819, "ewc_loss": 0.060051508247852325, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002831322490237653, "grad_norm": 7.122326850891113, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8721747398376465, "num_tokens": 493626717.0, "step": 12942 }, { "epoch": 1.6464826357969724, "ewc_loss": 0.0605916790664196, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002836511703208089, "grad_norm": 7.205814361572266, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.870887279510498, "num_tokens": 493666266.0, "step": 12943 }, { "epoch": 1.646609846075563, "ewc_loss": 0.06049077957868576, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002826421696227044, "grad_norm": 7.129637718200684, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8540335893630981, "num_tokens": 493703184.0, "step": 12944 }, { "epoch": 1.6467370563541535, "ewc_loss": 0.060703471302986145, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028476910665631294, "grad_norm": 7.177568435668945, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8796442747116089, "num_tokens": 493744061.0, "step": 12945 }, { "epoch": 1.6468642666327438, "ewc_loss": 0.060625411570072174, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028398848371580243, "grad_norm": 7.166998863220215, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8639972805976868, "num_tokens": 493783585.0, "step": 12946 }, { "epoch": 1.6469914769113343, "ewc_loss": 0.06060720607638359, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028380643925629556, "grad_norm": 7.134706020355225, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.870367705821991, "num_tokens": 493827781.0, "step": 12947 }, { "epoch": 1.6471186871899248, "ewc_loss": 0.060669735074043274, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002844317350536585, "grad_norm": 7.246346950531006, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8689888119697571, "num_tokens": 493864418.0, "step": 12948 }, { "epoch": 1.6472458974685154, "ewc_loss": 0.06050164997577667, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002827508724294603, "grad_norm": 7.152551174163818, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8633639812469482, "num_tokens": 493895226.0, "step": 12949 }, { "epoch": 1.6473731077471059, "ewc_loss": 0.06067126989364624, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002844470727723092, "grad_norm": 7.201719284057617, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8560359477996826, "num_tokens": 493935210.0, "step": 12950 }, { "epoch": 1.6475003180256964, "ewc_loss": 0.06052550673484802, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028298943652771413, "grad_norm": 7.164320945739746, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8639481067657471, "num_tokens": 493966368.0, "step": 12951 }, { "epoch": 1.647627528304287, "ewc_loss": 0.06068585440516472, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002845929120667279, "grad_norm": 7.182512283325195, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8576133251190186, "num_tokens": 494004304.0, "step": 12952 }, { "epoch": 1.6477547385828775, "ewc_loss": 0.06057452782988548, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028347966144792736, "grad_norm": 7.156771183013916, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8719239234924316, "num_tokens": 494044758.0, "step": 12953 }, { "epoch": 1.647881948861468, "ewc_loss": 0.06070605665445328, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002847949508577585, "grad_norm": 7.2145304679870605, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.847960889339447, "num_tokens": 494080624.0, "step": 12954 }, { "epoch": 1.6480091591400585, "ewc_loss": 0.06057664752006531, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028350084903649986, "grad_norm": 7.181599140167236, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8706272840499878, "num_tokens": 494116215.0, "step": 12955 }, { "epoch": 1.648136369418649, "ewc_loss": 0.06064082682132721, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002841426176019013, "grad_norm": 7.224569320678711, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8634928464889526, "num_tokens": 494150317.0, "step": 12956 }, { "epoch": 1.6482635796972396, "ewc_loss": 0.06051630154252052, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002828973811119795, "grad_norm": 7.1681342124938965, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8779218196868896, "num_tokens": 494187273.0, "step": 12957 }, { "epoch": 1.64839078997583, "ewc_loss": 0.06065957248210907, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002843300753738731, "grad_norm": 7.198010444641113, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8616334199905396, "num_tokens": 494226921.0, "step": 12958 }, { "epoch": 1.6485180002544206, "ewc_loss": 0.0605025589466095, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028275998192839324, "grad_norm": 7.177391529083252, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8709622025489807, "num_tokens": 494263141.0, "step": 12959 }, { "epoch": 1.6486452105330112, "ewc_loss": 0.06047378480434418, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002824722323566675, "grad_norm": 7.139285087585449, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8597793579101562, "num_tokens": 494307925.0, "step": 12960 }, { "epoch": 1.6487724208116017, "ewc_loss": 0.060565561056137085, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002833899634424597, "grad_norm": 7.123892307281494, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8584490418434143, "num_tokens": 494353428.0, "step": 12961 }, { "epoch": 1.6488996310901922, "ewc_loss": 0.06066741421818733, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000284408510196954, "grad_norm": 7.214488983154297, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8528293967247009, "num_tokens": 494383559.0, "step": 12962 }, { "epoch": 1.6490268413687827, "ewc_loss": 0.06056560203433037, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028339039999991655, "grad_norm": 7.175961017608643, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8665043711662292, "num_tokens": 494419564.0, "step": 12963 }, { "epoch": 1.649154051647373, "ewc_loss": 0.060665324330329895, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002843876136466861, "grad_norm": 7.207174301147461, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8677000999450684, "num_tokens": 494466886.0, "step": 12964 }, { "epoch": 1.6492812619259636, "ewc_loss": 0.06056587025523186, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002833930775523186, "grad_norm": 7.209137916564941, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8469265699386597, "num_tokens": 494506672.0, "step": 12965 }, { "epoch": 1.649408472204554, "ewc_loss": 0.06005513668060303, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028316857060417533, "grad_norm": 7.2236199378967285, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.865033745765686, "num_tokens": 494541881.0, "step": 12966 }, { "epoch": 1.6495356824831446, "ewc_loss": 0.06009644269943237, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002835816121660173, "grad_norm": 7.122807025909424, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8701006174087524, "num_tokens": 494583546.0, "step": 12967 }, { "epoch": 1.6496628927617352, "ewc_loss": 0.060176171362400055, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002843789116013795, "grad_norm": 7.2736592292785645, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8536933064460754, "num_tokens": 494616625.0, "step": 12968 }, { "epoch": 1.6497901030403257, "ewc_loss": 0.06031300872564316, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028086447855457664, "grad_norm": 7.1173930168151855, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8788466453552246, "num_tokens": 494654590.0, "step": 12969 }, { "epoch": 1.6499173133189162, "ewc_loss": 0.06073373556137085, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028507172828540206, "grad_norm": 7.216517448425293, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8793779015541077, "num_tokens": 494692471.0, "step": 12970 }, { "epoch": 1.6500445235975065, "ewc_loss": 0.06040094047784805, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028174379258416593, "grad_norm": 7.1508097648620605, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8567036986351013, "num_tokens": 494730347.0, "step": 12971 }, { "epoch": 1.650171733876097, "ewc_loss": 0.0605749636888504, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002834839979186654, "grad_norm": 7.2006144523620605, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8521828651428223, "num_tokens": 494763753.0, "step": 12972 }, { "epoch": 1.6502989441546876, "ewc_loss": 0.06048927828669548, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002826271520461887, "grad_norm": 7.187722206115723, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8593212962150574, "num_tokens": 494800416.0, "step": 12973 }, { "epoch": 1.6504261544332781, "ewc_loss": 0.06062862649559975, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002840206434484571, "grad_norm": 7.149097442626953, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8624786734580994, "num_tokens": 494842733.0, "step": 12974 }, { "epoch": 1.6505533647118686, "ewc_loss": 0.06048166751861572, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002825510746333748, "grad_norm": 7.176087379455566, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8566291332244873, "num_tokens": 494878661.0, "step": 12975 }, { "epoch": 1.6506805749904592, "ewc_loss": 0.060588814318180084, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028362253215163946, "grad_norm": 7.1425700187683105, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8569707870483398, "num_tokens": 494918142.0, "step": 12976 }, { "epoch": 1.6508077852690497, "ewc_loss": 0.06058402359485626, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002835745981428772, "grad_norm": 7.193519115447998, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8631149530410767, "num_tokens": 494956947.0, "step": 12977 }, { "epoch": 1.6509349955476402, "ewc_loss": 0.060646504163742065, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028419942827895284, "grad_norm": 7.209839344024658, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8586660623550415, "num_tokens": 494989850.0, "step": 12978 }, { "epoch": 1.6510622058262308, "ewc_loss": 0.060474127531051636, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028247563750483096, "grad_norm": 7.208926200866699, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8566262125968933, "num_tokens": 495026770.0, "step": 12979 }, { "epoch": 1.6511894161048213, "ewc_loss": 0.06006743386387825, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028329153428785503, "grad_norm": 7.196454048156738, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8766598701477051, "num_tokens": 495065009.0, "step": 12980 }, { "epoch": 1.6513166263834118, "ewc_loss": 0.06002727895975113, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.000282889959635213, "grad_norm": 7.172854900360107, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8652749061584473, "num_tokens": 495096914.0, "step": 12981 }, { "epoch": 1.6514438366620023, "ewc_loss": 0.06002986803650856, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002829158620443195, "grad_norm": 7.148548126220703, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8684470057487488, "num_tokens": 495138348.0, "step": 12982 }, { "epoch": 1.6515710469405929, "ewc_loss": 0.06006281077861786, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002832452883012593, "grad_norm": 7.201234817504883, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8591369986534119, "num_tokens": 495177026.0, "step": 12983 }, { "epoch": 1.6516982572191834, "ewc_loss": 0.06006583198904991, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028327549807727337, "grad_norm": 7.212252140045166, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.865151584148407, "num_tokens": 495213850.0, "step": 12984 }, { "epoch": 1.651825467497774, "ewc_loss": 0.05996667593717575, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002822839596774429, "grad_norm": 7.128641605377197, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8738097548484802, "num_tokens": 495257213.0, "step": 12985 }, { "epoch": 1.6519526777763645, "ewc_loss": 0.06006759777665138, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002832931641023606, "grad_norm": 7.138297080993652, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8546782732009888, "num_tokens": 495300018.0, "step": 12986 }, { "epoch": 1.652079888054955, "ewc_loss": 0.06002575159072876, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002828747092280537, "grad_norm": 7.216518402099609, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8640173673629761, "num_tokens": 495336471.0, "step": 12987 }, { "epoch": 1.6522070983335455, "ewc_loss": 0.06045261770486832, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028226053109392524, "grad_norm": 7.187326431274414, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8681121468544006, "num_tokens": 495374680.0, "step": 12988 }, { "epoch": 1.6523343086121358, "ewc_loss": 0.06003154069185257, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028293259674683213, "grad_norm": 7.122261047363281, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8666903972625732, "num_tokens": 495414096.0, "step": 12989 }, { "epoch": 1.6524615188907263, "ewc_loss": 0.060522422194480896, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028295861557126045, "grad_norm": 7.201160907745361, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8536331653594971, "num_tokens": 495452955.0, "step": 12990 }, { "epoch": 1.6525887291693169, "ewc_loss": 0.060470979660749435, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028244417626410723, "grad_norm": 7.153650283813477, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.861850917339325, "num_tokens": 495490091.0, "step": 12991 }, { "epoch": 1.6527159394479074, "ewc_loss": 0.06013293191790581, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002839465159922838, "grad_norm": 7.2274861335754395, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.871337890625, "num_tokens": 495524543.0, "step": 12992 }, { "epoch": 1.652843149726498, "ewc_loss": 0.06047050654888153, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028243946144357324, "grad_norm": 7.16755485534668, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8555428385734558, "num_tokens": 495562068.0, "step": 12993 }, { "epoch": 1.6529703600050885, "ewc_loss": 0.060620445758104324, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028393883258104324, "grad_norm": 7.219655513763428, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.873337984085083, "num_tokens": 495598355.0, "step": 12994 }, { "epoch": 1.6530975702836788, "ewc_loss": 0.060493145138025284, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002826658310368657, "grad_norm": 9.849578857421875, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8506155014038086, "num_tokens": 495634735.0, "step": 12995 }, { "epoch": 1.6532247805622693, "ewc_loss": 0.06241864711046219, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0003019208670593798, "grad_norm": 7.2667741775512695, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8651459217071533, "num_tokens": 495668804.0, "step": 12996 }, { "epoch": 1.6533519908408598, "ewc_loss": 0.061175085604190826, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028948523686267436, "grad_norm": 7.437840461730957, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8684832453727722, "num_tokens": 495707129.0, "step": 12997 }, { "epoch": 1.6534792011194503, "ewc_loss": 0.0600186251103878, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028280343394726515, "grad_norm": 7.17933464050293, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8441842794418335, "num_tokens": 495746501.0, "step": 12998 }, { "epoch": 1.6536064113980409, "ewc_loss": 0.06091546267271042, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00029177183751016855, "grad_norm": 7.394534111022949, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8808143734931946, "num_tokens": 495789723.0, "step": 12999 }, { "epoch": 1.6537336216766314, "ewc_loss": 0.060085490345954895, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028347206534817815, "grad_norm": 7.272634506225586, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.855765700340271, "num_tokens": 495827073.0, "step": 13000 }, { "epoch": 1.653860831955222, "ewc_loss": 0.06073013320565224, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002850356977432966, "grad_norm": 7.24392032623291, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8532583117485046, "num_tokens": 495862362.0, "step": 13001 }, { "epoch": 1.6539880422338125, "ewc_loss": 0.06030524522066116, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028566966648213565, "grad_norm": 7.308064937591553, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8671766519546509, "num_tokens": 495905546.0, "step": 13002 }, { "epoch": 1.654115252512403, "ewc_loss": 0.060557182878255844, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002833062026184052, "grad_norm": 7.231625556945801, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8617074489593506, "num_tokens": 495942291.0, "step": 13003 }, { "epoch": 1.6542424627909935, "ewc_loss": 0.06064603105187416, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002841946843545884, "grad_norm": 7.251038074493408, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8537874817848206, "num_tokens": 495978631.0, "step": 13004 }, { "epoch": 1.654369673069584, "ewc_loss": 0.0604979507625103, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002827138814609498, "grad_norm": 7.242362976074219, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8586341142654419, "num_tokens": 496016626.0, "step": 13005 }, { "epoch": 1.6544968833481746, "ewc_loss": 0.06051526218652725, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028288699104450643, "grad_norm": 7.236490726470947, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8551418781280518, "num_tokens": 496051949.0, "step": 13006 }, { "epoch": 1.654624093626765, "ewc_loss": 0.06052738428115845, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002830082376021892, "grad_norm": 7.16927433013916, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8710935115814209, "num_tokens": 496092693.0, "step": 13007 }, { "epoch": 1.6547513039053556, "ewc_loss": 0.060492970049381256, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028266405570320785, "grad_norm": 7.25536584854126, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8591684103012085, "num_tokens": 496132853.0, "step": 13008 }, { "epoch": 1.6548785141839462, "ewc_loss": 0.06035834178328514, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002813177998177707, "grad_norm": 7.186644554138184, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8495160937309265, "num_tokens": 496168303.0, "step": 13009 }, { "epoch": 1.6550057244625367, "ewc_loss": 0.06054242327809334, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002831586170941591, "grad_norm": 7.203001976013184, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.851992130279541, "num_tokens": 496206132.0, "step": 13010 }, { "epoch": 1.6551329347411272, "ewc_loss": 0.060157839208841324, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.000281754182651639, "grad_norm": 7.209163665771484, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8574073910713196, "num_tokens": 496247542.0, "step": 13011 }, { "epoch": 1.6552601450197177, "ewc_loss": 0.06044386327266693, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002821729867719114, "grad_norm": 7.146604061126709, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8748033046722412, "num_tokens": 496285698.0, "step": 13012 }, { "epoch": 1.655387355298308, "ewc_loss": 0.06043507531285286, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002820851223077625, "grad_norm": 7.191938400268555, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8659322261810303, "num_tokens": 496322216.0, "step": 13013 }, { "epoch": 1.6555145655768986, "ewc_loss": 0.06043405085802078, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002820749068632722, "grad_norm": 7.1507415771484375, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8613702058792114, "num_tokens": 496357687.0, "step": 13014 }, { "epoch": 1.655641775855489, "ewc_loss": 0.06054677814245224, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002832021564245224, "grad_norm": 7.1629533767700195, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.874874472618103, "num_tokens": 496400875.0, "step": 13015 }, { "epoch": 1.6557689861340796, "ewc_loss": 0.060476623475551605, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028250060859136283, "grad_norm": 7.215658664703369, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8661631941795349, "num_tokens": 496432076.0, "step": 13016 }, { "epoch": 1.6558961964126702, "ewc_loss": 0.060504257678985596, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028277697856538, "grad_norm": 7.1915483474731445, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8618680238723755, "num_tokens": 496466852.0, "step": 13017 }, { "epoch": 1.6560234066912607, "ewc_loss": 0.06056886166334152, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002834229962900281, "grad_norm": 7.2019805908203125, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8625519275665283, "num_tokens": 496508890.0, "step": 13018 }, { "epoch": 1.6561506169698512, "ewc_loss": 0.060822196304798126, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002835149352904409, "grad_norm": 7.232933521270752, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8788845539093018, "num_tokens": 496537964.0, "step": 13019 }, { "epoch": 1.6562778272484415, "ewc_loss": 0.06048348918557167, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028256926452741027, "grad_norm": 7.227092266082764, "learning_rate": 1e-06, "loss": 0.523, "mean_token_accuracy": 0.8467212915420532, "num_tokens": 496571263.0, "step": 13020 }, { "epoch": 1.656405037527032, "ewc_loss": 0.06052964925765991, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028303085127845407, "grad_norm": 7.233530044555664, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8714537024497986, "num_tokens": 496610058.0, "step": 13021 }, { "epoch": 1.6565322478056226, "ewc_loss": 0.0604177862405777, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002819122455548495, "grad_norm": 7.227489471435547, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8574901819229126, "num_tokens": 496640542.0, "step": 13022 }, { "epoch": 1.656659458084213, "ewc_loss": 0.06044527888298035, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002821871603373438, "grad_norm": 7.205329418182373, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8610894680023193, "num_tokens": 496678030.0, "step": 13023 }, { "epoch": 1.6567866683628036, "ewc_loss": 0.060462746769189835, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028236184152774513, "grad_norm": 7.16218900680542, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8600048422813416, "num_tokens": 496714057.0, "step": 13024 }, { "epoch": 1.6569138786413942, "ewc_loss": 0.06047462671995163, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002824806433636695, "grad_norm": 7.225357532501221, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8798919916152954, "num_tokens": 496744501.0, "step": 13025 }, { "epoch": 1.6570410889199847, "ewc_loss": 0.06047099828720093, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028244435088708997, "grad_norm": 7.164458751678467, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8688546419143677, "num_tokens": 496780233.0, "step": 13026 }, { "epoch": 1.6571682991985752, "ewc_loss": 0.06070423498749733, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028233532793819904, "grad_norm": 7.244754791259766, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8661327958106995, "num_tokens": 496809645.0, "step": 13027 }, { "epoch": 1.6572955094771658, "ewc_loss": 0.06060417369008064, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002813347091432661, "grad_norm": 7.104615211486816, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8642802834510803, "num_tokens": 496854871.0, "step": 13028 }, { "epoch": 1.6574227197557563, "ewc_loss": 0.06085503101348877, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028384325560182333, "grad_norm": 7.17055082321167, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8809512853622437, "num_tokens": 496891958.0, "step": 13029 }, { "epoch": 1.6575499300343468, "ewc_loss": 0.06061787158250809, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028147170087322593, "grad_norm": 7.162552356719971, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8562042713165283, "num_tokens": 496928550.0, "step": 13030 }, { "epoch": 1.6576771403129373, "ewc_loss": 0.06075418367981911, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002828348078764975, "grad_norm": 7.174042224884033, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8629781007766724, "num_tokens": 496972009.0, "step": 13031 }, { "epoch": 1.6578043505915279, "ewc_loss": 0.06077203527092934, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002830133307725191, "grad_norm": 7.189670085906982, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8618118762969971, "num_tokens": 497013605.0, "step": 13032 }, { "epoch": 1.6579315608701184, "ewc_loss": 0.060448527336120605, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000282219669315964, "grad_norm": 7.181727886199951, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8664712309837341, "num_tokens": 497048712.0, "step": 13033 }, { "epoch": 1.658058771148709, "ewc_loss": 0.06075379624962807, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028283093706704676, "grad_norm": 7.155429840087891, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8785871267318726, "num_tokens": 497089161.0, "step": 13034 }, { "epoch": 1.6581859814272994, "ewc_loss": 0.06054450571537018, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002831794263329357, "grad_norm": 7.28170108795166, "learning_rate": 1e-06, "loss": 0.5445, "mean_token_accuracy": 0.835975706577301, "num_tokens": 497127253.0, "step": 13035 }, { "epoch": 1.65831319170589, "ewc_loss": 0.06037679314613342, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002815023181028664, "grad_norm": 7.119630813598633, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8595396280288696, "num_tokens": 497167637.0, "step": 13036 }, { "epoch": 1.6584404019844805, "ewc_loss": 0.060749832540750504, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002827912976499647, "grad_norm": 7.159183979034424, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.864403486251831, "num_tokens": 497205663.0, "step": 13037 }, { "epoch": 1.6585676122630708, "ewc_loss": 0.0606609508395195, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002819024957716465, "grad_norm": 7.244874477386475, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8803797364234924, "num_tokens": 497242146.0, "step": 13038 }, { "epoch": 1.6586948225416613, "ewc_loss": 0.060644395649433136, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028173692408017814, "grad_norm": 7.113549709320068, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8691033124923706, "num_tokens": 497279339.0, "step": 13039 }, { "epoch": 1.6588220328202519, "ewc_loss": 0.06069406867027283, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028223366825841367, "grad_norm": 7.165317058563232, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8558103442192078, "num_tokens": 497318530.0, "step": 13040 }, { "epoch": 1.6589492430988424, "ewc_loss": 0.060710132122039795, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002823942922987044, "grad_norm": 7.222021102905273, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8566309213638306, "num_tokens": 497352943.0, "step": 13041 }, { "epoch": 1.659076453377433, "ewc_loss": 0.06066855043172836, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002819784567691386, "grad_norm": 7.2064738273620605, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8589545488357544, "num_tokens": 497387729.0, "step": 13042 }, { "epoch": 1.6592036636560235, "ewc_loss": 0.06043848395347595, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002821192319970578, "grad_norm": 7.171717166900635, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8647488951683044, "num_tokens": 497427875.0, "step": 13043 }, { "epoch": 1.6593308739346138, "ewc_loss": 0.06062560901045799, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002815490588545799, "grad_norm": 7.1833696365356445, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8562532663345337, "num_tokens": 497469076.0, "step": 13044 }, { "epoch": 1.6594580842132043, "ewc_loss": 0.06044088676571846, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002821432426571846, "grad_norm": 7.178522109985352, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8655627369880676, "num_tokens": 497509556.0, "step": 13045 }, { "epoch": 1.6595852944917948, "ewc_loss": 0.06063924729824066, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002816854394041002, "grad_norm": 7.206935405731201, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.856055736541748, "num_tokens": 497548151.0, "step": 13046 }, { "epoch": 1.6597125047703853, "ewc_loss": 0.06053685396909714, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002806615084409714, "grad_norm": 7.165408611297607, "learning_rate": 1e-06, "loss": 0.5453, "mean_token_accuracy": 0.8375869393348694, "num_tokens": 497589088.0, "step": 13047 }, { "epoch": 1.6598397150489759, "ewc_loss": 0.060697220265865326, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002822651877067983, "grad_norm": 7.132671356201172, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8735942840576172, "num_tokens": 497632095.0, "step": 13048 }, { "epoch": 1.6599669253275664, "ewc_loss": 0.06068883836269379, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028218133957125247, "grad_norm": 7.225100040435791, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8749181032180786, "num_tokens": 497665148.0, "step": 13049 }, { "epoch": 1.660094135606157, "ewc_loss": 0.060571178793907166, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002810047590173781, "grad_norm": 7.175262928009033, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.85785311460495, "num_tokens": 497700134.0, "step": 13050 }, { "epoch": 1.6602213458847475, "ewc_loss": 0.06065985932946205, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028189155273139477, "grad_norm": 7.141909599304199, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8689069747924805, "num_tokens": 497743101.0, "step": 13051 }, { "epoch": 1.660348556163338, "ewc_loss": 0.06069079041481018, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028220086824148893, "grad_norm": 7.224730491638184, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8557544946670532, "num_tokens": 497775112.0, "step": 13052 }, { "epoch": 1.6604757664419285, "ewc_loss": 0.0605396069586277, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028068904066458344, "grad_norm": 7.075766086578369, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8747537136077881, "num_tokens": 497813320.0, "step": 13053 }, { "epoch": 1.660602976720519, "ewc_loss": 0.06082718074321747, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028356476104818285, "grad_norm": 7.1849141120910645, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8583247065544128, "num_tokens": 497852780.0, "step": 13054 }, { "epoch": 1.6607301869991096, "ewc_loss": 0.06055428460240364, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028083581128157675, "grad_norm": 7.1948041915893555, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.866081953048706, "num_tokens": 497886313.0, "step": 13055 }, { "epoch": 1.6608573972777, "ewc_loss": 0.060694292187690735, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002822358801495284, "grad_norm": 7.197452068328857, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.861922025680542, "num_tokens": 497924108.0, "step": 13056 }, { "epoch": 1.6609846075562906, "ewc_loss": 0.0606280118227005, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002815730986185372, "grad_norm": 7.155954837799072, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8542250394821167, "num_tokens": 497964656.0, "step": 13057 }, { "epoch": 1.6611118178348812, "ewc_loss": 0.060517288744449615, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002829072473105043, "grad_norm": 7.197265148162842, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8498001098632812, "num_tokens": 498005436.0, "step": 13058 }, { "epoch": 1.6612390281134717, "ewc_loss": 0.06065269559621811, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002818198991008103, "grad_norm": 7.146485805511475, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8602996468544006, "num_tokens": 498042237.0, "step": 13059 }, { "epoch": 1.6613662383920622, "ewc_loss": 0.06074656546115875, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028275864315219223, "grad_norm": 7.182375431060791, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8624093532562256, "num_tokens": 498083008.0, "step": 13060 }, { "epoch": 1.6614934486706527, "ewc_loss": 0.06060054898262024, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028373984969221056, "grad_norm": 7.269083499908447, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8415912389755249, "num_tokens": 498120550.0, "step": 13061 }, { "epoch": 1.661620658949243, "ewc_loss": 0.0605759397149086, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028105234378017485, "grad_norm": 7.18090295791626, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8600131869316101, "num_tokens": 498153033.0, "step": 13062 }, { "epoch": 1.6617478692278336, "ewc_loss": 0.06083352118730545, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028362817829474807, "grad_norm": 7.167483329772949, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8657748699188232, "num_tokens": 498190491.0, "step": 13063 }, { "epoch": 1.661875079506424, "ewc_loss": 0.0607156977057457, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002824499679263681, "grad_norm": 7.1337761878967285, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.85710209608078, "num_tokens": 498235889.0, "step": 13064 }, { "epoch": 1.6620022897850146, "ewc_loss": 0.06074812263250351, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002827742137014866, "grad_norm": 7.200784683227539, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8605519533157349, "num_tokens": 498277191.0, "step": 13065 }, { "epoch": 1.6621295000636052, "ewc_loss": 0.06073037534952164, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002825967385433614, "grad_norm": 7.183498859405518, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8522660136222839, "num_tokens": 498318443.0, "step": 13066 }, { "epoch": 1.6622567103421957, "ewc_loss": 0.060735270380973816, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028264569118618965, "grad_norm": 7.192499160766602, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8514177203178406, "num_tokens": 498355955.0, "step": 13067 }, { "epoch": 1.662383920620786, "ewc_loss": 0.06079184263944626, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002832113823387772, "grad_norm": 7.200335502624512, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8639530539512634, "num_tokens": 498388588.0, "step": 13068 }, { "epoch": 1.6625111308993765, "ewc_loss": 0.06045743077993393, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028230869793333113, "grad_norm": 7.251949787139893, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8626171350479126, "num_tokens": 498430661.0, "step": 13069 }, { "epoch": 1.662638341177967, "ewc_loss": 0.060742564499378204, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002827186253853142, "grad_norm": 7.151076793670654, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8522298336029053, "num_tokens": 498470428.0, "step": 13070 }, { "epoch": 1.6627655514565576, "ewc_loss": 0.06076745688915253, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002829675213433802, "grad_norm": 7.15366792678833, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8635002374649048, "num_tokens": 498512031.0, "step": 13071 }, { "epoch": 1.662892761735148, "ewc_loss": 0.0604911744594574, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028264609863981605, "grad_norm": 7.185201168060303, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8855603337287903, "num_tokens": 498548344.0, "step": 13072 }, { "epoch": 1.6630199720137386, "ewc_loss": 0.06046908348798752, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002824252296704799, "grad_norm": 7.15672492980957, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8766078948974609, "num_tokens": 498590115.0, "step": 13073 }, { "epoch": 1.6631471822923292, "ewc_loss": 0.06053078547120094, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028304223087616265, "grad_norm": 7.157796382904053, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8644247055053711, "num_tokens": 498634034.0, "step": 13074 }, { "epoch": 1.6632743925709197, "ewc_loss": 0.060515254735946655, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028288690373301506, "grad_norm": 7.218100547790527, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8554364442825317, "num_tokens": 498675086.0, "step": 13075 }, { "epoch": 1.6634016028495102, "ewc_loss": 0.06049978360533714, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002827322168741375, "grad_norm": 7.1193437576293945, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8709458112716675, "num_tokens": 498711374.0, "step": 13076 }, { "epoch": 1.6635288131281007, "ewc_loss": 0.06060473620891571, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002837817301042378, "grad_norm": 7.220766544342041, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.869830846786499, "num_tokens": 498753050.0, "step": 13077 }, { "epoch": 1.6636560234066913, "ewc_loss": 0.060545310378074646, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028318745899014175, "grad_norm": 7.251461982727051, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8653457164764404, "num_tokens": 498790406.0, "step": 13078 }, { "epoch": 1.6637832336852818, "ewc_loss": 0.06052020937204361, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002829364675562829, "grad_norm": 7.202212810516357, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8622024059295654, "num_tokens": 498827712.0, "step": 13079 }, { "epoch": 1.6639104439638723, "ewc_loss": 0.060573428869247437, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002834686602000147, "grad_norm": 7.19351863861084, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.868245005607605, "num_tokens": 498862283.0, "step": 13080 }, { "epoch": 1.6640376542424629, "ewc_loss": 0.05995211750268936, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028213835321366787, "grad_norm": 7.219499588012695, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8643524646759033, "num_tokens": 498902686.0, "step": 13081 }, { "epoch": 1.6641648645210534, "ewc_loss": 0.06007591262459755, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002833763137459755, "grad_norm": 7.227725028991699, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8529328107833862, "num_tokens": 498942785.0, "step": 13082 }, { "epoch": 1.664292074799644, "ewc_loss": 0.06004815548658371, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002830987505149096, "grad_norm": 7.201254844665527, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8690088987350464, "num_tokens": 498982761.0, "step": 13083 }, { "epoch": 1.6644192850782344, "ewc_loss": 0.06051947548985481, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002829291333910078, "grad_norm": 7.186007499694824, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8452763557434082, "num_tokens": 499025762.0, "step": 13084 }, { "epoch": 1.664546495356825, "ewc_loss": 0.06057672202587128, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028350160573609173, "grad_norm": 7.275649070739746, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8545079231262207, "num_tokens": 499063394.0, "step": 13085 }, { "epoch": 1.6646737056354155, "ewc_loss": 0.060483675450086594, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002825711271725595, "grad_norm": 7.123246669769287, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8742387890815735, "num_tokens": 499105824.0, "step": 13086 }, { "epoch": 1.6648009159140058, "ewc_loss": 0.060968492180109024, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028497789753600955, "grad_norm": 7.251730442047119, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8714426755905151, "num_tokens": 499140935.0, "step": 13087 }, { "epoch": 1.6649281261925963, "ewc_loss": 0.060526520013809204, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028299959376454353, "grad_norm": 7.185333728790283, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8638646602630615, "num_tokens": 499179492.0, "step": 13088 }, { "epoch": 1.6650553364711869, "ewc_loss": 0.060664501041173935, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002843793772626668, "grad_norm": 7.2315287590026855, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8749312162399292, "num_tokens": 499214716.0, "step": 13089 }, { "epoch": 1.6651825467497774, "ewc_loss": 0.06055638566613197, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002832982281688601, "grad_norm": 7.155242919921875, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8696099519729614, "num_tokens": 499250337.0, "step": 13090 }, { "epoch": 1.665309757028368, "ewc_loss": 0.06066863238811493, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002844206755980849, "grad_norm": 7.222162246704102, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8677783012390137, "num_tokens": 499283675.0, "step": 13091 }, { "epoch": 1.6654369673069584, "ewc_loss": 0.06002787500619888, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028289592592045665, "grad_norm": 7.1843461990356445, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8687861561775208, "num_tokens": 499319153.0, "step": 13092 }, { "epoch": 1.6655641775855488, "ewc_loss": 0.060228899121284485, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002849061565939337, "grad_norm": 7.217487812042236, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.861076831817627, "num_tokens": 499356675.0, "step": 13093 }, { "epoch": 1.6656913878641393, "ewc_loss": 0.060018762946128845, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028280483093112707, "grad_norm": 7.149372100830078, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8790332078933716, "num_tokens": 499392129.0, "step": 13094 }, { "epoch": 1.6658185981427298, "ewc_loss": 0.06073082983493805, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002850426535587758, "grad_norm": 7.249490261077881, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8752157688140869, "num_tokens": 499424958.0, "step": 13095 }, { "epoch": 1.6659458084213203, "ewc_loss": 0.060520462691783905, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028293902869336307, "grad_norm": 7.117690563201904, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8727585673332214, "num_tokens": 499467769.0, "step": 13096 }, { "epoch": 1.6660730186999109, "ewc_loss": 0.06081897020339966, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028592406306415796, "grad_norm": 7.287605285644531, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8541696071624756, "num_tokens": 499513828.0, "step": 13097 }, { "epoch": 1.6662002289785014, "ewc_loss": 0.0604780912399292, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028251527692191303, "grad_norm": 7.150494575500488, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8696573972702026, "num_tokens": 499548219.0, "step": 13098 }, { "epoch": 1.666327439257092, "ewc_loss": 0.060882117599248886, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028655555797740817, "grad_norm": 7.342710494995117, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8480282425880432, "num_tokens": 499587158.0, "step": 13099 }, { "epoch": 1.6664546495356825, "ewc_loss": 0.06041516736149788, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028188605210743845, "grad_norm": 7.129835605621338, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8674242496490479, "num_tokens": 499625235.0, "step": 13100 }, { "epoch": 1.666581859814273, "ewc_loss": 0.06086433306336403, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028637770446948707, "grad_norm": 7.27840518951416, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8443683385848999, "num_tokens": 499662842.0, "step": 13101 }, { "epoch": 1.6667090700928635, "ewc_loss": 0.06046690791845322, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028240346000529826, "grad_norm": 7.17738676071167, "learning_rate": 1e-06, "loss": 0.5288, "mean_token_accuracy": 0.8406566381454468, "num_tokens": 499703464.0, "step": 13102 }, { "epoch": 1.666836280371454, "ewc_loss": 0.060689013451337814, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002846245188266039, "grad_norm": 7.219295501708984, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8628142476081848, "num_tokens": 499739160.0, "step": 13103 }, { "epoch": 1.6669634906500446, "ewc_loss": 0.060594238340854645, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028367675258778036, "grad_norm": 7.2385149002075195, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8509809374809265, "num_tokens": 499775024.0, "step": 13104 }, { "epoch": 1.667090700928635, "ewc_loss": 0.060628026723861694, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002840146189555526, "grad_norm": 7.224478244781494, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8666318655014038, "num_tokens": 499816479.0, "step": 13105 }, { "epoch": 1.6672179112072256, "ewc_loss": 0.06056172028183937, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028335157549008727, "grad_norm": 7.204869747161865, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8666409254074097, "num_tokens": 499848790.0, "step": 13106 }, { "epoch": 1.6673451214858162, "ewc_loss": 0.06061091646552086, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002838435466401279, "grad_norm": 7.1634368896484375, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8832868337631226, "num_tokens": 499890955.0, "step": 13107 }, { "epoch": 1.6674723317644067, "ewc_loss": 0.060602232813835144, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000283756700810045, "grad_norm": 7.245594024658203, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8724679946899414, "num_tokens": 499924813.0, "step": 13108 }, { "epoch": 1.6675995420429972, "ewc_loss": 0.06055000424385071, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002832344325724989, "grad_norm": 7.195683479309082, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.860715925693512, "num_tokens": 499960032.0, "step": 13109 }, { "epoch": 1.6677267523215877, "ewc_loss": 0.06061653047800064, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028389968792907894, "grad_norm": 7.205782890319824, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8784363865852356, "num_tokens": 499995868.0, "step": 13110 }, { "epoch": 1.667853962600178, "ewc_loss": 0.06061436980962753, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002838780637830496, "grad_norm": 7.17048454284668, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8690592050552368, "num_tokens": 500031662.0, "step": 13111 }, { "epoch": 1.6679811728787686, "ewc_loss": 0.06067591533064842, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028449352248571813, "grad_norm": 7.161075592041016, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8551602363586426, "num_tokens": 500070189.0, "step": 13112 }, { "epoch": 1.668108383157359, "ewc_loss": 0.06056775897741318, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000283411965938285, "grad_norm": 7.164097785949707, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8601564764976501, "num_tokens": 500109999.0, "step": 13113 }, { "epoch": 1.6682355934359496, "ewc_loss": 0.06066758185625076, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002844101982191205, "grad_norm": 7.211287498474121, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8474843502044678, "num_tokens": 500151473.0, "step": 13114 }, { "epoch": 1.6683628037145402, "ewc_loss": 0.060620106756687164, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002839354274328798, "grad_norm": 7.231345176696777, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8626738786697388, "num_tokens": 500187436.0, "step": 13115 }, { "epoch": 1.6684900139931307, "ewc_loss": 0.06061916425824165, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002839260268956423, "grad_norm": 7.170318126678467, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8631846904754639, "num_tokens": 500222433.0, "step": 13116 }, { "epoch": 1.668617224271721, "ewc_loss": 0.0606687068939209, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028442146140150726, "grad_norm": 7.199085712432861, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8661172389984131, "num_tokens": 500257390.0, "step": 13117 }, { "epoch": 1.6687444345503115, "ewc_loss": 0.06068172678351402, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028455164283514023, "grad_norm": 7.181083679199219, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8643690943717957, "num_tokens": 500301260.0, "step": 13118 }, { "epoch": 1.668871644828902, "ewc_loss": 0.060697272419929504, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002847071154974401, "grad_norm": 7.208561420440674, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8697688579559326, "num_tokens": 500343453.0, "step": 13119 }, { "epoch": 1.6689988551074926, "ewc_loss": 0.060619011521339417, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028392451349645853, "grad_norm": 7.243497848510742, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8709867000579834, "num_tokens": 500375133.0, "step": 13120 }, { "epoch": 1.669126065386083, "ewc_loss": 0.06054380163550377, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028317238320596516, "grad_norm": 7.208123683929443, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8592740297317505, "num_tokens": 500408540.0, "step": 13121 }, { "epoch": 1.6692532756646736, "ewc_loss": 0.060667745769023895, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002844118280336261, "grad_norm": 7.20619535446167, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8770266175270081, "num_tokens": 500446243.0, "step": 13122 }, { "epoch": 1.6693804859432642, "ewc_loss": 0.060581039637327194, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028354476671665907, "grad_norm": 7.227560520172119, "learning_rate": 1e-06, "loss": 0.5448, "mean_token_accuracy": 0.8410981893539429, "num_tokens": 500484313.0, "step": 13123 }, { "epoch": 1.6695076962218547, "ewc_loss": 0.06058695912361145, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002836039930116385, "grad_norm": 7.180593013763428, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8759618997573853, "num_tokens": 500525566.0, "step": 13124 }, { "epoch": 1.6696349065004452, "ewc_loss": 0.06068456172943115, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002845799899660051, "grad_norm": 7.226881504058838, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8692531585693359, "num_tokens": 500566274.0, "step": 13125 }, { "epoch": 1.6697621167790357, "ewc_loss": 0.06059228628873825, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028365725302137434, "grad_norm": 7.237489223480225, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8566250801086426, "num_tokens": 500602769.0, "step": 13126 }, { "epoch": 1.6698893270576263, "ewc_loss": 0.0606813058257103, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028454745188355446, "grad_norm": 7.234708786010742, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8475065231323242, "num_tokens": 500642895.0, "step": 13127 }, { "epoch": 1.6700165373362168, "ewc_loss": 0.06062158942222595, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028395027038641274, "grad_norm": 7.204721927642822, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8616028428077698, "num_tokens": 500681461.0, "step": 13128 }, { "epoch": 1.6701437476148073, "ewc_loss": 0.060531266033649445, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000283047033008188, "grad_norm": 7.168258190155029, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8477563261985779, "num_tokens": 500723395.0, "step": 13129 }, { "epoch": 1.6702709578933979, "ewc_loss": 0.06058112904429436, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028354566893540323, "grad_norm": 7.224304676055908, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8653198480606079, "num_tokens": 500760571.0, "step": 13130 }, { "epoch": 1.6703981681719884, "ewc_loss": 0.060604583472013474, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028378021670505404, "grad_norm": 7.216787815093994, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8552088737487793, "num_tokens": 500797858.0, "step": 13131 }, { "epoch": 1.670525378450579, "ewc_loss": 0.060540299862623215, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002831373712979257, "grad_norm": 7.163960933685303, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8698854446411133, "num_tokens": 500840199.0, "step": 13132 }, { "epoch": 1.6706525887291694, "ewc_loss": 0.06072492152452469, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002849835727829486, "grad_norm": 7.199795722961426, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8580824732780457, "num_tokens": 500883290.0, "step": 13133 }, { "epoch": 1.67077979900776, "ewc_loss": 0.06064268946647644, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028416127315722406, "grad_norm": 7.312355041503906, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8677892684936523, "num_tokens": 500923072.0, "step": 13134 }, { "epoch": 1.6709070092863505, "ewc_loss": 0.060587748885154724, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002836118801496923, "grad_norm": 7.22845983505249, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8665519952774048, "num_tokens": 500958726.0, "step": 13135 }, { "epoch": 1.6710342195649408, "ewc_loss": 0.06069865822792053, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002847209689207375, "grad_norm": 7.3116631507873535, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8604781031608582, "num_tokens": 500996746.0, "step": 13136 }, { "epoch": 1.6711614298435313, "ewc_loss": 0.06050202250480652, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002827546268235892, "grad_norm": 7.205899715423584, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8560779094696045, "num_tokens": 501034135.0, "step": 13137 }, { "epoch": 1.6712886401221219, "ewc_loss": 0.0605723112821579, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002834575134329498, "grad_norm": 7.22114896774292, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8660107254981995, "num_tokens": 501069810.0, "step": 13138 }, { "epoch": 1.6714158504007124, "ewc_loss": 0.060540635138750076, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028314071823842824, "grad_norm": 7.215902805328369, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8573755621910095, "num_tokens": 501110597.0, "step": 13139 }, { "epoch": 1.671543060679303, "ewc_loss": 0.060502439737319946, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002827587886713445, "grad_norm": 7.24537467956543, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8519961833953857, "num_tokens": 501145927.0, "step": 13140 }, { "epoch": 1.6716702709578934, "ewc_loss": 0.06053648889064789, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002830992452800274, "grad_norm": 7.167418479919434, "learning_rate": 1e-06, "loss": 0.5218, "mean_token_accuracy": 0.8439345359802246, "num_tokens": 501188473.0, "step": 13141 }, { "epoch": 1.6717974812364838, "ewc_loss": 0.0606827586889267, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028456197469495237, "grad_norm": 7.258225917816162, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8763726353645325, "num_tokens": 501226143.0, "step": 13142 }, { "epoch": 1.6719246915150743, "ewc_loss": 0.060517072677612305, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002829050936270505, "grad_norm": 7.170777797698975, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.854692816734314, "num_tokens": 501266980.0, "step": 13143 }, { "epoch": 1.6720519017936648, "ewc_loss": 0.060595281422138214, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002836872008629143, "grad_norm": 7.206762790679932, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8605902791023254, "num_tokens": 501305414.0, "step": 13144 }, { "epoch": 1.6721791120722553, "ewc_loss": 0.060668859630823135, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000284422974800691, "grad_norm": 7.269922733306885, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8553171753883362, "num_tokens": 501343009.0, "step": 13145 }, { "epoch": 1.6723063223508459, "ewc_loss": 0.06052786856889725, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000283013068838045, "grad_norm": 7.142330169677734, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8693305850028992, "num_tokens": 501383571.0, "step": 13146 }, { "epoch": 1.6724335326294364, "ewc_loss": 0.06058527156710625, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028358708368614316, "grad_norm": 7.135377407073975, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8662393093109131, "num_tokens": 501425444.0, "step": 13147 }, { "epoch": 1.672560742908027, "ewc_loss": 0.06067084148526192, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028444279450923204, "grad_norm": 7.1849164962768555, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8760455846786499, "num_tokens": 501466536.0, "step": 13148 }, { "epoch": 1.6726879531866174, "ewc_loss": 0.06060636788606644, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028379802824929357, "grad_norm": 7.193920612335205, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8664199113845825, "num_tokens": 501507346.0, "step": 13149 }, { "epoch": 1.672815163465208, "ewc_loss": 0.06072685122489929, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002850028977263719, "grad_norm": 7.23588752746582, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8708589673042297, "num_tokens": 501545515.0, "step": 13150 }, { "epoch": 1.6729423737437985, "ewc_loss": 0.06067657470703125, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002845001290552318, "grad_norm": 7.176061153411865, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8565882444381714, "num_tokens": 501588793.0, "step": 13151 }, { "epoch": 1.673069584022389, "ewc_loss": 0.060751937329769135, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028525374364107847, "grad_norm": 7.244768142700195, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8618724942207336, "num_tokens": 501624384.0, "step": 13152 }, { "epoch": 1.6731967943009796, "ewc_loss": 0.06060933321714401, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028382771415635943, "grad_norm": 7.161574840545654, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8639260530471802, "num_tokens": 501663150.0, "step": 13153 }, { "epoch": 1.67332400457957, "ewc_loss": 0.06084801256656647, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028621451929211617, "grad_norm": 7.258060455322266, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8566461801528931, "num_tokens": 501706177.0, "step": 13154 }, { "epoch": 1.6734512148581606, "ewc_loss": 0.060592204332351685, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002836564090102911, "grad_norm": 7.1983819007873535, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8749367594718933, "num_tokens": 501745267.0, "step": 13155 }, { "epoch": 1.6735784251367511, "ewc_loss": 0.06083159148693085, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002860503154806793, "grad_norm": 7.2504706382751465, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8591580986976624, "num_tokens": 501783591.0, "step": 13156 }, { "epoch": 1.6737056354153417, "ewc_loss": 0.060677506029605865, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002845094131771475, "grad_norm": 7.228943824768066, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8537408709526062, "num_tokens": 501825190.0, "step": 13157 }, { "epoch": 1.6738328456939322, "ewc_loss": 0.0607459582388401, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028519396437332034, "grad_norm": 7.296098709106445, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.862648606300354, "num_tokens": 501855813.0, "step": 13158 }, { "epoch": 1.6739600559725227, "ewc_loss": 0.060628436505794525, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000284018722595647, "grad_norm": 7.2335076332092285, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8603572845458984, "num_tokens": 501890106.0, "step": 13159 }, { "epoch": 1.674087266251113, "ewc_loss": 0.06066029518842697, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028433732222765684, "grad_norm": 7.180915832519531, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8495725989341736, "num_tokens": 501935437.0, "step": 13160 }, { "epoch": 1.6742144765297036, "ewc_loss": 0.06071038544178009, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002848382282536477, "grad_norm": 7.210372447967529, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8514729738235474, "num_tokens": 501980134.0, "step": 13161 }, { "epoch": 1.674341686808294, "ewc_loss": 0.06072959676384926, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002850303426384926, "grad_norm": 7.2610063552856445, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8588221669197083, "num_tokens": 502015977.0, "step": 13162 }, { "epoch": 1.6744688970868846, "ewc_loss": 0.06069384515285492, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002846728020813316, "grad_norm": 7.260085582733154, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.864823579788208, "num_tokens": 502061717.0, "step": 13163 }, { "epoch": 1.6745961073654752, "ewc_loss": 0.06067512556910515, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028448563534766436, "grad_norm": 7.2030181884765625, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8624891042709351, "num_tokens": 502101594.0, "step": 13164 }, { "epoch": 1.6747233176440657, "ewc_loss": 0.0607764832675457, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028549920534715056, "grad_norm": 7.326610565185547, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8632851839065552, "num_tokens": 502136521.0, "step": 13165 }, { "epoch": 1.674850527922656, "ewc_loss": 0.06059323623776436, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028366674087010324, "grad_norm": 7.2168684005737305, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8432340621948242, "num_tokens": 502174821.0, "step": 13166 }, { "epoch": 1.6749777382012465, "ewc_loss": 0.060762353241443634, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002853579062502831, "grad_norm": 7.260950088500977, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8571585416793823, "num_tokens": 502210615.0, "step": 13167 }, { "epoch": 1.675104948479837, "ewc_loss": 0.06061229854822159, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028385737095959485, "grad_norm": 7.261909484863281, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8635472655296326, "num_tokens": 502248220.0, "step": 13168 }, { "epoch": 1.6752321587584276, "ewc_loss": 0.060763172805309296, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028536608442664146, "grad_norm": 7.184385299682617, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8628831505775452, "num_tokens": 502290686.0, "step": 13169 }, { "epoch": 1.675359369037018, "ewc_loss": 0.060814015567302704, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028587455744855106, "grad_norm": 7.2718400955200195, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8512775897979736, "num_tokens": 502331789.0, "step": 13170 }, { "epoch": 1.6754865793156086, "ewc_loss": 0.060648880898952484, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028422317700460553, "grad_norm": 7.21291446685791, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8698086738586426, "num_tokens": 502371490.0, "step": 13171 }, { "epoch": 1.6756137895941992, "ewc_loss": 0.06080051138997078, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028573948657140136, "grad_norm": 7.235795021057129, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8620820045471191, "num_tokens": 502415606.0, "step": 13172 }, { "epoch": 1.6757409998727897, "ewc_loss": 0.06064832955598831, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028421764727681875, "grad_norm": 7.174949645996094, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8626607656478882, "num_tokens": 502456203.0, "step": 13173 }, { "epoch": 1.6758682101513802, "ewc_loss": 0.06088455766439438, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028657997609116137, "grad_norm": 7.262528896331787, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8788148164749146, "num_tokens": 502491508.0, "step": 13174 }, { "epoch": 1.6759954204299707, "ewc_loss": 0.0606469102203846, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002842034737113863, "grad_norm": 7.123610019683838, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8523588180541992, "num_tokens": 502533408.0, "step": 13175 }, { "epoch": 1.6761226307085613, "ewc_loss": 0.06101061403751373, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002878405211959034, "grad_norm": 7.296416282653809, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8552483320236206, "num_tokens": 502569824.0, "step": 13176 }, { "epoch": 1.6762498409871518, "ewc_loss": 0.06067509204149246, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028448531520552933, "grad_norm": 7.182265758514404, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8707848191261292, "num_tokens": 502604776.0, "step": 13177 }, { "epoch": 1.6763770512657423, "ewc_loss": 0.06101951003074646, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028792949160560966, "grad_norm": 7.278294086456299, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8449889421463013, "num_tokens": 502645732.0, "step": 13178 }, { "epoch": 1.6765042615443329, "ewc_loss": 0.06077319383621216, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000285466288914904, "grad_norm": 7.22764778137207, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8662490844726562, "num_tokens": 502686044.0, "step": 13179 }, { "epoch": 1.6766314718229234, "ewc_loss": 0.06090192496776581, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028675360954366624, "grad_norm": 7.254339218139648, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.854224443435669, "num_tokens": 502728288.0, "step": 13180 }, { "epoch": 1.676758682101514, "ewc_loss": 0.06087508797645569, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002864852431230247, "grad_norm": 7.219332695007324, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8550403118133545, "num_tokens": 502760772.0, "step": 13181 }, { "epoch": 1.6768858923801044, "ewc_loss": 0.06092781573534012, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002870125463232398, "grad_norm": 7.233974933624268, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8632746934890747, "num_tokens": 502791585.0, "step": 13182 }, { "epoch": 1.677013102658695, "ewc_loss": 0.060878559947013855, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002865199639927596, "grad_norm": 7.234550476074219, "learning_rate": 1e-06, "loss": 0.4086, "mean_token_accuracy": 0.880070686340332, "num_tokens": 502826339.0, "step": 13183 }, { "epoch": 1.6771403129372855, "ewc_loss": 0.06085710972547531, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002863054978661239, "grad_norm": 7.207494735717773, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8794716596603394, "num_tokens": 502868716.0, "step": 13184 }, { "epoch": 1.6772675232158758, "ewc_loss": 0.060924410820007324, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002869784948416054, "grad_norm": 7.2263617515563965, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8623150587081909, "num_tokens": 502907992.0, "step": 13185 }, { "epoch": 1.6773947334944663, "ewc_loss": 0.06090327352285385, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028676711372099817, "grad_norm": 7.248819351196289, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8564850091934204, "num_tokens": 502947197.0, "step": 13186 }, { "epoch": 1.6775219437730569, "ewc_loss": 0.06088951975107193, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002866295981220901, "grad_norm": 7.236047267913818, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8726848363876343, "num_tokens": 502985890.0, "step": 13187 }, { "epoch": 1.6776491540516474, "ewc_loss": 0.06098037213087082, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002875381032936275, "grad_norm": 7.253769397735596, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8566276431083679, "num_tokens": 503033476.0, "step": 13188 }, { "epoch": 1.677776364330238, "ewc_loss": 0.06085734814405441, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002863078552763909, "grad_norm": 7.260402679443359, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8713856935501099, "num_tokens": 503068002.0, "step": 13189 }, { "epoch": 1.6779035746088284, "ewc_loss": 0.06114886701107025, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002867816365323961, "grad_norm": 7.307539463043213, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8557224273681641, "num_tokens": 503110059.0, "step": 13190 }, { "epoch": 1.6780307848874187, "ewc_loss": 0.060801535844802856, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028574973111972213, "grad_norm": 7.230919361114502, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8569118976593018, "num_tokens": 503146617.0, "step": 13191 }, { "epoch": 1.6781579951660093, "ewc_loss": 0.06090874224901199, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028682179981842637, "grad_norm": 7.249370098114014, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8605960607528687, "num_tokens": 503186436.0, "step": 13192 }, { "epoch": 1.6782852054445998, "ewc_loss": 0.060775015503168106, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028548453701660037, "grad_norm": 7.28538179397583, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.867870032787323, "num_tokens": 503227983.0, "step": 13193 }, { "epoch": 1.6784124157231903, "ewc_loss": 0.06078440696001053, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002855784259736538, "grad_norm": 7.262295722961426, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8501282334327698, "num_tokens": 503269301.0, "step": 13194 }, { "epoch": 1.6785396260017809, "ewc_loss": 0.060842081904411316, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028615520568564534, "grad_norm": 7.267640113830566, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8450438976287842, "num_tokens": 503308919.0, "step": 13195 }, { "epoch": 1.6786668362803714, "ewc_loss": 0.06083120405673981, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028604641556739807, "grad_norm": 7.303238391876221, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8719006180763245, "num_tokens": 503345030.0, "step": 13196 }, { "epoch": 1.678794046558962, "ewc_loss": 0.0607018768787384, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028475315775722265, "grad_norm": 7.224955081939697, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8554762601852417, "num_tokens": 503379142.0, "step": 13197 }, { "epoch": 1.6789212568375524, "ewc_loss": 0.06077173352241516, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002854517078958452, "grad_norm": 7.226437091827393, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8620067834854126, "num_tokens": 503411810.0, "step": 13198 }, { "epoch": 1.679048467116143, "ewc_loss": 0.060750339180231094, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002852377656381577, "grad_norm": 7.216440677642822, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8597535490989685, "num_tokens": 503450266.0, "step": 13199 }, { "epoch": 1.6791756773947335, "ewc_loss": 0.06076321750879288, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028536655008792877, "grad_norm": 7.184591770172119, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8699605464935303, "num_tokens": 503491015.0, "step": 13200 }, { "epoch": 1.679302887673324, "ewc_loss": 0.0608346164226532, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028608052525669336, "grad_norm": 7.205041408538818, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.871292769908905, "num_tokens": 503531964.0, "step": 13201 }, { "epoch": 1.6794300979519146, "ewc_loss": 0.06082263961434364, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002859607629943639, "grad_norm": 7.237064838409424, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.865622878074646, "num_tokens": 503576069.0, "step": 13202 }, { "epoch": 1.679557308230505, "ewc_loss": 0.060906972736120224, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002868041046895087, "grad_norm": 7.25681734085083, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8639551401138306, "num_tokens": 503607895.0, "step": 13203 }, { "epoch": 1.6796845185090956, "ewc_loss": 0.0608723983168602, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002864583511836827, "grad_norm": 7.353407859802246, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8574258089065552, "num_tokens": 503636891.0, "step": 13204 }, { "epoch": 1.6798117287876861, "ewc_loss": 0.06082744151353836, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002860087843146175, "grad_norm": 7.26042366027832, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8773223161697388, "num_tokens": 503682037.0, "step": 13205 }, { "epoch": 1.6799389390662767, "ewc_loss": 0.06085405498743057, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002862749097403139, "grad_norm": 7.213798999786377, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.865410566329956, "num_tokens": 503724008.0, "step": 13206 }, { "epoch": 1.6800661493448672, "ewc_loss": 0.06101234257221222, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028541640494950116, "grad_norm": 7.262453079223633, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8719708323478699, "num_tokens": 503759008.0, "step": 13207 }, { "epoch": 1.6801933596234577, "ewc_loss": 0.06079934909939766, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002857278741430491, "grad_norm": 7.235760688781738, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8590496182441711, "num_tokens": 503796599.0, "step": 13208 }, { "epoch": 1.680320569902048, "ewc_loss": 0.06106076389551163, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002859006344806403, "grad_norm": 7.2409987449646, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8604973554611206, "num_tokens": 503835539.0, "step": 13209 }, { "epoch": 1.6804477801806386, "ewc_loss": 0.06073596328496933, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002850940218195319, "grad_norm": 7.189274311065674, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8594775795936584, "num_tokens": 503875933.0, "step": 13210 }, { "epoch": 1.680574990459229, "ewc_loss": 0.06088098883628845, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002865442365873605, "grad_norm": 7.254812240600586, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8599197864532471, "num_tokens": 503913740.0, "step": 13211 }, { "epoch": 1.6807022007378196, "ewc_loss": 0.06101818382740021, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028547478723339736, "grad_norm": 7.2041473388671875, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8660173416137695, "num_tokens": 503955105.0, "step": 13212 }, { "epoch": 1.6808294110164101, "ewc_loss": 0.060993995517492294, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028767433832399547, "grad_norm": 7.321962833404541, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8635162115097046, "num_tokens": 503987641.0, "step": 13213 }, { "epoch": 1.6809566212950007, "ewc_loss": 0.0609600767493248, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028489375836215913, "grad_norm": 7.158000469207764, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8548529148101807, "num_tokens": 504028699.0, "step": 13214 }, { "epoch": 1.681083831573591, "ewc_loss": 0.06131013482809067, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002883943379856646, "grad_norm": 7.33017635345459, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8569483757019043, "num_tokens": 504065597.0, "step": 13215 }, { "epoch": 1.6812110418521815, "ewc_loss": 0.06092619150876999, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002845548733603209, "grad_norm": 7.207633972167969, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8480905294418335, "num_tokens": 504105968.0, "step": 13216 }, { "epoch": 1.681338252130772, "ewc_loss": 0.0612991526722908, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028828447102569044, "grad_norm": 7.351411819458008, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8671225309371948, "num_tokens": 504143743.0, "step": 13217 }, { "epoch": 1.6814654624093626, "ewc_loss": 0.06062242388725281, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028395859408192337, "grad_norm": 7.227868556976318, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8665546178817749, "num_tokens": 504177023.0, "step": 13218 }, { "epoch": 1.681592672687953, "ewc_loss": 0.060854099690914154, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002862753754016012, "grad_norm": 7.358346939086914, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.861363410949707, "num_tokens": 504210492.0, "step": 13219 }, { "epoch": 1.6817198829665436, "ewc_loss": 0.0605936236679554, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000283670611679554, "grad_norm": 7.210049152374268, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8625583648681641, "num_tokens": 504248322.0, "step": 13220 }, { "epoch": 1.6818470932451342, "ewc_loss": 0.06108275428414345, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002861205139197409, "grad_norm": 7.298574447631836, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8481225967407227, "num_tokens": 504285915.0, "step": 13221 }, { "epoch": 1.6819743035237247, "ewc_loss": 0.06060915067791939, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028382588061504066, "grad_norm": 7.228147983551025, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8608490824699402, "num_tokens": 504325578.0, "step": 13222 }, { "epoch": 1.6821015138023152, "ewc_loss": 0.061037465929985046, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002856676292140037, "grad_norm": 7.237120628356934, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8827501535415649, "num_tokens": 504362876.0, "step": 13223 }, { "epoch": 1.6822287240809057, "ewc_loss": 0.06099589169025421, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002852519100997597, "grad_norm": 7.352664470672607, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8610846996307373, "num_tokens": 504402303.0, "step": 13224 }, { "epoch": 1.6823559343594963, "ewc_loss": 0.060813453048467636, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002834275073837489, "grad_norm": 7.1264328956604, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8633655309677124, "num_tokens": 504440039.0, "step": 13225 }, { "epoch": 1.6824831446380868, "ewc_loss": 0.06124631315469742, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028775609098374844, "grad_norm": 7.372964859008789, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8533258438110352, "num_tokens": 504485102.0, "step": 13226 }, { "epoch": 1.6826103549166773, "ewc_loss": 0.06056499481201172, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002833843172993511, "grad_norm": 7.16953706741333, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8615530729293823, "num_tokens": 504522762.0, "step": 13227 }, { "epoch": 1.6827375651952678, "ewc_loss": 0.06102118641138077, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002879462263081223, "grad_norm": 7.267690658569336, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8677034974098206, "num_tokens": 504562818.0, "step": 13228 }, { "epoch": 1.6828647754738584, "ewc_loss": 0.060690585523843765, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002846402348950505, "grad_norm": 7.262563705444336, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8535536527633667, "num_tokens": 504602914.0, "step": 13229 }, { "epoch": 1.682991985752449, "ewc_loss": 0.06087210029363632, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002864553825929761, "grad_norm": 7.238885879516602, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.869040846824646, "num_tokens": 504641376.0, "step": 13230 }, { "epoch": 1.6831191960310394, "ewc_loss": 0.06096553057432175, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028494829894043505, "grad_norm": 7.2040791511535645, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8697001338005066, "num_tokens": 504675031.0, "step": 13231 }, { "epoch": 1.68324640630963, "ewc_loss": 0.06105029210448265, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002857958897948265, "grad_norm": 7.308201313018799, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8687335252761841, "num_tokens": 504709624.0, "step": 13232 }, { "epoch": 1.6833736165882205, "ewc_loss": 0.06094483286142349, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002847412833943963, "grad_norm": 7.168407440185547, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8663550019264221, "num_tokens": 504749310.0, "step": 13233 }, { "epoch": 1.6835008268668108, "ewc_loss": 0.061117984354496, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002864727866835892, "grad_norm": 7.260193824768066, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8646830320358276, "num_tokens": 504791700.0, "step": 13234 }, { "epoch": 1.6836280371454013, "ewc_loss": 0.06075482815504074, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000285282643744722, "grad_norm": 7.211031436920166, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8524140119552612, "num_tokens": 504833246.0, "step": 13235 }, { "epoch": 1.6837552474239919, "ewc_loss": 0.06089203804731369, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002866547729354352, "grad_norm": 7.286693572998047, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8672227263450623, "num_tokens": 504872901.0, "step": 13236 }, { "epoch": 1.6838824577025824, "ewc_loss": 0.06105372682213783, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028583023231476545, "grad_norm": 7.256992340087891, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.856988251209259, "num_tokens": 504906138.0, "step": 13237 }, { "epoch": 1.684009667981173, "ewc_loss": 0.061117738485336304, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002864703710656613, "grad_norm": 7.283411502838135, "learning_rate": 1e-06, "loss": 0.5352, "mean_token_accuracy": 0.8424013257026672, "num_tokens": 504947147.0, "step": 13238 }, { "epoch": 1.6841368782597634, "ewc_loss": 0.06101175397634506, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028541049687191844, "grad_norm": 7.182886600494385, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8646127581596375, "num_tokens": 504985332.0, "step": 13239 }, { "epoch": 1.6842640885383537, "ewc_loss": 0.061184294521808624, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028713588835671544, "grad_norm": 7.293065547943115, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8547446727752686, "num_tokens": 505027995.0, "step": 13240 }, { "epoch": 1.6843912988169443, "ewc_loss": 0.061060190200805664, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028589487192220986, "grad_norm": 7.201110363006592, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8656952381134033, "num_tokens": 505065311.0, "step": 13241 }, { "epoch": 1.6845185090955348, "ewc_loss": 0.06123128533363342, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028760579880326986, "grad_norm": 7.273047924041748, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8580036163330078, "num_tokens": 505105810.0, "step": 13242 }, { "epoch": 1.6846457193741253, "ewc_loss": 0.06110016256570816, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002862946130335331, "grad_norm": 7.252261638641357, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8702718615531921, "num_tokens": 505141003.0, "step": 13243 }, { "epoch": 1.6847729296527159, "ewc_loss": 0.06120070070028305, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002872999757528305, "grad_norm": 7.220500946044922, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8523908853530884, "num_tokens": 505188025.0, "step": 13244 }, { "epoch": 1.6849001399313064, "ewc_loss": 0.06121227145195007, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002874156925827265, "grad_norm": 7.245267868041992, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8778745532035828, "num_tokens": 505226101.0, "step": 13245 }, { "epoch": 1.685027350209897, "ewc_loss": 0.060951318591833115, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028724755975417793, "grad_norm": 7.284839630126953, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.853593647480011, "num_tokens": 505264880.0, "step": 13246 }, { "epoch": 1.6851545604884874, "ewc_loss": 0.06114557012915611, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002867486618924886, "grad_norm": 7.212226390838623, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8559980392456055, "num_tokens": 505306208.0, "step": 13247 }, { "epoch": 1.685281770767078, "ewc_loss": 0.06128464639186859, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002881394175346941, "grad_norm": 7.269094944000244, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8845273852348328, "num_tokens": 505346024.0, "step": 13248 }, { "epoch": 1.6854089810456685, "ewc_loss": 0.0612163171172142, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002874561178032309, "grad_norm": 7.247411251068115, "learning_rate": 1e-06, "loss": 0.3892, "mean_token_accuracy": 0.8860995769500732, "num_tokens": 505383644.0, "step": 13249 }, { "epoch": 1.685536191324259, "ewc_loss": 0.06100108101963997, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028774517704732716, "grad_norm": 7.323293685913086, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8678487539291382, "num_tokens": 505420365.0, "step": 13250 }, { "epoch": 1.6856634016028496, "ewc_loss": 0.060953445732593536, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002872688346542418, "grad_norm": 7.283608436584473, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8639929294586182, "num_tokens": 505458211.0, "step": 13251 }, { "epoch": 1.68579061188144, "ewc_loss": 0.06123274192214012, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002876203798223287, "grad_norm": 7.237673759460449, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8579024076461792, "num_tokens": 505501649.0, "step": 13252 }, { "epoch": 1.6859178221600306, "ewc_loss": 0.06087435036897659, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028647787985391915, "grad_norm": 7.33843469619751, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8652876615524292, "num_tokens": 505538763.0, "step": 13253 }, { "epoch": 1.6860450324386211, "ewc_loss": 0.06102929636836052, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028558593476191163, "grad_norm": 7.1894145011901855, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.8855140209197998, "num_tokens": 505574562.0, "step": 13254 }, { "epoch": 1.6861722427172117, "ewc_loss": 0.06098751723766327, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002876095240935683, "grad_norm": 7.27834415435791, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.877007007598877, "num_tokens": 505609718.0, "step": 13255 }, { "epoch": 1.6862994529958022, "ewc_loss": 0.060869552195072174, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000286429887637496, "grad_norm": 7.282442092895508, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8762872219085693, "num_tokens": 505644337.0, "step": 13256 }, { "epoch": 1.6864266632743927, "ewc_loss": 0.06098208576440811, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002875552454497665, "grad_norm": 7.272158145904541, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.872810959815979, "num_tokens": 505681929.0, "step": 13257 }, { "epoch": 1.686553873552983, "ewc_loss": 0.060828886926174164, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028602321981452405, "grad_norm": 7.285127639770508, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8415017127990723, "num_tokens": 505719373.0, "step": 13258 }, { "epoch": 1.6866810838315736, "ewc_loss": 0.06088387966156006, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028657319489866495, "grad_norm": 7.256407260894775, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8480849862098694, "num_tokens": 505757767.0, "step": 13259 }, { "epoch": 1.686808294110164, "ewc_loss": 0.06093733012676239, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028710768674500287, "grad_norm": 7.320064544677734, "learning_rate": 1e-06, "loss": 0.5888, "mean_token_accuracy": 0.8330196738243103, "num_tokens": 505794777.0, "step": 13260 }, { "epoch": 1.6869355043887546, "ewc_loss": 0.060833640396595, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002860708045773208, "grad_norm": 7.249662399291992, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.87021404504776, "num_tokens": 505832672.0, "step": 13261 }, { "epoch": 1.6870627146673451, "ewc_loss": 0.060931578278541565, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028705017757602036, "grad_norm": 7.247916221618652, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8519095182418823, "num_tokens": 505873142.0, "step": 13262 }, { "epoch": 1.6871899249459357, "ewc_loss": 0.060885801911354065, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002865924034267664, "grad_norm": 7.29551887512207, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.866861879825592, "num_tokens": 505911994.0, "step": 13263 }, { "epoch": 1.687317135224526, "ewc_loss": 0.06082445755600929, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028597895288839936, "grad_norm": 7.2213850021362305, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8676958084106445, "num_tokens": 505948874.0, "step": 13264 }, { "epoch": 1.6874443455031165, "ewc_loss": 0.06088082492351532, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002865426358766854, "grad_norm": 7.261499881744385, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8690810203552246, "num_tokens": 505989679.0, "step": 13265 }, { "epoch": 1.687571555781707, "ewc_loss": 0.06080318242311478, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002857661747839302, "grad_norm": 7.244449138641357, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8711044788360596, "num_tokens": 506033209.0, "step": 13266 }, { "epoch": 1.6876987660602976, "ewc_loss": 0.06090078875422478, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002867422590497881, "grad_norm": 7.309117317199707, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8724654912948608, "num_tokens": 506065379.0, "step": 13267 }, { "epoch": 1.687825976338888, "ewc_loss": 0.060995765030384064, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028525060042738914, "grad_norm": 7.281999111175537, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8553650379180908, "num_tokens": 506099484.0, "step": 13268 }, { "epoch": 1.6879531866174786, "ewc_loss": 0.0611368753015995, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028666172875091434, "grad_norm": 7.293759346008301, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8474726676940918, "num_tokens": 506136559.0, "step": 13269 }, { "epoch": 1.6880803968960691, "ewc_loss": 0.061039697378873825, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000285689951851964, "grad_norm": 7.301460266113281, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8620854020118713, "num_tokens": 506172997.0, "step": 13270 }, { "epoch": 1.6882076071746597, "ewc_loss": 0.06077774614095688, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002855118364095688, "grad_norm": 7.189265727996826, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8772039413452148, "num_tokens": 506209858.0, "step": 13271 }, { "epoch": 1.6883348174532502, "ewc_loss": 0.060965437442064285, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028738874243572354, "grad_norm": 7.262013912200928, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8744765520095825, "num_tokens": 506242260.0, "step": 13272 }, { "epoch": 1.6884620277318407, "ewc_loss": 0.060849882662296295, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028623317484743893, "grad_norm": 7.2746782302856445, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8500440120697021, "num_tokens": 506276856.0, "step": 13273 }, { "epoch": 1.6885892380104313, "ewc_loss": 0.060849614441394806, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028623052639886737, "grad_norm": 7.201889514923096, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8817036151885986, "num_tokens": 506311469.0, "step": 13274 }, { "epoch": 1.6887164482890218, "ewc_loss": 0.06098988652229309, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028763324371539056, "grad_norm": 7.227628231048584, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8723112344741821, "num_tokens": 506348240.0, "step": 13275 }, { "epoch": 1.6888436585676123, "ewc_loss": 0.060983411967754364, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028756848769262433, "grad_norm": 7.206399917602539, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8652558326721191, "num_tokens": 506387203.0, "step": 13276 }, { "epoch": 1.6889708688462028, "ewc_loss": 0.06125660985708237, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002878590894397348, "grad_norm": 7.297658443450928, "learning_rate": 1e-06, "loss": 0.5396, "mean_token_accuracy": 0.8403805494308472, "num_tokens": 506421869.0, "step": 13277 }, { "epoch": 1.6890980791247934, "ewc_loss": 0.06121119111776352, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002874048659577966, "grad_norm": 7.230976581573486, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8638834357261658, "num_tokens": 506458873.0, "step": 13278 }, { "epoch": 1.689225289403384, "ewc_loss": 0.06132100522518158, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028850301168859005, "grad_norm": 7.251079559326172, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8503369688987732, "num_tokens": 506495947.0, "step": 13279 }, { "epoch": 1.6893524996819744, "ewc_loss": 0.061192914843559265, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002872220939025283, "grad_norm": 7.241339206695557, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8462722301483154, "num_tokens": 506533098.0, "step": 13280 }, { "epoch": 1.689479709960565, "ewc_loss": 0.060941919684410095, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002871535543818027, "grad_norm": 7.190279483795166, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.87467360496521, "num_tokens": 506571346.0, "step": 13281 }, { "epoch": 1.6896069202391555, "ewc_loss": 0.06120157986879349, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002873087942134589, "grad_norm": 7.339137554168701, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8657866716384888, "num_tokens": 506607912.0, "step": 13282 }, { "epoch": 1.6897341305177458, "ewc_loss": 0.061075642704963684, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028604938415810466, "grad_norm": 7.200405120849609, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8528419733047485, "num_tokens": 506654258.0, "step": 13283 }, { "epoch": 1.6898613407963363, "ewc_loss": 0.061231911182403564, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028761205612681806, "grad_norm": 7.242991924285889, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8706358075141907, "num_tokens": 506693436.0, "step": 13284 }, { "epoch": 1.6899885510749268, "ewc_loss": 0.06117246672511101, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028701763949356973, "grad_norm": 7.261423110961914, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8664528131484985, "num_tokens": 506737383.0, "step": 13285 }, { "epoch": 1.6901157613535174, "ewc_loss": 0.061060234904289246, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002858953084796667, "grad_norm": 7.219306945800781, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8726357221603394, "num_tokens": 506772282.0, "step": 13286 }, { "epoch": 1.690242971632108, "ewc_loss": 0.06111433729529381, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002864363486878574, "grad_norm": 7.261862277984619, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8753533363342285, "num_tokens": 506807208.0, "step": 13287 }, { "epoch": 1.6903701819106984, "ewc_loss": 0.06100662797689438, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002853592741303146, "grad_norm": 7.2272772789001465, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8711029291152954, "num_tokens": 506847952.0, "step": 13288 }, { "epoch": 1.6904973921892887, "ewc_loss": 0.061154529452323914, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002868382434826344, "grad_norm": 7.290873050689697, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8669299483299255, "num_tokens": 506888265.0, "step": 13289 }, { "epoch": 1.6906246024678793, "ewc_loss": 0.060970790684223175, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002850008604582399, "grad_norm": 7.24926233291626, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8676543831825256, "num_tokens": 506926096.0, "step": 13290 }, { "epoch": 1.6907518127464698, "ewc_loss": 0.06084801256656647, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028621451929211617, "grad_norm": 7.272327899932861, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8574935793876648, "num_tokens": 506962252.0, "step": 13291 }, { "epoch": 1.6908790230250603, "ewc_loss": 0.06096496433019638, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000284942623693496, "grad_norm": 7.240690231323242, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8684020638465881, "num_tokens": 507004709.0, "step": 13292 }, { "epoch": 1.6910062333036509, "ewc_loss": 0.060752496123313904, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002852593606803566, "grad_norm": 7.252357006072998, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8801500797271729, "num_tokens": 507041041.0, "step": 13293 }, { "epoch": 1.6911334435822414, "ewc_loss": 0.06073639169335365, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028509830008260906, "grad_norm": 7.253735542297363, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8574295043945312, "num_tokens": 507080251.0, "step": 13294 }, { "epoch": 1.691260653860832, "ewc_loss": 0.06106530874967575, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002859460364561528, "grad_norm": 7.280226707458496, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8637397289276123, "num_tokens": 507120813.0, "step": 13295 }, { "epoch": 1.6913878641394224, "ewc_loss": 0.06101176515221596, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028541061328724027, "grad_norm": 7.338689804077148, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8505180478096008, "num_tokens": 507160284.0, "step": 13296 }, { "epoch": 1.691515074418013, "ewc_loss": 0.06110269948840141, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002863199624698609, "grad_norm": 7.231545925140381, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8559278249740601, "num_tokens": 507205069.0, "step": 13297 }, { "epoch": 1.6916422846966035, "ewc_loss": 0.06115470081567764, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028683996060863137, "grad_norm": 7.271364212036133, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8730879426002502, "num_tokens": 507244689.0, "step": 13298 }, { "epoch": 1.691769494975194, "ewc_loss": 0.06100809574127197, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002853739133570343, "grad_norm": 7.3225884437561035, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8653891086578369, "num_tokens": 507283917.0, "step": 13299 }, { "epoch": 1.6918967052537845, "ewc_loss": 0.060854360461235046, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028627796564251184, "grad_norm": 7.3093132972717285, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8662230968475342, "num_tokens": 507327937.0, "step": 13300 }, { "epoch": 1.692023915532375, "ewc_loss": 0.06105387210845947, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028583171661011875, "grad_norm": 7.189755439758301, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8763877153396606, "num_tokens": 507369521.0, "step": 13301 }, { "epoch": 1.6921511258109656, "ewc_loss": 0.06090777367353439, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028681213734671474, "grad_norm": 7.387100696563721, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8739267587661743, "num_tokens": 507408157.0, "step": 13302 }, { "epoch": 1.6922783360895561, "ewc_loss": 0.06067252904176712, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028445967473089695, "grad_norm": 7.173443794250488, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8704571723937988, "num_tokens": 507448003.0, "step": 13303 }, { "epoch": 1.6924055463681467, "ewc_loss": 0.061017975211143494, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002879141247831285, "grad_norm": 7.285312652587891, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8722219467163086, "num_tokens": 507489970.0, "step": 13304 }, { "epoch": 1.6925327566467372, "ewc_loss": 0.06082139164209366, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028594827745109797, "grad_norm": 7.288559436798096, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8659472465515137, "num_tokens": 507524536.0, "step": 13305 }, { "epoch": 1.6926599669253277, "ewc_loss": 0.06102794036269188, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028801377629861236, "grad_norm": 7.3137288093566895, "learning_rate": 1e-06, "loss": 0.3788, "mean_token_accuracy": 0.8849159479141235, "num_tokens": 507561575.0, "step": 13306 }, { "epoch": 1.692787177203918, "ewc_loss": 0.06089366227388382, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002866709837689996, "grad_norm": 7.287840843200684, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8658612370491028, "num_tokens": 507597212.0, "step": 13307 }, { "epoch": 1.6929143874825086, "ewc_loss": 0.06093500554561615, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028708443278446794, "grad_norm": 7.272261619567871, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.866990327835083, "num_tokens": 507634251.0, "step": 13308 }, { "epoch": 1.693041597761099, "ewc_loss": 0.06043708324432373, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028698804089799523, "grad_norm": 7.298111438751221, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8698521852493286, "num_tokens": 507671345.0, "step": 13309 }, { "epoch": 1.6931688080396896, "ewc_loss": 0.06099255010485649, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002876598737202585, "grad_norm": 7.320554256439209, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.853034496307373, "num_tokens": 507709797.0, "step": 13310 }, { "epoch": 1.6932960183182801, "ewc_loss": 0.06045215576887131, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.00028713876963593066, "grad_norm": 7.303869247436523, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8641693592071533, "num_tokens": 507747047.0, "step": 13311 }, { "epoch": 1.6934232285968707, "ewc_loss": 0.06042154133319855, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002868326264433563, "grad_norm": 7.262057781219482, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8644555807113647, "num_tokens": 507782187.0, "step": 13312 }, { "epoch": 1.693550438875461, "ewc_loss": 0.06069980934262276, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002871738688554615, "grad_norm": 7.277604103088379, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8513898849487305, "num_tokens": 507826714.0, "step": 13313 }, { "epoch": 1.6936776491540515, "ewc_loss": 0.06049878150224686, "ewc_loss_diag": 3.170967102050781e-05, "ewc_loss_parallel": 0.0002876049838960171, "grad_norm": 7.188722610473633, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.85640549659729, "num_tokens": 507869423.0, "step": 13314 }, { "epoch": 1.693804859432642, "ewc_loss": 0.06117483600974083, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002894827339332551, "grad_norm": 7.368148326873779, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8743239641189575, "num_tokens": 507904711.0, "step": 13315 }, { "epoch": 1.6939320697112326, "ewc_loss": 0.06095730513334274, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028730742633342743, "grad_norm": 7.191291332244873, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8595660924911499, "num_tokens": 507947760.0, "step": 13316 }, { "epoch": 1.694059279989823, "ewc_loss": 0.0613776333630085, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029151071794331074, "grad_norm": 7.386653900146484, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.861433207988739, "num_tokens": 507984249.0, "step": 13317 }, { "epoch": 1.6941864902684136, "ewc_loss": 0.06093785911798477, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028711295453831553, "grad_norm": 7.253247261047363, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8614354729652405, "num_tokens": 508017206.0, "step": 13318 }, { "epoch": 1.6943137005470041, "ewc_loss": 0.06118784472346306, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002896128280553967, "grad_norm": 7.296935081481934, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8582854270935059, "num_tokens": 508056357.0, "step": 13319 }, { "epoch": 1.6944409108255947, "ewc_loss": 0.06132078170776367, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028850079979747534, "grad_norm": 7.255327224731445, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8560274839401245, "num_tokens": 508094870.0, "step": 13320 }, { "epoch": 1.6945681211041852, "ewc_loss": 0.061421290040016174, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002895058423746377, "grad_norm": 7.39056396484375, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8637326955795288, "num_tokens": 508120323.0, "step": 13321 }, { "epoch": 1.6946953313827757, "ewc_loss": 0.061006758362054825, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028780195862054825, "grad_norm": 7.219379425048828, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8560808897018433, "num_tokens": 508160018.0, "step": 13322 }, { "epoch": 1.6948225416613663, "ewc_loss": 0.06122715026140213, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029000587528571486, "grad_norm": 7.307408809661865, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8818660378456116, "num_tokens": 508200947.0, "step": 13323 }, { "epoch": 1.6949497519399568, "ewc_loss": 0.06119568273425102, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002872498007491231, "grad_norm": 7.189814567565918, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8504359722137451, "num_tokens": 508246457.0, "step": 13324 }, { "epoch": 1.6950769622185473, "ewc_loss": 0.061256274580955505, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002902971173170954, "grad_norm": 7.353557586669922, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8587960004806519, "num_tokens": 508281951.0, "step": 13325 }, { "epoch": 1.6952041724971378, "ewc_loss": 0.0609971359372139, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002877057413570583, "grad_norm": 7.257013320922852, "learning_rate": 1e-06, "loss": 0.5246, "mean_token_accuracy": 0.8447849750518799, "num_tokens": 508322571.0, "step": 13326 }, { "epoch": 1.6953313827757284, "ewc_loss": 0.061062686145305634, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002883612469304353, "grad_norm": 7.278380870819092, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8702785968780518, "num_tokens": 508357347.0, "step": 13327 }, { "epoch": 1.695458593054319, "ewc_loss": 0.06131218001246452, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028841476887464523, "grad_norm": 7.2008843421936035, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8662959337234497, "num_tokens": 508401813.0, "step": 13328 }, { "epoch": 1.6955858033329094, "ewc_loss": 0.061109233647584915, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028882670449092984, "grad_norm": 7.334125518798828, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8521791696548462, "num_tokens": 508440457.0, "step": 13329 }, { "epoch": 1.6957130136115, "ewc_loss": 0.061032600700855255, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002880603715311736, "grad_norm": 7.349431037902832, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8542330861091614, "num_tokens": 508472272.0, "step": 13330 }, { "epoch": 1.6958402238900905, "ewc_loss": 0.06098296493291855, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000287564005702734, "grad_norm": 7.260613918304443, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8762288093566895, "num_tokens": 508509352.0, "step": 13331 }, { "epoch": 1.6959674341686808, "ewc_loss": 0.06132950633764267, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002885880530811846, "grad_norm": 7.283261775970459, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8727476000785828, "num_tokens": 508552515.0, "step": 13332 }, { "epoch": 1.6960946444472713, "ewc_loss": 0.06121574714779854, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002874504425562918, "grad_norm": 7.261420249938965, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8760758638381958, "num_tokens": 508591399.0, "step": 13333 }, { "epoch": 1.6962218547258618, "ewc_loss": 0.061248309910297394, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028777605621144176, "grad_norm": 7.340877532958984, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.868472695350647, "num_tokens": 508630427.0, "step": 13334 }, { "epoch": 1.6963490650044524, "ewc_loss": 0.06119426339864731, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028723562718369067, "grad_norm": 7.2551960945129395, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8722178936004639, "num_tokens": 508667469.0, "step": 13335 }, { "epoch": 1.696476275283043, "ewc_loss": 0.061265356838703156, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028794651734642684, "grad_norm": 7.309770584106445, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8681759834289551, "num_tokens": 508706308.0, "step": 13336 }, { "epoch": 1.6966034855616334, "ewc_loss": 0.061193596571683884, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002872289333026856, "grad_norm": 7.299920082092285, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8522642254829407, "num_tokens": 508738968.0, "step": 13337 }, { "epoch": 1.6967306958402237, "ewc_loss": 0.061006657779216766, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002878009690903127, "grad_norm": 7.338736057281494, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8703010082244873, "num_tokens": 508769879.0, "step": 13338 }, { "epoch": 1.6968579061188143, "ewc_loss": 0.06109001487493515, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002861930988729, "grad_norm": 7.247802257537842, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8502026796340942, "num_tokens": 508812277.0, "step": 13339 }, { "epoch": 1.6969851163974048, "ewc_loss": 0.061226312071084976, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028755608946084976, "grad_norm": 7.311086177825928, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8543741106987, "num_tokens": 508849293.0, "step": 13340 }, { "epoch": 1.6971123266759953, "ewc_loss": 0.060851939022541046, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002862537803594023, "grad_norm": 7.258089065551758, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8653405904769897, "num_tokens": 508885798.0, "step": 13341 }, { "epoch": 1.6972395369545858, "ewc_loss": 0.06091243028640747, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028685867437161505, "grad_norm": 7.343141078948975, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.862144947052002, "num_tokens": 508920491.0, "step": 13342 }, { "epoch": 1.6973667472331764, "ewc_loss": 0.06080777943134308, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002858121879398823, "grad_norm": 7.25456428527832, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8632453680038452, "num_tokens": 508956161.0, "step": 13343 }, { "epoch": 1.697493957511767, "ewc_loss": 0.06121961027383804, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002874890633393079, "grad_norm": 7.324926376342773, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8602670431137085, "num_tokens": 508995696.0, "step": 13344 }, { "epoch": 1.6976211677903574, "ewc_loss": 0.06102489307522774, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002855419006664306, "grad_norm": 7.287991523742676, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8689768314361572, "num_tokens": 509029505.0, "step": 13345 }, { "epoch": 1.697748378068948, "ewc_loss": 0.06113481521606445, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028664112323895097, "grad_norm": 7.308065414428711, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8713104724884033, "num_tokens": 509062815.0, "step": 13346 }, { "epoch": 1.6978755883475385, "ewc_loss": 0.06084522604942322, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028618660871870816, "grad_norm": 7.261198997497559, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8708680868148804, "num_tokens": 509103376.0, "step": 13347 }, { "epoch": 1.698002798626129, "ewc_loss": 0.061153560876846313, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028682855190709233, "grad_norm": 7.3008599281311035, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8737274408340454, "num_tokens": 509142797.0, "step": 13348 }, { "epoch": 1.6981300089047195, "ewc_loss": 0.06104828044772148, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002857757790479809, "grad_norm": 7.335855007171631, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.853400468826294, "num_tokens": 509182724.0, "step": 13349 }, { "epoch": 1.69825721918331, "ewc_loss": 0.06093012914061546, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002845942508429289, "grad_norm": 7.268107891082764, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8723913431167603, "num_tokens": 509217856.0, "step": 13350 }, { "epoch": 1.6983844294619006, "ewc_loss": 0.06108228862285614, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028611585730686784, "grad_norm": 7.345384120941162, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8648355007171631, "num_tokens": 509251581.0, "step": 13351 }, { "epoch": 1.6985116397404911, "ewc_loss": 0.0609094500541687, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002843874681275338, "grad_norm": 7.210623741149902, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8808014392852783, "num_tokens": 509286687.0, "step": 13352 }, { "epoch": 1.6986388500190817, "ewc_loss": 0.06120511516928673, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028734412626363337, "grad_norm": 7.397454738616943, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8589639067649841, "num_tokens": 509323372.0, "step": 13353 }, { "epoch": 1.6987660602976722, "ewc_loss": 0.06089266389608383, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002842196263372898, "grad_norm": 7.2179460525512695, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8569832444190979, "num_tokens": 509362472.0, "step": 13354 }, { "epoch": 1.6988932705762627, "ewc_loss": 0.06122679263353348, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002875608915928751, "grad_norm": 7.347959518432617, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8650927543640137, "num_tokens": 509397676.0, "step": 13355 }, { "epoch": 1.699020480854853, "ewc_loss": 0.06099257618188858, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000285218731733039, "grad_norm": 7.253017425537109, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8682711124420166, "num_tokens": 509441100.0, "step": 13356 }, { "epoch": 1.6991476911334435, "ewc_loss": 0.06120828166604042, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002873757912311703, "grad_norm": 7.333029747009277, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8603264093399048, "num_tokens": 509485430.0, "step": 13357 }, { "epoch": 1.699274901412034, "ewc_loss": 0.06092468649148941, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002845398266799748, "grad_norm": 7.263896465301514, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.8348073959350586, "num_tokens": 509520102.0, "step": 13358 }, { "epoch": 1.6994021116906246, "ewc_loss": 0.061246760189533234, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002877605438698083, "grad_norm": 7.322545051574707, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8654229044914246, "num_tokens": 509556525.0, "step": 13359 }, { "epoch": 1.6995293219692151, "ewc_loss": 0.061081916093826294, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002861121320165694, "grad_norm": 7.292984962463379, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8561273217201233, "num_tokens": 509596899.0, "step": 13360 }, { "epoch": 1.6996565322478057, "ewc_loss": 0.061195578426122665, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028724875301122665, "grad_norm": 7.312035083770752, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8659588098526001, "num_tokens": 509632125.0, "step": 13361 }, { "epoch": 1.699783742526396, "ewc_loss": 0.06112877279520035, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028658067458309233, "grad_norm": 7.284724712371826, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8593942523002625, "num_tokens": 509668131.0, "step": 13362 }, { "epoch": 1.6999109528049865, "ewc_loss": 0.061119891703128815, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028649187879636884, "grad_norm": 7.222538948059082, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8779460191726685, "num_tokens": 509704078.0, "step": 13363 }, { "epoch": 1.700038163083577, "ewc_loss": 0.06131390482187271, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002884319983422756, "grad_norm": 7.304443836212158, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8766449689865112, "num_tokens": 509739464.0, "step": 13364 }, { "epoch": 1.7001653733621676, "ewc_loss": 0.06103283911943436, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002856213541235775, "grad_norm": 7.265032768249512, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8638108968734741, "num_tokens": 509777970.0, "step": 13365 }, { "epoch": 1.700292583640758, "ewc_loss": 0.061273135244846344, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002880243118852377, "grad_norm": 7.301909446716309, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8507175445556641, "num_tokens": 509817010.0, "step": 13366 }, { "epoch": 1.7004197939193486, "ewc_loss": 0.061144452542066574, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002867374860215932, "grad_norm": 7.272320747375488, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.869878888130188, "num_tokens": 509854192.0, "step": 13367 }, { "epoch": 1.7005470041979391, "ewc_loss": 0.06113622337579727, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028665518038906157, "grad_norm": 7.306951999664307, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8775991201400757, "num_tokens": 509888633.0, "step": 13368 }, { "epoch": 1.7006742144765297, "ewc_loss": 0.06121298670768738, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028742285212501884, "grad_norm": 7.2781877517700195, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8658638000488281, "num_tokens": 509928474.0, "step": 13369 }, { "epoch": 1.7008014247551202, "ewc_loss": 0.06118213012814522, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002871142642106861, "grad_norm": 7.367318630218506, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8565274477005005, "num_tokens": 509968875.0, "step": 13370 }, { "epoch": 1.7009286350337107, "ewc_loss": 0.06107452139258385, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002860381791833788, "grad_norm": 7.294760227203369, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8651230931282043, "num_tokens": 510006021.0, "step": 13371 }, { "epoch": 1.7010558453123013, "ewc_loss": 0.06114818900823593, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002867748844437301, "grad_norm": 7.275738716125488, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8692760467529297, "num_tokens": 510042111.0, "step": 13372 }, { "epoch": 1.7011830555908918, "ewc_loss": 0.06119115650653839, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002872045442927629, "grad_norm": 7.462790489196777, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8705159425735474, "num_tokens": 510074894.0, "step": 13373 }, { "epoch": 1.7013102658694823, "ewc_loss": 0.060961902141571045, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028491197736002505, "grad_norm": 7.250466346740723, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8563928008079529, "num_tokens": 510111516.0, "step": 13374 }, { "epoch": 1.7014374761480728, "ewc_loss": 0.061207365244627, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002873666235245764, "grad_norm": 7.349287986755371, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8671911954879761, "num_tokens": 510151549.0, "step": 13375 }, { "epoch": 1.7015646864266634, "ewc_loss": 0.0609482005238533, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028477495652623475, "grad_norm": 7.300040245056152, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8642150163650513, "num_tokens": 510189215.0, "step": 13376 }, { "epoch": 1.701691896705254, "ewc_loss": 0.06104275584220886, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000285720539977774, "grad_norm": 7.24432373046875, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.88448566198349, "num_tokens": 510231189.0, "step": 13377 }, { "epoch": 1.7018191069838444, "ewc_loss": 0.0610690675675869, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002859836386051029, "grad_norm": 7.292914390563965, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8687642216682434, "num_tokens": 510273956.0, "step": 13378 }, { "epoch": 1.701946317262435, "ewc_loss": 0.06106013432145119, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002858943189494312, "grad_norm": 7.2669830322265625, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.880479097366333, "num_tokens": 510315095.0, "step": 13379 }, { "epoch": 1.7020735275410255, "ewc_loss": 0.06107252091169357, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028601818485185504, "grad_norm": 7.337047576904297, "learning_rate": 1e-06, "loss": 0.5333, "mean_token_accuracy": 0.8429510593414307, "num_tokens": 510350753.0, "step": 13380 }, { "epoch": 1.7022007378196158, "ewc_loss": 0.06108997017145157, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028619266231544316, "grad_norm": 7.3066205978393555, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8698663711547852, "num_tokens": 510389257.0, "step": 13381 }, { "epoch": 1.7023279480982063, "ewc_loss": 0.06108497828245163, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028614274924620986, "grad_norm": 7.32426118850708, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8655213117599487, "num_tokens": 510428879.0, "step": 13382 }, { "epoch": 1.7024551583767968, "ewc_loss": 0.06120038032531738, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002872967452276498, "grad_norm": 7.356639862060547, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8658143281936646, "num_tokens": 510468409.0, "step": 13383 }, { "epoch": 1.7025823686553874, "ewc_loss": 0.0610758513212204, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000286051508737728, "grad_norm": 7.329854965209961, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8796045780181885, "num_tokens": 510503988.0, "step": 13384 }, { "epoch": 1.702709578933978, "ewc_loss": 0.06110497564077377, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002863427216652781, "grad_norm": 7.362427711486816, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8626094460487366, "num_tokens": 510539009.0, "step": 13385 }, { "epoch": 1.7028367892125684, "ewc_loss": 0.06106742471456528, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002859672240447253, "grad_norm": 7.312585353851318, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8791757822036743, "num_tokens": 510576177.0, "step": 13386 }, { "epoch": 1.7029639994911587, "ewc_loss": 0.06112678349018097, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028656082577072084, "grad_norm": 7.268035411834717, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.864166259765625, "num_tokens": 510622194.0, "step": 13387 }, { "epoch": 1.7030912097697493, "ewc_loss": 0.06115325912833214, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002868255542125553, "grad_norm": 7.331047534942627, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8822194933891296, "num_tokens": 510663564.0, "step": 13388 }, { "epoch": 1.7032184200483398, "ewc_loss": 0.0611809603869915, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002871025644708425, "grad_norm": 7.346027851104736, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8745807409286499, "num_tokens": 510702153.0, "step": 13389 }, { "epoch": 1.7033456303269303, "ewc_loss": 0.061110761016607285, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028640058008022606, "grad_norm": 7.299772262573242, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8606067895889282, "num_tokens": 510742812.0, "step": 13390 }, { "epoch": 1.7034728406055208, "ewc_loss": 0.06111665815114975, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002864595444407314, "grad_norm": 7.386280059814453, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.859382688999176, "num_tokens": 510783363.0, "step": 13391 }, { "epoch": 1.7036000508841114, "ewc_loss": 0.0610421746969223, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002857146901078522, "grad_norm": 7.309633255004883, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8500663638114929, "num_tokens": 510818362.0, "step": 13392 }, { "epoch": 1.703727261162702, "ewc_loss": 0.0611143633723259, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002864366106223315, "grad_norm": 7.327098846435547, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.875058114528656, "num_tokens": 510854732.0, "step": 13393 }, { "epoch": 1.7038544714412924, "ewc_loss": 0.060820966958999634, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028594405739568174, "grad_norm": 7.2595295906066895, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8575912714004517, "num_tokens": 510895107.0, "step": 13394 }, { "epoch": 1.703981681719883, "ewc_loss": 0.06089550256729126, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002866894064936787, "grad_norm": 7.363265037536621, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8611267805099487, "num_tokens": 510930737.0, "step": 13395 }, { "epoch": 1.7041088919984735, "ewc_loss": 0.061013564467430115, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000285428628558293, "grad_norm": 7.297537326812744, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.873161792755127, "num_tokens": 510964646.0, "step": 13396 }, { "epoch": 1.704236102277064, "ewc_loss": 0.060969866812229156, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002874330384656787, "grad_norm": 7.3273024559021, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8682092428207397, "num_tokens": 511000583.0, "step": 13397 }, { "epoch": 1.7043633125556545, "ewc_loss": 0.06105584651231766, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028585141990333796, "grad_norm": 7.295591354370117, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8672506213188171, "num_tokens": 511033729.0, "step": 13398 }, { "epoch": 1.704490522834245, "ewc_loss": 0.06091628968715668, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002868972660508007, "grad_norm": 7.2762556076049805, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.869795560836792, "num_tokens": 511074099.0, "step": 13399 }, { "epoch": 1.7046177331128356, "ewc_loss": 0.06093721091747284, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028710649348795414, "grad_norm": 7.3351593017578125, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8538957834243774, "num_tokens": 511112473.0, "step": 13400 }, { "epoch": 1.7047449433914261, "ewc_loss": 0.060918405652046204, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002869184536393732, "grad_norm": 7.280324935913086, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8648781776428223, "num_tokens": 511149839.0, "step": 13401 }, { "epoch": 1.7048721536700167, "ewc_loss": 0.06101016700267792, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002878360392060131, "grad_norm": 7.344217300415039, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8544139862060547, "num_tokens": 511183054.0, "step": 13402 }, { "epoch": 1.7049993639486072, "ewc_loss": 0.060969606041908264, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002874304191209376, "grad_norm": 7.320876121520996, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8690045475959778, "num_tokens": 511218498.0, "step": 13403 }, { "epoch": 1.7051265742271977, "ewc_loss": 0.06100437417626381, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002877781225834042, "grad_norm": 7.293961048126221, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8607592582702637, "num_tokens": 511259776.0, "step": 13404 }, { "epoch": 1.705253784505788, "ewc_loss": 0.061042312532663345, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002881574910134077, "grad_norm": 7.298207759857178, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.870904266834259, "num_tokens": 511297331.0, "step": 13405 }, { "epoch": 1.7053809947843785, "ewc_loss": 0.06102056801319122, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028794005629606545, "grad_norm": 7.3334736824035645, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8766050338745117, "num_tokens": 511331055.0, "step": 13406 }, { "epoch": 1.705508205062969, "ewc_loss": 0.0610068142414093, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002878025406971574, "grad_norm": 7.324558258056641, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8821402788162231, "num_tokens": 511368651.0, "step": 13407 }, { "epoch": 1.7056354153415596, "ewc_loss": 0.061071693897247314, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002884513232856989, "grad_norm": 7.343869686126709, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8692517280578613, "num_tokens": 511402163.0, "step": 13408 }, { "epoch": 1.7057626256201501, "ewc_loss": 0.06094738841056824, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002872082404792309, "grad_norm": 7.267786026000977, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8580321669578552, "num_tokens": 511443769.0, "step": 13409 }, { "epoch": 1.7058898358987407, "ewc_loss": 0.060963645577430725, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002873708144761622, "grad_norm": 7.315277099609375, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8619645833969116, "num_tokens": 511481038.0, "step": 13410 }, { "epoch": 1.706017046177331, "ewc_loss": 0.06090960651636124, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028683041455224156, "grad_norm": 7.289113998413086, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8672904372215271, "num_tokens": 511522940.0, "step": 13411 }, { "epoch": 1.7061442564559215, "ewc_loss": 0.06121615692973137, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002874545461963862, "grad_norm": 7.301420211791992, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8619810938835144, "num_tokens": 511562630.0, "step": 13412 }, { "epoch": 1.706271466734512, "ewc_loss": 0.060871683061122894, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002864511916413903, "grad_norm": 7.288844108581543, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8490312099456787, "num_tokens": 511598986.0, "step": 13413 }, { "epoch": 1.7063986770131025, "ewc_loss": 0.06099622696638107, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028769663185812533, "grad_norm": 7.304726600646973, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8670835494995117, "num_tokens": 511633582.0, "step": 13414 }, { "epoch": 1.706525887291693, "ewc_loss": 0.06098712235689163, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028760559507645667, "grad_norm": 7.28292989730835, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8645798563957214, "num_tokens": 511673296.0, "step": 13415 }, { "epoch": 1.7066530975702836, "ewc_loss": 0.06100369989871979, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002877713704947382, "grad_norm": 7.3432207107543945, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8566185832023621, "num_tokens": 511707890.0, "step": 13416 }, { "epoch": 1.7067803078488741, "ewc_loss": 0.06089560687541962, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002866904251277447, "grad_norm": 7.239394664764404, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8718858957290649, "num_tokens": 511741597.0, "step": 13417 }, { "epoch": 1.7069075181274647, "ewc_loss": 0.06107277795672417, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002884621499106288, "grad_norm": 7.306546688079834, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8684633374214172, "num_tokens": 511776816.0, "step": 13418 }, { "epoch": 1.7070347284060552, "ewc_loss": 0.06121065467596054, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028739951085299253, "grad_norm": 7.247128963470459, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8585189580917358, "num_tokens": 511814866.0, "step": 13419 }, { "epoch": 1.7071619386846457, "ewc_loss": 0.06110544130206108, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002887887821998447, "grad_norm": 7.267417907714844, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.865449070930481, "num_tokens": 511852458.0, "step": 13420 }, { "epoch": 1.7072891489632362, "ewc_loss": 0.061028335243463516, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028801773441955447, "grad_norm": 7.27466344833374, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8593571186065674, "num_tokens": 511891201.0, "step": 13421 }, { "epoch": 1.7074163592418268, "ewc_loss": 0.0610760860145092, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002884952409658581, "grad_norm": 7.258294105529785, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8645898103713989, "num_tokens": 511930862.0, "step": 13422 }, { "epoch": 1.7075435695204173, "ewc_loss": 0.06106813997030258, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028841575840488076, "grad_norm": 7.290258884429932, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.871853232383728, "num_tokens": 511967695.0, "step": 13423 }, { "epoch": 1.7076707797990078, "ewc_loss": 0.06099598854780197, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028769427444785833, "grad_norm": 7.217043399810791, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8790857195854187, "num_tokens": 512005976.0, "step": 13424 }, { "epoch": 1.7077979900775984, "ewc_loss": 0.06136976554989815, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028899061726406217, "grad_norm": 7.305638313293457, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8661704063415527, "num_tokens": 512041575.0, "step": 13425 }, { "epoch": 1.7079252003561889, "ewc_loss": 0.0609905868768692, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002876402286347002, "grad_norm": 7.257221221923828, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8601894378662109, "num_tokens": 512076171.0, "step": 13426 }, { "epoch": 1.7080524106347794, "ewc_loss": 0.06119685247540474, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028970290441066027, "grad_norm": 7.35533332824707, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8616343140602112, "num_tokens": 512112082.0, "step": 13427 }, { "epoch": 1.70817962091337, "ewc_loss": 0.06096084415912628, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002873428165912628, "grad_norm": 7.264300346374512, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8517876863479614, "num_tokens": 512149126.0, "step": 13428 }, { "epoch": 1.7083068311919605, "ewc_loss": 0.06120654195547104, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002897998201660812, "grad_norm": 7.309291362762451, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8659027218818665, "num_tokens": 512182368.0, "step": 13429 }, { "epoch": 1.7084340414705508, "ewc_loss": 0.06103862076997757, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028812058735638857, "grad_norm": 7.326984405517578, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8457780480384827, "num_tokens": 512218224.0, "step": 13430 }, { "epoch": 1.7085612517491413, "ewc_loss": 0.06106111407279968, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002883455017581582, "grad_norm": 7.307675361633301, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8637495040893555, "num_tokens": 512255523.0, "step": 13431 }, { "epoch": 1.7086884620277318, "ewc_loss": 0.061089564114809036, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028863002080470324, "grad_norm": 7.282347202301025, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8751314282417297, "num_tokens": 512290514.0, "step": 13432 }, { "epoch": 1.7088156723063224, "ewc_loss": 0.06115129590034485, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000289247342152521, "grad_norm": 7.302458763122559, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.874413251876831, "num_tokens": 512328468.0, "step": 13433 }, { "epoch": 1.708942882584913, "ewc_loss": 0.061031244695186615, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028804680914618075, "grad_norm": 7.316524982452393, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8624683618545532, "num_tokens": 512359786.0, "step": 13434 }, { "epoch": 1.7090700928635034, "ewc_loss": 0.06108517944812775, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028858616133220494, "grad_norm": 7.319879531860352, "learning_rate": 1e-06, "loss": 0.5399, "mean_token_accuracy": 0.843185544013977, "num_tokens": 512395679.0, "step": 13435 }, { "epoch": 1.7091973031420937, "ewc_loss": 0.06097406521439552, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028747503529302776, "grad_norm": 7.331230640411377, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8623138666152954, "num_tokens": 512433491.0, "step": 13436 }, { "epoch": 1.7093245134206843, "ewc_loss": 0.06103777140378952, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002881120890378952, "grad_norm": 7.327608585357666, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8582897186279297, "num_tokens": 512470215.0, "step": 13437 }, { "epoch": 1.7094517236992748, "ewc_loss": 0.060961492359638214, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028734930674545467, "grad_norm": 7.280502796173096, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8600598573684692, "num_tokens": 512507055.0, "step": 13438 }, { "epoch": 1.7095789339778653, "ewc_loss": 0.06127780303359032, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002880709944292903, "grad_norm": 7.317554950714111, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8472237586975098, "num_tokens": 512547877.0, "step": 13439 }, { "epoch": 1.7097061442564558, "ewc_loss": 0.06098010763525963, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028753545484505594, "grad_norm": 7.263465881347656, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8508851528167725, "num_tokens": 512586558.0, "step": 13440 }, { "epoch": 1.7098333545350464, "ewc_loss": 0.061391159892082214, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002892045595217496, "grad_norm": 7.255486965179443, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8618630170822144, "num_tokens": 512631181.0, "step": 13441 }, { "epoch": 1.709960564813637, "ewc_loss": 0.06123754382133484, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002876684011425823, "grad_norm": 7.3118157386779785, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.874266505241394, "num_tokens": 512667925.0, "step": 13442 }, { "epoch": 1.7100877750922274, "ewc_loss": 0.06100763380527496, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002878107188735157, "grad_norm": 7.281551837921143, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8465436100959778, "num_tokens": 512703136.0, "step": 13443 }, { "epoch": 1.710214985370818, "ewc_loss": 0.061060965061187744, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028834404656663537, "grad_norm": 7.301828861236572, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8661190867424011, "num_tokens": 512739001.0, "step": 13444 }, { "epoch": 1.7103421956494085, "ewc_loss": 0.06123320013284683, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028762497822754085, "grad_norm": 7.221932888031006, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8686211705207825, "num_tokens": 512780517.0, "step": 13445 }, { "epoch": 1.710469405927999, "ewc_loss": 0.06138984113931656, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002891913754865527, "grad_norm": 7.318631649017334, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8718754053115845, "num_tokens": 512815964.0, "step": 13446 }, { "epoch": 1.7105966162065895, "ewc_loss": 0.06131480261683464, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028844099142588675, "grad_norm": 7.317425727844238, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8669908046722412, "num_tokens": 512855451.0, "step": 13447 }, { "epoch": 1.71072382648518, "ewc_loss": 0.06131860241293907, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028847900102846324, "grad_norm": 7.281742095947266, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8680403828620911, "num_tokens": 512895809.0, "step": 13448 }, { "epoch": 1.7108510367637706, "ewc_loss": 0.06131473556160927, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028844032203778625, "grad_norm": 7.259953022003174, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.869181215763092, "num_tokens": 512936424.0, "step": 13449 }, { "epoch": 1.7109782470423611, "ewc_loss": 0.06133054941892624, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028859847225248814, "grad_norm": 7.294027328491211, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8788875341415405, "num_tokens": 512976486.0, "step": 13450 }, { "epoch": 1.7111054573209517, "ewc_loss": 0.0612930990755558, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002882239641621709, "grad_norm": 7.331977844238281, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8547474145889282, "num_tokens": 513014131.0, "step": 13451 }, { "epoch": 1.7112326675995422, "ewc_loss": 0.06108436733484268, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002885780413635075, "grad_norm": 7.26508903503418, "learning_rate": 1e-06, "loss": 0.5457, "mean_token_accuracy": 0.8361576199531555, "num_tokens": 513054706.0, "step": 13452 }, { "epoch": 1.7113598778781327, "ewc_loss": 0.061344899237155914, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002887419832404703, "grad_norm": 7.271014213562012, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8698539733886719, "num_tokens": 513094089.0, "step": 13453 }, { "epoch": 1.711487088156723, "ewc_loss": 0.06099241226911545, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000287658505840227, "grad_norm": 7.27095890045166, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8617219924926758, "num_tokens": 513136306.0, "step": 13454 }, { "epoch": 1.7116142984353135, "ewc_loss": 0.06138342246413231, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002891272015403956, "grad_norm": 7.286575794219971, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8595218658447266, "num_tokens": 513174376.0, "step": 13455 }, { "epoch": 1.711741508713904, "ewc_loss": 0.06133382394909859, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028863121406175196, "grad_norm": 7.315367221832275, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8673586845397949, "num_tokens": 513211834.0, "step": 13456 }, { "epoch": 1.7118687189924946, "ewc_loss": 0.061272032558918, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028801331063732505, "grad_norm": 7.328435897827148, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8560522794723511, "num_tokens": 513241765.0, "step": 13457 }, { "epoch": 1.7119959292710851, "ewc_loss": 0.06129110977053642, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002882040571421385, "grad_norm": 7.271387577056885, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8511770367622375, "num_tokens": 513283510.0, "step": 13458 }, { "epoch": 1.7121231395496757, "ewc_loss": 0.06141413748264313, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028943433426320553, "grad_norm": 7.321680068969727, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8627321720123291, "num_tokens": 513320746.0, "step": 13459 }, { "epoch": 1.712250349828266, "ewc_loss": 0.06105007603764534, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028823514003306627, "grad_norm": 7.295613765716553, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8601798415184021, "num_tokens": 513358036.0, "step": 13460 }, { "epoch": 1.7123775601068565, "ewc_loss": 0.06109742075204849, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028870857204310596, "grad_norm": 7.258105754852295, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8700319528579712, "num_tokens": 513395156.0, "step": 13461 }, { "epoch": 1.712504770385447, "ewc_loss": 0.06134086474776268, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002887016162276268, "grad_norm": 7.338996887207031, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8594958782196045, "num_tokens": 513438631.0, "step": 13462 }, { "epoch": 1.7126319806640375, "ewc_loss": 0.06127649545669556, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002880579268094152, "grad_norm": 7.402590274810791, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8543536067008972, "num_tokens": 513471225.0, "step": 13463 }, { "epoch": 1.712759190942628, "ewc_loss": 0.06121031939983368, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028739616391249, "grad_norm": 7.235865592956543, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.855194091796875, "num_tokens": 513513824.0, "step": 13464 }, { "epoch": 1.7128864012212186, "ewc_loss": 0.06119946017861366, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002897289814427495, "grad_norm": 7.39619779586792, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8568390011787415, "num_tokens": 513556263.0, "step": 13465 }, { "epoch": 1.7130136114998091, "ewc_loss": 0.06097327172756195, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002874671190511435, "grad_norm": 7.263927459716797, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8679153323173523, "num_tokens": 513595093.0, "step": 13466 }, { "epoch": 1.7131408217783997, "ewc_loss": 0.06149926036596298, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002902855630964041, "grad_norm": 7.350464344024658, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8555570840835571, "num_tokens": 513633761.0, "step": 13467 }, { "epoch": 1.7132680320569902, "ewc_loss": 0.061225466430187225, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002875476493500173, "grad_norm": 7.273807048797607, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8596903681755066, "num_tokens": 513671404.0, "step": 13468 }, { "epoch": 1.7133952423355807, "ewc_loss": 0.061362601816654205, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002889189636334777, "grad_norm": 7.353390693664551, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8614649772644043, "num_tokens": 513713752.0, "step": 13469 }, { "epoch": 1.7135224526141712, "ewc_loss": 0.06121712177991867, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028746420866809785, "grad_norm": 7.2833251953125, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8597719073295593, "num_tokens": 513755261.0, "step": 13470 }, { "epoch": 1.7136496628927618, "ewc_loss": 0.0613572932779789, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028886590735055506, "grad_norm": 7.335740566253662, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8688331246376038, "num_tokens": 513799042.0, "step": 13471 }, { "epoch": 1.7137768731713523, "ewc_loss": 0.061218246817588806, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002874754136428237, "grad_norm": 7.28480339050293, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8532847166061401, "num_tokens": 513838148.0, "step": 13472 }, { "epoch": 1.7139040834499428, "ewc_loss": 0.061091870069503784, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002886531001422554, "grad_norm": 7.2916975021362305, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.870772659778595, "num_tokens": 513878960.0, "step": 13473 }, { "epoch": 1.7140312937285334, "ewc_loss": 0.061033397912979126, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028806834598071873, "grad_norm": 7.354861736297607, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8655704855918884, "num_tokens": 513914083.0, "step": 13474 }, { "epoch": 1.7141585040071239, "ewc_loss": 0.06098029762506485, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002875373756978661, "grad_norm": 7.242037296295166, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8523521423339844, "num_tokens": 513951591.0, "step": 13475 }, { "epoch": 1.7142857142857144, "ewc_loss": 0.06116567179560661, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002893910859711468, "grad_norm": 7.358387470245361, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8494588732719421, "num_tokens": 513986127.0, "step": 13476 }, { "epoch": 1.714412924564305, "ewc_loss": 0.06105078384280205, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028824221226386726, "grad_norm": 7.293911457061768, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8576499819755554, "num_tokens": 514028248.0, "step": 13477 }, { "epoch": 1.7145401348428955, "ewc_loss": 0.06120048463344574, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028973922599107027, "grad_norm": 7.373472213745117, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8658994436264038, "num_tokens": 514062757.0, "step": 13478 }, { "epoch": 1.7146673451214858, "ewc_loss": 0.06108027696609497, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028853712137788534, "grad_norm": 7.280885219573975, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8507078886032104, "num_tokens": 514102635.0, "step": 13479 }, { "epoch": 1.7147945554000763, "ewc_loss": 0.061175212264060974, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028948651743121445, "grad_norm": 7.321349143981934, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.862642765045166, "num_tokens": 514140883.0, "step": 13480 }, { "epoch": 1.7149217656786668, "ewc_loss": 0.061032384634017944, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002880582178477198, "grad_norm": 7.313507556915283, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8730642795562744, "num_tokens": 514176536.0, "step": 13481 }, { "epoch": 1.7150489759572574, "ewc_loss": 0.061443619430065155, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028972915606573224, "grad_norm": 7.375000953674316, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.857600212097168, "num_tokens": 514209969.0, "step": 13482 }, { "epoch": 1.7151761862358479, "ewc_loss": 0.06128793954849243, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002881723630707711, "grad_norm": 7.257737159729004, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8628195524215698, "num_tokens": 514249679.0, "step": 13483 }, { "epoch": 1.7153033965144384, "ewc_loss": 0.06147488206624985, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002900417894124985, "grad_norm": 7.3372955322265625, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8572292327880859, "num_tokens": 514292719.0, "step": 13484 }, { "epoch": 1.7154306067930287, "ewc_loss": 0.061321988701820374, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002885128778871149, "grad_norm": 7.36322021484375, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8578764796257019, "num_tokens": 514323396.0, "step": 13485 }, { "epoch": 1.7155578170716193, "ewc_loss": 0.06142493337392807, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002895423094742, "grad_norm": 7.287905693054199, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8702146410942078, "num_tokens": 514360951.0, "step": 13486 }, { "epoch": 1.7156850273502098, "ewc_loss": 0.06119256094098091, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002896599762607366, "grad_norm": 7.3801164627075195, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8485249280929565, "num_tokens": 514403879.0, "step": 13487 }, { "epoch": 1.7158122376288003, "ewc_loss": 0.06105619668960571, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002882963453885168, "grad_norm": 7.329421043395996, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8710465431213379, "num_tokens": 514446628.0, "step": 13488 }, { "epoch": 1.7159394479073908, "ewc_loss": 0.061108849942684174, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028882289188914, "grad_norm": 7.408246994018555, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8637657761573792, "num_tokens": 514486132.0, "step": 13489 }, { "epoch": 1.7160666581859814, "ewc_loss": 0.060918278992176056, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002869171730708331, "grad_norm": 7.326083183288574, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8515766859054565, "num_tokens": 514526384.0, "step": 13490 }, { "epoch": 1.716193868464572, "ewc_loss": 0.061053626239299774, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002882706467062235, "grad_norm": 7.425249099731445, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8609030246734619, "num_tokens": 514565828.0, "step": 13491 }, { "epoch": 1.7163210787431624, "ewc_loss": 0.061149854212999344, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002867915027309209, "grad_norm": 7.313704490661621, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8592255115509033, "num_tokens": 514605260.0, "step": 13492 }, { "epoch": 1.716448289021753, "ewc_loss": 0.06097429618239403, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028747733449563384, "grad_norm": 7.327338695526123, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8652360439300537, "num_tokens": 514643433.0, "step": 13493 }, { "epoch": 1.7165754993003435, "ewc_loss": 0.060831449925899506, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028604886028915644, "grad_norm": 7.3070068359375, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8662365674972534, "num_tokens": 514683264.0, "step": 13494 }, { "epoch": 1.716702709578934, "ewc_loss": 0.06099042296409607, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028763862792402506, "grad_norm": 7.344240665435791, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8717662692070007, "num_tokens": 514719775.0, "step": 13495 }, { "epoch": 1.7168299198575245, "ewc_loss": 0.061108969151973724, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002863826521206647, "grad_norm": 7.331209659576416, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8553872108459473, "num_tokens": 514766287.0, "step": 13496 }, { "epoch": 1.716957130136115, "ewc_loss": 0.06096532940864563, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002873876946978271, "grad_norm": 7.3462934494018555, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8730884790420532, "num_tokens": 514806725.0, "step": 13497 }, { "epoch": 1.7170843404147056, "ewc_loss": 0.06097117066383362, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002874460769817233, "grad_norm": 7.412190914154053, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8676691055297852, "num_tokens": 514844008.0, "step": 13498 }, { "epoch": 1.7172115506932961, "ewc_loss": 0.061023686081171036, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028552982257679105, "grad_norm": 7.4065046310424805, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8510589003562927, "num_tokens": 514880356.0, "step": 13499 }, { "epoch": 1.7173387609718866, "ewc_loss": 0.06084191054105759, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002861534885596484, "grad_norm": 7.3346099853515625, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8752216100692749, "num_tokens": 514915309.0, "step": 13500 }, { "epoch": 1.7174659712504772, "ewc_loss": 0.061057813465595245, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028587112319655716, "grad_norm": 7.5528950691223145, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8573166131973267, "num_tokens": 514956496.0, "step": 13501 }, { "epoch": 1.7175931815290677, "ewc_loss": 0.06081010773777962, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028339403797872365, "grad_norm": 7.221602916717529, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8635202646255493, "num_tokens": 514992247.0, "step": 13502 }, { "epoch": 1.717720391807658, "ewc_loss": 0.06140446290373802, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028933759313076735, "grad_norm": 7.423125267028809, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.855549693107605, "num_tokens": 515035443.0, "step": 13503 }, { "epoch": 1.7178476020862485, "ewc_loss": 0.060852814465761185, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028382110758684576, "grad_norm": 7.225528717041016, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.868010938167572, "num_tokens": 515072279.0, "step": 13504 }, { "epoch": 1.717974812364839, "ewc_loss": 0.06134909391403198, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028878392186015844, "grad_norm": 7.40714168548584, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8615727424621582, "num_tokens": 515112547.0, "step": 13505 }, { "epoch": 1.7181020226434296, "ewc_loss": 0.06098523363471031, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002851453027687967, "grad_norm": 7.2573347091674805, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8509873151779175, "num_tokens": 515146814.0, "step": 13506 }, { "epoch": 1.7182292329220201, "ewc_loss": 0.061288006603717804, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028817306156270206, "grad_norm": 7.299337863922119, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8658413290977478, "num_tokens": 515184393.0, "step": 13507 }, { "epoch": 1.7183564432006107, "ewc_loss": 0.061260003596544266, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002878929954022169, "grad_norm": 7.3221116065979, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.856635332107544, "num_tokens": 515225312.0, "step": 13508 }, { "epoch": 1.718483653479201, "ewc_loss": 0.061252839863300323, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002878213708754629, "grad_norm": 7.306222915649414, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.8411532044410706, "num_tokens": 515261330.0, "step": 13509 }, { "epoch": 1.7186108637577915, "ewc_loss": 0.06110954284667969, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028882978949695826, "grad_norm": 7.312424182891846, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8693544864654541, "num_tokens": 515299325.0, "step": 13510 }, { "epoch": 1.718738074036382, "ewc_loss": 0.06135771423578262, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002888701274059713, "grad_norm": 7.292529582977295, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8634117245674133, "num_tokens": 515335213.0, "step": 13511 }, { "epoch": 1.7188652843149725, "ewc_loss": 0.06148209422826767, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002901139378082007, "grad_norm": 7.379639148712158, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8743563890457153, "num_tokens": 515365699.0, "step": 13512 }, { "epoch": 1.718992494593563, "ewc_loss": 0.06100516393780708, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028778600972145796, "grad_norm": 7.308277130126953, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8717509508132935, "num_tokens": 515396025.0, "step": 13513 }, { "epoch": 1.7191197048721536, "ewc_loss": 0.06145312637090683, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002898242382798344, "grad_norm": 7.34650182723999, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8651151657104492, "num_tokens": 515429724.0, "step": 13514 }, { "epoch": 1.7192469151507441, "ewc_loss": 0.06109717860817909, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028870615642517805, "grad_norm": 7.24697208404541, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8595223426818848, "num_tokens": 515465894.0, "step": 13515 }, { "epoch": 1.7193741254293347, "ewc_loss": 0.06122349947690964, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028996934997849166, "grad_norm": 7.290233612060547, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8827241063117981, "num_tokens": 515498894.0, "step": 13516 }, { "epoch": 1.7195013357079252, "ewc_loss": 0.06109781190752983, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002887125010602176, "grad_norm": 7.260688304901123, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8633524179458618, "num_tokens": 515538783.0, "step": 13517 }, { "epoch": 1.7196285459865157, "ewc_loss": 0.0614849254488945, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029014222673140466, "grad_norm": 7.353505611419678, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8623000979423523, "num_tokens": 515573107.0, "step": 13518 }, { "epoch": 1.7197557562651062, "ewc_loss": 0.061115704476833344, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002888914314098656, "grad_norm": 7.201897144317627, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.865433394908905, "num_tokens": 515614901.0, "step": 13519 }, { "epoch": 1.7198829665436968, "ewc_loss": 0.061664141714572906, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029193441150709987, "grad_norm": 7.352088928222656, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8668874502182007, "num_tokens": 515654197.0, "step": 13520 }, { "epoch": 1.7200101768222873, "ewc_loss": 0.061048999428749084, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002882243425119668, "grad_norm": 7.181276321411133, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8822037577629089, "num_tokens": 515697003.0, "step": 13521 }, { "epoch": 1.7201373871008778, "ewc_loss": 0.06138567626476288, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002915911318268627, "grad_norm": 7.355207920074463, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8594632148742676, "num_tokens": 515737801.0, "step": 13522 }, { "epoch": 1.7202645973794684, "ewc_loss": 0.061136722564697266, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002891015901695937, "grad_norm": 7.30805778503418, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.8557858467102051, "num_tokens": 515774711.0, "step": 13523 }, { "epoch": 1.7203918076580589, "ewc_loss": 0.0615798644721508, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029109162278473377, "grad_norm": 7.3252153396606445, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8561878204345703, "num_tokens": 515811728.0, "step": 13524 }, { "epoch": 1.7205190179366494, "ewc_loss": 0.06102357804775238, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002879701496567577, "grad_norm": 7.270373821258545, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8692688941955566, "num_tokens": 515844425.0, "step": 13525 }, { "epoch": 1.72064622821524, "ewc_loss": 0.06122322380542755, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028996661421842873, "grad_norm": 7.313298225402832, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8651230931282043, "num_tokens": 515882439.0, "step": 13526 }, { "epoch": 1.7207734384938305, "ewc_loss": 0.061076320707798004, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002884975983761251, "grad_norm": 7.301759243011475, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8419337272644043, "num_tokens": 515922168.0, "step": 13527 }, { "epoch": 1.7209006487724208, "ewc_loss": 0.06113401800394058, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028907458181492984, "grad_norm": 7.3013153076171875, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8714892268180847, "num_tokens": 515961840.0, "step": 13528 }, { "epoch": 1.7210278590510113, "ewc_loss": 0.06104518473148346, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028818624559789896, "grad_norm": 7.278011322021484, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8630204200744629, "num_tokens": 515997586.0, "step": 13529 }, { "epoch": 1.7211550693296018, "ewc_loss": 0.06108138710260391, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002885482390411198, "grad_norm": 7.344799041748047, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.869958221912384, "num_tokens": 516036660.0, "step": 13530 }, { "epoch": 1.7212822796081924, "ewc_loss": 0.06104322522878647, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028816660051234066, "grad_norm": 7.323256492614746, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8566006422042847, "num_tokens": 516070846.0, "step": 13531 }, { "epoch": 1.7214094898867829, "ewc_loss": 0.06104489415884018, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002881833352148533, "grad_norm": 7.303760528564453, "learning_rate": 1e-06, "loss": 0.4006, "mean_token_accuracy": 0.8857243061065674, "num_tokens": 516107027.0, "step": 13532 }, { "epoch": 1.7215367001653734, "ewc_loss": 0.06109236925840378, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002886580768972635, "grad_norm": 7.321378231048584, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8649619817733765, "num_tokens": 516143971.0, "step": 13533 }, { "epoch": 1.7216639104439637, "ewc_loss": 0.06108605116605759, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028859489248134196, "grad_norm": 7.323092937469482, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.865938663482666, "num_tokens": 516180617.0, "step": 13534 }, { "epoch": 1.7217911207225542, "ewc_loss": 0.06098388507962227, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002875732316169888, "grad_norm": 7.252807140350342, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8642657399177551, "num_tokens": 516220371.0, "step": 13535 }, { "epoch": 1.7219183310011448, "ewc_loss": 0.06104182451963425, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028815263067372143, "grad_norm": 7.2881011962890625, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.865562915802002, "num_tokens": 516258642.0, "step": 13536 }, { "epoch": 1.7220455412797353, "ewc_loss": 0.06095258891582489, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002872602781280875, "grad_norm": 7.290619850158691, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8576526641845703, "num_tokens": 516298971.0, "step": 13537 }, { "epoch": 1.7221727515583258, "ewc_loss": 0.06094329431653023, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002871673204936087, "grad_norm": 7.3286519050598145, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8758343458175659, "num_tokens": 516334457.0, "step": 13538 }, { "epoch": 1.7222999618369164, "ewc_loss": 0.06098989024758339, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000287633272819221, "grad_norm": 7.250253677368164, "learning_rate": 1e-06, "loss": 0.3963, "mean_token_accuracy": 0.8894046545028687, "num_tokens": 516374585.0, "step": 13539 }, { "epoch": 1.7224271721155069, "ewc_loss": 0.06113552302122116, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002890895993914455, "grad_norm": 7.429477214813232, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8501196503639221, "num_tokens": 516414180.0, "step": 13540 }, { "epoch": 1.7225543823940974, "ewc_loss": 0.06114867329597473, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002867796865757555, "grad_norm": 7.228051662445068, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8712781667709351, "num_tokens": 516450189.0, "step": 13541 }, { "epoch": 1.722681592672688, "ewc_loss": 0.06153854355216026, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029067840659990907, "grad_norm": 7.494897842407227, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8620564937591553, "num_tokens": 516487549.0, "step": 13542 }, { "epoch": 1.7228088029512785, "ewc_loss": 0.06102658435702324, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028555880999192595, "grad_norm": 7.190620422363281, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8500373959541321, "num_tokens": 516526649.0, "step": 13543 }, { "epoch": 1.722936013229869, "ewc_loss": 0.06157848611474037, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002910778275690973, "grad_norm": 7.478734493255615, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8688803911209106, "num_tokens": 516563829.0, "step": 13544 }, { "epoch": 1.7230632235084595, "ewc_loss": 0.060858942568302155, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002863238041754812, "grad_norm": 7.277653217315674, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8468548655509949, "num_tokens": 516605179.0, "step": 13545 }, { "epoch": 1.72319043378705, "ewc_loss": 0.06113777309656143, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028911209665238857, "grad_norm": 7.3257012367248535, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8628188371658325, "num_tokens": 516641576.0, "step": 13546 }, { "epoch": 1.7233176440656406, "ewc_loss": 0.061032503843307495, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028805944020859897, "grad_norm": 7.310680389404297, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8611609935760498, "num_tokens": 516678687.0, "step": 13547 }, { "epoch": 1.7234448543442311, "ewc_loss": 0.06128188967704773, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000288111885311082, "grad_norm": 7.337364196777344, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8585318922996521, "num_tokens": 516711484.0, "step": 13548 }, { "epoch": 1.7235720646228216, "ewc_loss": 0.0612458735704422, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028775169630534947, "grad_norm": 7.278206825256348, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8590050339698792, "num_tokens": 516750598.0, "step": 13549 }, { "epoch": 1.7236992749014122, "ewc_loss": 0.06129712611436844, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000288264243863523, "grad_norm": 7.255016803741455, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8710731863975525, "num_tokens": 516793438.0, "step": 13550 }, { "epoch": 1.7238264851800027, "ewc_loss": 0.06134159490466118, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028870892128907144, "grad_norm": 7.375568866729736, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8781461119651794, "num_tokens": 516828692.0, "step": 13551 }, { "epoch": 1.723953695458593, "ewc_loss": 0.06121154502034187, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002874084166251123, "grad_norm": 7.29348087310791, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8650498390197754, "num_tokens": 516867592.0, "step": 13552 }, { "epoch": 1.7240809057371835, "ewc_loss": 0.061315394937992096, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002884469286072999, "grad_norm": 7.350523471832275, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8518761992454529, "num_tokens": 516907668.0, "step": 13553 }, { "epoch": 1.724208116015774, "ewc_loss": 0.06114380061626434, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028673099586740136, "grad_norm": 7.27064323425293, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8604992628097534, "num_tokens": 516944628.0, "step": 13554 }, { "epoch": 1.7243353262943646, "ewc_loss": 0.0613638237118721, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028893121634609997, "grad_norm": 7.292425632476807, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8577735424041748, "num_tokens": 516989027.0, "step": 13555 }, { "epoch": 1.7244625365729551, "ewc_loss": 0.06132690981030464, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028856206336058676, "grad_norm": 7.361532688140869, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8589217066764832, "num_tokens": 517024464.0, "step": 13556 }, { "epoch": 1.7245897468515456, "ewc_loss": 0.06122060865163803, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028749904595315456, "grad_norm": 7.298605918884277, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8680669069290161, "num_tokens": 517067714.0, "step": 13557 }, { "epoch": 1.724716957130136, "ewc_loss": 0.061290815472602844, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028820111765526235, "grad_norm": 7.270328521728516, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8669105768203735, "num_tokens": 517107564.0, "step": 13558 }, { "epoch": 1.7248441674087265, "ewc_loss": 0.061245258897542953, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002877455553971231, "grad_norm": 7.296024799346924, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8682793378829956, "num_tokens": 517148240.0, "step": 13559 }, { "epoch": 1.724971377687317, "ewc_loss": 0.06123749166727066, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028766790637746453, "grad_norm": 7.2845282554626465, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8580343723297119, "num_tokens": 517187841.0, "step": 13560 }, { "epoch": 1.7250985879659075, "ewc_loss": 0.06105542555451393, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028828863287344575, "grad_norm": 7.3361053466796875, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8479056358337402, "num_tokens": 517225732.0, "step": 13561 }, { "epoch": 1.725225798244498, "ewc_loss": 0.060999296605587006, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002877273363992572, "grad_norm": 7.36258602142334, "learning_rate": 1e-06, "loss": 0.5207, "mean_token_accuracy": 0.8475881218910217, "num_tokens": 517260423.0, "step": 13562 }, { "epoch": 1.7253530085230886, "ewc_loss": 0.061026014387607574, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028799450956285, "grad_norm": 7.303779125213623, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.851026713848114, "num_tokens": 517300862.0, "step": 13563 }, { "epoch": 1.7254802188016791, "ewc_loss": 0.061002328991889954, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002877576625905931, "grad_norm": 7.327731609344482, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8683347702026367, "num_tokens": 517341010.0, "step": 13564 }, { "epoch": 1.7256074290802697, "ewc_loss": 0.06129986792802811, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002882916305679828, "grad_norm": 7.317924976348877, "learning_rate": 1e-06, "loss": 0.5134, "mean_token_accuracy": 0.8464095592498779, "num_tokens": 517376853.0, "step": 13565 }, { "epoch": 1.7257346393588602, "ewc_loss": 0.06104370206594467, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028817137354053557, "grad_norm": 7.273364543914795, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8572031855583191, "num_tokens": 517421854.0, "step": 13566 }, { "epoch": 1.7258618496374507, "ewc_loss": 0.06130503863096237, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028834337717853487, "grad_norm": 7.287759304046631, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8572791814804077, "num_tokens": 517461711.0, "step": 13567 }, { "epoch": 1.7259890599160412, "ewc_loss": 0.061074983328580856, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000288484210614115, "grad_norm": 7.303958892822266, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8697625994682312, "num_tokens": 517494755.0, "step": 13568 }, { "epoch": 1.7261162701946318, "ewc_loss": 0.06118486076593399, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028958296752534807, "grad_norm": 7.327765941619873, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8552451133728027, "num_tokens": 517539061.0, "step": 13569 }, { "epoch": 1.7262434804732223, "ewc_loss": 0.061176154762506485, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000289495917968452, "grad_norm": 7.324031829833984, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8595736026763916, "num_tokens": 517579768.0, "step": 13570 }, { "epoch": 1.7263706907518128, "ewc_loss": 0.06111301854252815, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028886456857435405, "grad_norm": 7.357038497924805, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8750303983688354, "num_tokens": 517612006.0, "step": 13571 }, { "epoch": 1.7264979010304033, "ewc_loss": 0.06126999855041504, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002879929670598358, "grad_norm": 7.321537494659424, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8633977174758911, "num_tokens": 517647547.0, "step": 13572 }, { "epoch": 1.7266251113089939, "ewc_loss": 0.061145149171352386, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002891858748625964, "grad_norm": 7.313661575317383, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8541261553764343, "num_tokens": 517689067.0, "step": 13573 }, { "epoch": 1.7267523215875844, "ewc_loss": 0.06108815595507622, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002886159345507622, "grad_norm": 7.253820419311523, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8684122562408447, "num_tokens": 517729967.0, "step": 13574 }, { "epoch": 1.726879531866175, "ewc_loss": 0.0611729696393013, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002894640783779323, "grad_norm": 7.330832481384277, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.862546443939209, "num_tokens": 517770427.0, "step": 13575 }, { "epoch": 1.7270067421447655, "ewc_loss": 0.06114601716399193, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002891945478040725, "grad_norm": 7.362057685852051, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.860623836517334, "num_tokens": 517804202.0, "step": 13576 }, { "epoch": 1.7271339524233558, "ewc_loss": 0.061173856258392334, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002894729550462216, "grad_norm": 7.354180335998535, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.856387734413147, "num_tokens": 517834825.0, "step": 13577 }, { "epoch": 1.7272611627019463, "ewc_loss": 0.061102159321308136, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002887559530790895, "grad_norm": 7.36010217666626, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.855424702167511, "num_tokens": 517874600.0, "step": 13578 }, { "epoch": 1.7273883729805368, "ewc_loss": 0.06115496903657913, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002892840711865574, "grad_norm": 7.318081378936768, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8539196252822876, "num_tokens": 517913488.0, "step": 13579 }, { "epoch": 1.7275155832591274, "ewc_loss": 0.06120465323328972, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002897809026762843, "grad_norm": 7.316935062408447, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8767209053039551, "num_tokens": 517951094.0, "step": 13580 }, { "epoch": 1.7276427935377179, "ewc_loss": 0.06107018142938614, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002884361892938614, "grad_norm": 7.275141716003418, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8521565198898315, "num_tokens": 517993811.0, "step": 13581 }, { "epoch": 1.7277700038163084, "ewc_loss": 0.06147916242480278, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029008460114710033, "grad_norm": 7.31476354598999, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8652480840682983, "num_tokens": 518032860.0, "step": 13582 }, { "epoch": 1.7278972140948987, "ewc_loss": 0.061359621584415436, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028888916131109, "grad_norm": 7.303670883178711, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8582435250282288, "num_tokens": 518070256.0, "step": 13583 }, { "epoch": 1.7280244243734892, "ewc_loss": 0.0612189806997776, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002899241808336228, "grad_norm": 7.359549045562744, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8702001571655273, "num_tokens": 518102828.0, "step": 13584 }, { "epoch": 1.7281516346520798, "ewc_loss": 0.06137680262327194, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028906099032610655, "grad_norm": 7.283166885375977, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8754655122756958, "num_tokens": 518146859.0, "step": 13585 }, { "epoch": 1.7282788449306703, "ewc_loss": 0.06143464148044586, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002896393707487732, "grad_norm": 7.307209491729736, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8697950839996338, "num_tokens": 518186173.0, "step": 13586 }, { "epoch": 1.7284060552092608, "ewc_loss": 0.06136322021484375, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028892516274936497, "grad_norm": 7.3213419914245605, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8467774987220764, "num_tokens": 518221903.0, "step": 13587 }, { "epoch": 1.7285332654878514, "ewc_loss": 0.0613282211124897, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028857518918812275, "grad_norm": 7.362353801727295, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8529790043830872, "num_tokens": 518258048.0, "step": 13588 }, { "epoch": 1.7286604757664419, "ewc_loss": 0.06132432073354721, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002885361900553107, "grad_norm": 7.282212734222412, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8548145294189453, "num_tokens": 518299127.0, "step": 13589 }, { "epoch": 1.7287876860450324, "ewc_loss": 0.06147366017103195, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002900295949075371, "grad_norm": 7.324398040771484, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.873846173286438, "num_tokens": 518336520.0, "step": 13590 }, { "epoch": 1.728914896323623, "ewc_loss": 0.06144072860479355, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002897002559620887, "grad_norm": 7.370919704437256, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8588657379150391, "num_tokens": 518368484.0, "step": 13591 }, { "epoch": 1.7290421066022135, "ewc_loss": 0.06110941618680954, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002888285380322486, "grad_norm": 7.3146185874938965, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8600736856460571, "num_tokens": 518408439.0, "step": 13592 }, { "epoch": 1.729169316880804, "ewc_loss": 0.06154762953519821, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029076923965476453, "grad_norm": 7.416725158691406, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8427969217300415, "num_tokens": 518447414.0, "step": 13593 }, { "epoch": 1.7292965271593945, "ewc_loss": 0.061302848160266876, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028832146199420094, "grad_norm": 7.287397861480713, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8579612970352173, "num_tokens": 518486455.0, "step": 13594 }, { "epoch": 1.729423737437985, "ewc_loss": 0.06157810240983963, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029107401496730745, "grad_norm": 7.391583442687988, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8793741464614868, "num_tokens": 518527023.0, "step": 13595 }, { "epoch": 1.7295509477165756, "ewc_loss": 0.06137821823358536, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000289075163891539, "grad_norm": 7.377726078033447, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8562116622924805, "num_tokens": 518563285.0, "step": 13596 }, { "epoch": 1.729678157995166, "ewc_loss": 0.06115160882472992, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002892504562623799, "grad_norm": 7.312707424163818, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8634021282196045, "num_tokens": 518607610.0, "step": 13597 }, { "epoch": 1.7298053682737566, "ewc_loss": 0.06133535876870155, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028864655178040266, "grad_norm": 7.25248908996582, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8720935583114624, "num_tokens": 518649434.0, "step": 13598 }, { "epoch": 1.7299325785523472, "ewc_loss": 0.06139305979013443, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002892235934268683, "grad_norm": 7.340902805328369, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8692420721054077, "num_tokens": 518684508.0, "step": 13599 }, { "epoch": 1.7300597888309377, "ewc_loss": 0.06140437722206116, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002893367491196841, "grad_norm": 7.3386664390563965, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8687616586685181, "num_tokens": 518717883.0, "step": 13600 }, { "epoch": 1.730186999109528, "ewc_loss": 0.06119316816329956, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002896660298574716, "grad_norm": 7.325819969177246, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8683475852012634, "num_tokens": 518756158.0, "step": 13601 }, { "epoch": 1.7303142093881185, "ewc_loss": 0.061223287135362625, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002899672545026988, "grad_norm": 7.393439769744873, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8630387783050537, "num_tokens": 518793003.0, "step": 13602 }, { "epoch": 1.730441419666709, "ewc_loss": 0.061102040112018585, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002887547598220408, "grad_norm": 7.325704574584961, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8566117882728577, "num_tokens": 518838485.0, "step": 13603 }, { "epoch": 1.7305686299452996, "ewc_loss": 0.061375997960567474, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002890529576689005, "grad_norm": 7.277580261230469, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8564983606338501, "num_tokens": 518885936.0, "step": 13604 }, { "epoch": 1.7306958402238901, "ewc_loss": 0.06144072115421295, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028970016865059733, "grad_norm": 7.34782600402832, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8502233624458313, "num_tokens": 518925684.0, "step": 13605 }, { "epoch": 1.7308230505024806, "ewc_loss": 0.06143156439065933, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002896086371038109, "grad_norm": 7.363945007324219, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8645387887954712, "num_tokens": 518966438.0, "step": 13606 }, { "epoch": 1.730950260781071, "ewc_loss": 0.061491817235946655, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002902111446019262, "grad_norm": 7.32258415222168, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8651025891304016, "num_tokens": 519013576.0, "step": 13607 }, { "epoch": 1.7310774710596615, "ewc_loss": 0.06148315966129303, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002901245607063174, "grad_norm": 7.451717376708984, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8599754571914673, "num_tokens": 519045342.0, "step": 13608 }, { "epoch": 1.731204681338252, "ewc_loss": 0.06133832037448883, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002886761794798076, "grad_norm": 7.272568702697754, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8485870361328125, "num_tokens": 519088566.0, "step": 13609 }, { "epoch": 1.7313318916168425, "ewc_loss": 0.061545733362436295, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002907502930611372, "grad_norm": 7.379349708557129, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8657891750335693, "num_tokens": 519126516.0, "step": 13610 }, { "epoch": 1.731459101895433, "ewc_loss": 0.06144345924258232, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002897275553550571, "grad_norm": 7.364077091217041, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8724381923675537, "num_tokens": 519165431.0, "step": 13611 }, { "epoch": 1.7315863121740236, "ewc_loss": 0.06150670349597931, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029035998159088194, "grad_norm": 7.405166149139404, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8787438869476318, "num_tokens": 519200991.0, "step": 13612 }, { "epoch": 1.7317135224526141, "ewc_loss": 0.06136758252978325, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002889687893912196, "grad_norm": 7.259355545043945, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8694980144500732, "num_tokens": 519245988.0, "step": 13613 }, { "epoch": 1.7318407327312046, "ewc_loss": 0.061529532074928284, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029058827203698456, "grad_norm": 7.390383720397949, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8675656914710999, "num_tokens": 519282595.0, "step": 13614 }, { "epoch": 1.7319679430097952, "ewc_loss": 0.061391137540340424, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028920432669110596, "grad_norm": 7.343466281890869, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8555041551589966, "num_tokens": 519319061.0, "step": 13615 }, { "epoch": 1.7320951532883857, "ewc_loss": 0.06139804795384407, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002892734482884407, "grad_norm": 7.3165507316589355, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8600971698760986, "num_tokens": 519358617.0, "step": 13616 }, { "epoch": 1.7322223635669762, "ewc_loss": 0.06145165115594864, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002898094826377928, "grad_norm": 7.374007701873779, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8615123629570007, "num_tokens": 519392461.0, "step": 13617 }, { "epoch": 1.7323495738455668, "ewc_loss": 0.06140565127134323, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028934949659742415, "grad_norm": 7.271383285522461, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8704046607017517, "num_tokens": 519432893.0, "step": 13618 }, { "epoch": 1.7324767841241573, "ewc_loss": 0.06161918491125107, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002914848446380347, "grad_norm": 7.389542102813721, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8624122142791748, "num_tokens": 519473445.0, "step": 13619 }, { "epoch": 1.7326039944027478, "ewc_loss": 0.061344366520643234, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028873662813566625, "grad_norm": 7.260220527648926, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8644197583198547, "num_tokens": 519518883.0, "step": 13620 }, { "epoch": 1.7327312046813383, "ewc_loss": 0.06168035417795181, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002920965198427439, "grad_norm": 10.357406616210938, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.867074728012085, "num_tokens": 519550123.0, "step": 13621 }, { "epoch": 1.7328584149599289, "ewc_loss": 0.06432902812957764, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00031858327565714717, "grad_norm": 7.538443088531494, "learning_rate": 1e-06, "loss": 0.5085, "mean_token_accuracy": 0.8545109033584595, "num_tokens": 519592538.0, "step": 13622 }, { "epoch": 1.7329856252385194, "ewc_loss": 0.062464307993650436, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002999360440298915, "grad_norm": 7.587440490722656, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8630852699279785, "num_tokens": 519633466.0, "step": 13623 }, { "epoch": 1.73311283551711, "ewc_loss": 0.06172524392604828, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002925453882198781, "grad_norm": 7.38218355178833, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8588649034500122, "num_tokens": 519672569.0, "step": 13624 }, { "epoch": 1.7332400457957005, "ewc_loss": 0.0627330094575882, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003026230842806399, "grad_norm": 7.546308517456055, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8693351149559021, "num_tokens": 519707383.0, "step": 13625 }, { "epoch": 1.7333672560742908, "ewc_loss": 0.06176327168941498, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029292565886862576, "grad_norm": 7.455781936645508, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.845720112323761, "num_tokens": 519748187.0, "step": 13626 }, { "epoch": 1.7334944663528813, "ewc_loss": 0.06210710108280182, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002963639562949538, "grad_norm": 7.5045247077941895, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8607407808303833, "num_tokens": 519790215.0, "step": 13627 }, { "epoch": 1.7336216766314718, "ewc_loss": 0.061652082949876785, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029181380523368716, "grad_norm": 7.397324562072754, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8540116548538208, "num_tokens": 519827278.0, "step": 13628 }, { "epoch": 1.7337488869100623, "ewc_loss": 0.06182485073804855, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029354149592109025, "grad_norm": 7.406417369842529, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8699722290039062, "num_tokens": 519860216.0, "step": 13629 }, { "epoch": 1.7338760971886529, "ewc_loss": 0.06160149723291397, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002913079224526882, "grad_norm": 7.4203386306762695, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8625666499137878, "num_tokens": 519894442.0, "step": 13630 }, { "epoch": 1.7340033074672434, "ewc_loss": 0.06168080121278763, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029210097272880375, "grad_norm": 7.40283203125, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8624066114425659, "num_tokens": 519932524.0, "step": 13631 }, { "epoch": 1.7341305177458337, "ewc_loss": 0.06159980222582817, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002912909840233624, "grad_norm": 7.355703353881836, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8617809414863586, "num_tokens": 519974366.0, "step": 13632 }, { "epoch": 1.7342577280244242, "ewc_loss": 0.06166650727391243, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029195804381743073, "grad_norm": 7.3620734214782715, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8723841905593872, "num_tokens": 520013983.0, "step": 13633 }, { "epoch": 1.7343849383030148, "ewc_loss": 0.0616542249917984, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029183519654907286, "grad_norm": 10.389992713928223, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8652030825614929, "num_tokens": 520047671.0, "step": 13634 }, { "epoch": 1.7345121485816053, "ewc_loss": 0.06415562331676483, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003168492403347045, "grad_norm": 7.516151428222656, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8586536645889282, "num_tokens": 520086574.0, "step": 13635 }, { "epoch": 1.7346393588601958, "ewc_loss": 0.062419772148132324, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029949069721624255, "grad_norm": 7.6130805015563965, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.848871648311615, "num_tokens": 520130613.0, "step": 13636 }, { "epoch": 1.7347665691387864, "ewc_loss": 0.06173919886350632, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002926849410869181, "grad_norm": 7.407482624053955, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8815128207206726, "num_tokens": 520163763.0, "step": 13637 }, { "epoch": 1.7348937794173769, "ewc_loss": 0.0626576617360115, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030186958611011505, "grad_norm": 7.6016387939453125, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8603788614273071, "num_tokens": 520198663.0, "step": 13638 }, { "epoch": 1.7350209896959674, "ewc_loss": 0.06172271817922592, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029252012609504163, "grad_norm": 7.458858013153076, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8657448291778564, "num_tokens": 520231737.0, "step": 13639 }, { "epoch": 1.735148199974558, "ewc_loss": 0.06199450418353081, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029523801640607417, "grad_norm": 7.401294231414795, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8679647445678711, "num_tokens": 520275664.0, "step": 13640 }, { "epoch": 1.7352754102531485, "ewc_loss": 0.0617879182100296, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002931721683125943, "grad_norm": 7.391831874847412, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8518872857093811, "num_tokens": 520316100.0, "step": 13641 }, { "epoch": 1.735402620531739, "ewc_loss": 0.061576999723911285, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002935043885372579, "grad_norm": 7.474338531494141, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8593389987945557, "num_tokens": 520348167.0, "step": 13642 }, { "epoch": 1.7355298308103295, "ewc_loss": 0.06144361570477486, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029217053088359535, "grad_norm": 7.433635234832764, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8496009707450867, "num_tokens": 520385234.0, "step": 13643 }, { "epoch": 1.73565704108892, "ewc_loss": 0.06149175018072128, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029265187913551927, "grad_norm": 7.397328853607178, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8465988636016846, "num_tokens": 520423051.0, "step": 13644 }, { "epoch": 1.7357842513675106, "ewc_loss": 0.06167180836200714, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002920110709965229, "grad_norm": 7.393471717834473, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8539610505104065, "num_tokens": 520467202.0, "step": 13645 }, { "epoch": 1.735911461646101, "ewc_loss": 0.06136130541563034, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000291347416350618, "grad_norm": 7.375516891479492, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8609341382980347, "num_tokens": 520501434.0, "step": 13646 }, { "epoch": 1.7360386719246916, "ewc_loss": 0.06173127889633179, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029260574956424534, "grad_norm": 7.370323181152344, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8587267398834229, "num_tokens": 520542757.0, "step": 13647 }, { "epoch": 1.7361658822032822, "ewc_loss": 0.061411645263433456, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029185082530602813, "grad_norm": 7.366142272949219, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8526260256767273, "num_tokens": 520585216.0, "step": 13648 }, { "epoch": 1.7362930924818727, "ewc_loss": 0.061542607843875885, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029071903554722667, "grad_norm": 7.417879581451416, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.863525390625, "num_tokens": 520620066.0, "step": 13649 }, { "epoch": 1.736420302760463, "ewc_loss": 0.06161933019757271, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002914862707257271, "grad_norm": 7.371571063995361, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8693103194236755, "num_tokens": 520660917.0, "step": 13650 }, { "epoch": 1.7365475130390535, "ewc_loss": 0.06122174859046936, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002899518876802176, "grad_norm": 7.285267353057861, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8543423414230347, "num_tokens": 520704255.0, "step": 13651 }, { "epoch": 1.736674723317644, "ewc_loss": 0.06146456301212311, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029237999115139246, "grad_norm": 7.354920387268066, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8498880863189697, "num_tokens": 520743036.0, "step": 13652 }, { "epoch": 1.7368019335962346, "ewc_loss": 0.06157520413398743, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002910449984483421, "grad_norm": 7.313405513763428, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8555477857589722, "num_tokens": 520778325.0, "step": 13653 }, { "epoch": 1.736929143874825, "ewc_loss": 0.06172047182917595, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002924976870417595, "grad_norm": 7.351318836212158, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8752473592758179, "num_tokens": 520815226.0, "step": 13654 }, { "epoch": 1.7370563541534156, "ewc_loss": 0.061740271747112274, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002926957095041871, "grad_norm": 7.352727890014648, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8428088426589966, "num_tokens": 520856606.0, "step": 13655 }, { "epoch": 1.737183564432006, "ewc_loss": 0.061638034880161285, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002916733210440725, "grad_norm": 7.3789238929748535, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8531482815742493, "num_tokens": 520893493.0, "step": 13656 }, { "epoch": 1.7373107747105965, "ewc_loss": 0.06174267455935478, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002927197201643139, "grad_norm": 7.396920680999756, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8697023987770081, "num_tokens": 520930391.0, "step": 13657 }, { "epoch": 1.737437984989187, "ewc_loss": 0.06167687103152275, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029206168255768716, "grad_norm": 7.34638786315918, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8623396158218384, "num_tokens": 520970512.0, "step": 13658 }, { "epoch": 1.7375651952677775, "ewc_loss": 0.06177155673503876, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029300854657776654, "grad_norm": 7.3715362548828125, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8544274568557739, "num_tokens": 521011787.0, "step": 13659 }, { "epoch": 1.737692405546368, "ewc_loss": 0.0616268664598465, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029156162054277956, "grad_norm": 7.3645758628845215, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8648127317428589, "num_tokens": 521053494.0, "step": 13660 }, { "epoch": 1.7378196158249586, "ewc_loss": 0.06176323443651199, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002929253096226603, "grad_norm": 7.408079147338867, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8508315086364746, "num_tokens": 521089291.0, "step": 13661 }, { "epoch": 1.7379468261035491, "ewc_loss": 0.06136423721909523, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002913767530117184, "grad_norm": 7.321345329284668, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.874562680721283, "num_tokens": 521128910.0, "step": 13662 }, { "epoch": 1.7380740363821396, "ewc_loss": 0.06144874170422554, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029222178272902966, "grad_norm": 7.427145481109619, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8577972650527954, "num_tokens": 521168175.0, "step": 13663 }, { "epoch": 1.7382012466607302, "ewc_loss": 0.061507053673267365, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029036353225819767, "grad_norm": 7.353131294250488, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8593553304672241, "num_tokens": 521208020.0, "step": 13664 }, { "epoch": 1.7383284569393207, "ewc_loss": 0.06142788007855415, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002920131664723158, "grad_norm": 7.414761543273926, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8741352558135986, "num_tokens": 521249828.0, "step": 13665 }, { "epoch": 1.7384556672179112, "ewc_loss": 0.06123416870832443, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029007604462094605, "grad_norm": 7.3264994621276855, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8652898669242859, "num_tokens": 521289940.0, "step": 13666 }, { "epoch": 1.7385828774965018, "ewc_loss": 0.06140834838151932, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002918178797699511, "grad_norm": 7.368823528289795, "learning_rate": 1e-06, "loss": 0.4058, "mean_token_accuracy": 0.8804560899734497, "num_tokens": 521327426.0, "step": 13667 }, { "epoch": 1.7387100877750923, "ewc_loss": 0.0613124705851078, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029085908317938447, "grad_norm": 7.363397121429443, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8528900146484375, "num_tokens": 521372112.0, "step": 13668 }, { "epoch": 1.7388372980536828, "ewc_loss": 0.06127258017659187, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002904601860791445, "grad_norm": 7.4038777351379395, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8618523478507996, "num_tokens": 521410842.0, "step": 13669 }, { "epoch": 1.7389645083322733, "ewc_loss": 0.0615578256547451, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029087121947668493, "grad_norm": 7.340371131896973, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8721617460250854, "num_tokens": 521446854.0, "step": 13670 }, { "epoch": 1.7390917186108639, "ewc_loss": 0.061616696417331696, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029145993175916374, "grad_norm": 7.3939313888549805, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8527035713195801, "num_tokens": 521486284.0, "step": 13671 }, { "epoch": 1.7392189288894544, "ewc_loss": 0.0613132119178772, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002908664755523205, "grad_norm": 7.404362678527832, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8494614362716675, "num_tokens": 521522716.0, "step": 13672 }, { "epoch": 1.739346139168045, "ewc_loss": 0.0612676665186882, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029041102970950305, "grad_norm": 7.40651273727417, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8567956686019897, "num_tokens": 521551549.0, "step": 13673 }, { "epoch": 1.7394733494466355, "ewc_loss": 0.061581265181303024, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029110562172718346, "grad_norm": 7.372894287109375, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8654899001121521, "num_tokens": 521589552.0, "step": 13674 }, { "epoch": 1.7396005597252258, "ewc_loss": 0.06154055893421173, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002906985755544156, "grad_norm": 7.379032611846924, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8742365837097168, "num_tokens": 521627906.0, "step": 13675 }, { "epoch": 1.7397277700038163, "ewc_loss": 0.06138304993510246, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002915648801717907, "grad_norm": 7.386409759521484, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.8459031581878662, "num_tokens": 521666615.0, "step": 13676 }, { "epoch": 1.7398549802824068, "ewc_loss": 0.061510901898145676, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000290401978418231, "grad_norm": 7.345206260681152, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8597614169120789, "num_tokens": 521707742.0, "step": 13677 }, { "epoch": 1.7399821905609973, "ewc_loss": 0.06154022365808487, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029069522861391306, "grad_norm": 7.403441905975342, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8515018820762634, "num_tokens": 521746848.0, "step": 13678 }, { "epoch": 1.7401094008395879, "ewc_loss": 0.06153842434287071, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029067721334286034, "grad_norm": 7.336165428161621, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8636727333068848, "num_tokens": 521782917.0, "step": 13679 }, { "epoch": 1.7402366111181784, "ewc_loss": 0.06140979379415512, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002918322861660272, "grad_norm": 7.357309818267822, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8600344657897949, "num_tokens": 521825609.0, "step": 13680 }, { "epoch": 1.7403638213967687, "ewc_loss": 0.06127326190471649, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029046699637547135, "grad_norm": 7.2999982833862305, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.875374972820282, "num_tokens": 521862277.0, "step": 13681 }, { "epoch": 1.7404910316753592, "ewc_loss": 0.061668239533901215, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029197538970038295, "grad_norm": 7.426333427429199, "learning_rate": 1e-06, "loss": 0.3995, "mean_token_accuracy": 0.879436194896698, "num_tokens": 521894951.0, "step": 13682 }, { "epoch": 1.7406182419539498, "ewc_loss": 0.061230652034282684, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029004091629758477, "grad_norm": 7.290009498596191, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8614980578422546, "num_tokens": 521932158.0, "step": 13683 }, { "epoch": 1.7407454522325403, "ewc_loss": 0.06177244335412979, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029301742324605584, "grad_norm": 7.374746799468994, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8676835894584656, "num_tokens": 521971050.0, "step": 13684 }, { "epoch": 1.7408726625111308, "ewc_loss": 0.0615171380341053, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002904643479268998, "grad_norm": 7.333168029785156, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8727273941040039, "num_tokens": 522007617.0, "step": 13685 }, { "epoch": 1.7409998727897213, "ewc_loss": 0.061689868569374084, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002921916311606765, "grad_norm": 7.347393035888672, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8508729934692383, "num_tokens": 522053484.0, "step": 13686 }, { "epoch": 1.7411270830683119, "ewc_loss": 0.06133342534303665, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002910686016548425, "grad_norm": 7.3949995040893555, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8577917814254761, "num_tokens": 522088047.0, "step": 13687 }, { "epoch": 1.7412542933469024, "ewc_loss": 0.0616411417722702, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002917044039350003, "grad_norm": 7.392971992492676, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8620604276657104, "num_tokens": 522129865.0, "step": 13688 }, { "epoch": 1.741381503625493, "ewc_loss": 0.061305075883865356, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002907851303461939, "grad_norm": 7.308682918548584, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8586084842681885, "num_tokens": 522170154.0, "step": 13689 }, { "epoch": 1.7415087139040835, "ewc_loss": 0.06164328008890152, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000291725795250386, "grad_norm": 7.377719879150391, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.8864409923553467, "num_tokens": 522204586.0, "step": 13690 }, { "epoch": 1.741635924182674, "ewc_loss": 0.06125904619693756, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002903248241636902, "grad_norm": 7.338220596313477, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8666073083877563, "num_tokens": 522241942.0, "step": 13691 }, { "epoch": 1.7417631344612645, "ewc_loss": 0.06165866553783417, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002918796380981803, "grad_norm": 7.382857799530029, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8581002950668335, "num_tokens": 522275800.0, "step": 13692 }, { "epoch": 1.741890344739855, "ewc_loss": 0.0615815706551075, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002911086776293814, "grad_norm": 7.343028545379639, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8566733598709106, "num_tokens": 522317024.0, "step": 13693 }, { "epoch": 1.7420175550184456, "ewc_loss": 0.06156935542821884, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002909865288529545, "grad_norm": 7.307774066925049, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8724852800369263, "num_tokens": 522358510.0, "step": 13694 }, { "epoch": 1.742144765297036, "ewc_loss": 0.06141440197825432, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029187838663347065, "grad_norm": 7.403949737548828, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.8547849059104919, "num_tokens": 522395960.0, "step": 13695 }, { "epoch": 1.7422719755756266, "ewc_loss": 0.06167275086045265, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029202047153376043, "grad_norm": 7.717425346374512, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8708481192588806, "num_tokens": 522428807.0, "step": 13696 }, { "epoch": 1.7423991858542172, "ewc_loss": 0.061105769127607346, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028879207093268633, "grad_norm": 7.249766826629639, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8628296852111816, "num_tokens": 522464472.0, "step": 13697 }, { "epoch": 1.7425263961328077, "ewc_loss": 0.06159626320004463, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002936969976872206, "grad_norm": 7.36979341506958, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8757668733596802, "num_tokens": 522509522.0, "step": 13698 }, { "epoch": 1.742653606411398, "ewc_loss": 0.06120111048221588, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00028974548331461847, "grad_norm": 7.356904983520508, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8476547598838806, "num_tokens": 522545581.0, "step": 13699 }, { "epoch": 1.7427808166899885, "ewc_loss": 0.06156338378787041, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029336821171455085, "grad_norm": 7.377882957458496, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8658257722854614, "num_tokens": 522582072.0, "step": 13700 }, { "epoch": 1.742908026968579, "ewc_loss": 0.061356499791145325, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002912993950303644, "grad_norm": 7.306204319000244, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8660105466842651, "num_tokens": 522618183.0, "step": 13701 }, { "epoch": 1.7430352372471696, "ewc_loss": 0.06176610291004181, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002929540059994906, "grad_norm": 7.354010105133057, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8520700931549072, "num_tokens": 522660578.0, "step": 13702 }, { "epoch": 1.74316244752576, "ewc_loss": 0.06160522997379303, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029378666658885777, "grad_norm": 7.362882614135742, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8739935159683228, "num_tokens": 522701934.0, "step": 13703 }, { "epoch": 1.7432896578043506, "ewc_loss": 0.06181401386857033, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029343311325646937, "grad_norm": 7.352210998535156, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8808815479278564, "num_tokens": 522737404.0, "step": 13704 }, { "epoch": 1.743416868082941, "ewc_loss": 0.06152990832924843, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002930334594566375, "grad_norm": 7.3541460037231445, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8582100868225098, "num_tokens": 522774188.0, "step": 13705 }, { "epoch": 1.7435440783615315, "ewc_loss": 0.06185067445039749, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029379973420873284, "grad_norm": 7.400469779968262, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8722168207168579, "num_tokens": 522810467.0, "step": 13706 }, { "epoch": 1.743671288640122, "ewc_loss": 0.061715610325336456, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029244908364489675, "grad_norm": 7.372313499450684, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8515641689300537, "num_tokens": 522845246.0, "step": 13707 }, { "epoch": 1.7437984989187125, "ewc_loss": 0.06156120076775551, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002933463838417083, "grad_norm": 7.398142337799072, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8730740547180176, "num_tokens": 522877150.0, "step": 13708 }, { "epoch": 1.743925709197303, "ewc_loss": 0.06140819936990738, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002918163954745978, "grad_norm": 7.302803039550781, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8771642446517944, "num_tokens": 522913161.0, "step": 13709 }, { "epoch": 1.7440529194758936, "ewc_loss": 0.06169843673706055, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029471871675923467, "grad_norm": 7.383458137512207, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.849287748336792, "num_tokens": 522953464.0, "step": 13710 }, { "epoch": 1.744180129754484, "ewc_loss": 0.061644017696380615, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029173315851949155, "grad_norm": 7.295688152313232, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8674285411834717, "num_tokens": 522996874.0, "step": 13711 }, { "epoch": 1.7443073400330746, "ewc_loss": 0.06194336712360382, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029472666210494936, "grad_norm": 7.447246074676514, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8645514845848083, "num_tokens": 523031349.0, "step": 13712 }, { "epoch": 1.7444345503116652, "ewc_loss": 0.0616651251912117, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002919442194979638, "grad_norm": 7.308714866638184, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8550951480865479, "num_tokens": 523068910.0, "step": 13713 }, { "epoch": 1.7445617605902557, "ewc_loss": 0.06163957715034485, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002941301208920777, "grad_norm": 7.35361385345459, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8770800232887268, "num_tokens": 523106179.0, "step": 13714 }, { "epoch": 1.7446889708688462, "ewc_loss": 0.06181516498327255, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002934446092694998, "grad_norm": 7.396414279937744, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8655648827552795, "num_tokens": 523140827.0, "step": 13715 }, { "epoch": 1.7448161811474368, "ewc_loss": 0.06148865073919296, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029262088355608284, "grad_norm": 7.35850715637207, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8770272135734558, "num_tokens": 523182880.0, "step": 13716 }, { "epoch": 1.7449433914260273, "ewc_loss": 0.06161869317293167, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002939213009085506, "grad_norm": 7.42009162902832, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8667086362838745, "num_tokens": 523213059.0, "step": 13717 }, { "epoch": 1.7450706017046178, "ewc_loss": 0.06161690503358841, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029146199813112617, "grad_norm": 7.297195911407471, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8618980646133423, "num_tokens": 523254398.0, "step": 13718 }, { "epoch": 1.7451978119832083, "ewc_loss": 0.06192769855260849, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029456993797793984, "grad_norm": 7.4067535400390625, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8757502436637878, "num_tokens": 523293325.0, "step": 13719 }, { "epoch": 1.7453250222617989, "ewc_loss": 0.06137470155954361, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029148138128221035, "grad_norm": 7.288448333740234, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8527826070785522, "num_tokens": 523334402.0, "step": 13720 }, { "epoch": 1.7454522325403894, "ewc_loss": 0.06176186352968216, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002953530347440392, "grad_norm": 7.48377799987793, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8673831224441528, "num_tokens": 523376238.0, "step": 13721 }, { "epoch": 1.74557944281898, "ewc_loss": 0.06136740371584892, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002914084179792553, "grad_norm": 7.3296685218811035, "learning_rate": 1e-06, "loss": 0.3927, "mean_token_accuracy": 0.8858100175857544, "num_tokens": 523417074.0, "step": 13722 }, { "epoch": 1.7457066530975704, "ewc_loss": 0.061875708401203156, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002940500562544912, "grad_norm": 7.3499603271484375, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8704079389572144, "num_tokens": 523457984.0, "step": 13723 }, { "epoch": 1.7458338633761608, "ewc_loss": 0.06142695993185043, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029200396966189146, "grad_norm": 7.433507442474365, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8589887619018555, "num_tokens": 523490071.0, "step": 13724 }, { "epoch": 1.7459610736547513, "ewc_loss": 0.06140076369047165, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002917420060839504, "grad_norm": 7.339315414428711, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.857254147529602, "num_tokens": 523530105.0, "step": 13725 }, { "epoch": 1.7460882839333418, "ewc_loss": 0.061578720808029175, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029352158890105784, "grad_norm": 7.330592155456543, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8767898082733154, "num_tokens": 523576698.0, "step": 13726 }, { "epoch": 1.7462154942119323, "ewc_loss": 0.06164991483092308, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002917921228799969, "grad_norm": 7.452016353607178, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8640908002853394, "num_tokens": 523614387.0, "step": 13727 }, { "epoch": 1.7463427044905229, "ewc_loss": 0.06173190474510193, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029261200688779354, "grad_norm": 7.450037956237793, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8589979410171509, "num_tokens": 523651149.0, "step": 13728 }, { "epoch": 1.7464699147691134, "ewc_loss": 0.06136350333690643, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002913694188464433, "grad_norm": 7.329832553863525, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8762410879135132, "num_tokens": 523688448.0, "step": 13729 }, { "epoch": 1.7465971250477037, "ewc_loss": 0.06181315332651138, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029342452762648463, "grad_norm": 7.435767650604248, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8671693205833435, "num_tokens": 523725805.0, "step": 13730 }, { "epoch": 1.7467243353262942, "ewc_loss": 0.061219923198223114, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002899336104746908, "grad_norm": 7.306204319000244, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8643412590026855, "num_tokens": 523763564.0, "step": 13731 }, { "epoch": 1.7468515456048848, "ewc_loss": 0.06152520328760147, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002929864276666194, "grad_norm": 7.3520355224609375, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8590561747550964, "num_tokens": 523804422.0, "step": 13732 }, { "epoch": 1.7469787558834753, "ewc_loss": 0.06158506125211716, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029114357312209904, "grad_norm": 7.346048355102539, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8735597133636475, "num_tokens": 523845889.0, "step": 13733 }, { "epoch": 1.7471059661620658, "ewc_loss": 0.06151454895734787, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029287984943948686, "grad_norm": 7.362703323364258, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8743020296096802, "num_tokens": 523883623.0, "step": 13734 }, { "epoch": 1.7472331764406563, "ewc_loss": 0.061652395874261856, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029181691934354603, "grad_norm": 7.450104713439941, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8698515295982361, "num_tokens": 523917273.0, "step": 13735 }, { "epoch": 1.7473603867192469, "ewc_loss": 0.061445869505405426, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002921930863521993, "grad_norm": 7.344571590423584, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8603613376617432, "num_tokens": 523956149.0, "step": 13736 }, { "epoch": 1.7474875969978374, "ewc_loss": 0.06183682382106781, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002936611999757588, "grad_norm": 7.308602333068848, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8605929017066956, "num_tokens": 523997229.0, "step": 13737 }, { "epoch": 1.747614807276428, "ewc_loss": 0.06172370910644531, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029253007960505784, "grad_norm": 7.380759239196777, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8683086633682251, "num_tokens": 524032921.0, "step": 13738 }, { "epoch": 1.7477420175550185, "ewc_loss": 0.061720989644527435, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002925028675235808, "grad_norm": 7.3960065841674805, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8659930229187012, "num_tokens": 524065016.0, "step": 13739 }, { "epoch": 1.747869227833609, "ewc_loss": 0.06169786676764488, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029227163759060204, "grad_norm": 7.305140018463135, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8520376682281494, "num_tokens": 524107001.0, "step": 13740 }, { "epoch": 1.7479964381121995, "ewc_loss": 0.06146583706140518, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029239276773296297, "grad_norm": 7.432211875915527, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8595229387283325, "num_tokens": 524149283.0, "step": 13741 }, { "epoch": 1.74812364839079, "ewc_loss": 0.06163374334573746, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002916304219979793, "grad_norm": 7.301612854003906, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8635678291320801, "num_tokens": 524187240.0, "step": 13742 }, { "epoch": 1.7482508586693806, "ewc_loss": 0.06154032051563263, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029313756385818124, "grad_norm": 7.442957401275635, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8734695911407471, "num_tokens": 524216793.0, "step": 13743 }, { "epoch": 1.748378068947971, "ewc_loss": 0.06132630258798599, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002909974136855453, "grad_norm": 7.309062480926514, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8688961267471313, "num_tokens": 524257629.0, "step": 13744 }, { "epoch": 1.7485052792265616, "ewc_loss": 0.06162859499454498, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002940203412435949, "grad_norm": 7.378332138061523, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8739289045333862, "num_tokens": 524291090.0, "step": 13745 }, { "epoch": 1.7486324895051522, "ewc_loss": 0.06143440306186676, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002920784172601998, "grad_norm": 7.328327178955078, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.876836895942688, "num_tokens": 524324882.0, "step": 13746 }, { "epoch": 1.7487596997837427, "ewc_loss": 0.06184307485818863, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029372371500357985, "grad_norm": 7.452994346618652, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8471959829330444, "num_tokens": 524359422.0, "step": 13747 }, { "epoch": 1.748886910062333, "ewc_loss": 0.061659812927246094, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002918910759035498, "grad_norm": 7.273411750793457, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8692953586578369, "num_tokens": 524398383.0, "step": 13748 }, { "epoch": 1.7490141203409235, "ewc_loss": 0.061661574989557266, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029435011674650013, "grad_norm": 7.444150447845459, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8610875010490417, "num_tokens": 524437984.0, "step": 13749 }, { "epoch": 1.749141330619514, "ewc_loss": 0.06174178421497345, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029271081439219415, "grad_norm": 7.372066020965576, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.869672417640686, "num_tokens": 524471324.0, "step": 13750 }, { "epoch": 1.7492685408981046, "ewc_loss": 0.06177905201911926, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002930834889411926, "grad_norm": 7.36902379989624, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8551751375198364, "num_tokens": 524511471.0, "step": 13751 }, { "epoch": 1.749395751176695, "ewc_loss": 0.061520785093307495, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029294221894815564, "grad_norm": 7.352078437805176, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8665101528167725, "num_tokens": 524541743.0, "step": 13752 }, { "epoch": 1.7495229614552856, "ewc_loss": 0.06173574924468994, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029265048215165734, "grad_norm": 7.34696102142334, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8642893433570862, "num_tokens": 524581470.0, "step": 13753 }, { "epoch": 1.749650171733876, "ewc_loss": 0.06173136085271835, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029260656447149813, "grad_norm": 7.335215091705322, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8655512928962708, "num_tokens": 524612722.0, "step": 13754 }, { "epoch": 1.7497773820124665, "ewc_loss": 0.06180085241794586, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000293301505735144, "grad_norm": 7.345290184020996, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8728112578392029, "num_tokens": 524647640.0, "step": 13755 }, { "epoch": 1.749904592291057, "ewc_loss": 0.06146290525794029, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002923634310718626, "grad_norm": 7.406098365783691, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8580447435379028, "num_tokens": 524682366.0, "step": 13756 }, { "epoch": 1.7500318025696475, "ewc_loss": 0.061648257076740265, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000291775562800467, "grad_norm": 7.33333683013916, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8640286922454834, "num_tokens": 524724242.0, "step": 13757 }, { "epoch": 1.750159012848238, "ewc_loss": 0.061524711549282074, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002929814800154418, "grad_norm": 7.384756565093994, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8502077460289001, "num_tokens": 524757373.0, "step": 13758 }, { "epoch": 1.7502862231268286, "ewc_loss": 0.0616232231259346, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002915252116508782, "grad_norm": 7.310555458068848, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.853384792804718, "num_tokens": 524797704.0, "step": 13759 }, { "epoch": 1.750413433405419, "ewc_loss": 0.061811819672584534, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029341113986447453, "grad_norm": 7.3732476234436035, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8670228719711304, "num_tokens": 524836710.0, "step": 13760 }, { "epoch": 1.7505406436840096, "ewc_loss": 0.06166798621416092, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029197282856330276, "grad_norm": 7.374552249908447, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8610092997550964, "num_tokens": 524870224.0, "step": 13761 }, { "epoch": 1.7506678539626002, "ewc_loss": 0.061739481985569, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002926877641584724, "grad_norm": 7.382230758666992, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8673259019851685, "num_tokens": 524905345.0, "step": 13762 }, { "epoch": 1.7507950642411907, "ewc_loss": 0.06166137382388115, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029190670466050506, "grad_norm": 7.345229625701904, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8468669652938843, "num_tokens": 524943893.0, "step": 13763 }, { "epoch": 1.7509222745197812, "ewc_loss": 0.06143171340227127, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002920514962170273, "grad_norm": 7.313187122344971, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8623055219650269, "num_tokens": 524982924.0, "step": 13764 }, { "epoch": 1.7510494847983717, "ewc_loss": 0.06171046197414398, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002923975698649883, "grad_norm": 7.387465476989746, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8626781702041626, "num_tokens": 525016749.0, "step": 13765 }, { "epoch": 1.7511766950769623, "ewc_loss": 0.06165593862533569, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029185236780904233, "grad_norm": 7.394686222076416, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8668023347854614, "num_tokens": 525049178.0, "step": 13766 }, { "epoch": 1.7513039053555528, "ewc_loss": 0.061684612184762955, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002921390987467021, "grad_norm": 7.394665718078613, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8492838144302368, "num_tokens": 525083456.0, "step": 13767 }, { "epoch": 1.7514311156341433, "ewc_loss": 0.06165986508131027, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000291891599772498, "grad_norm": 7.357120990753174, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8665324449539185, "num_tokens": 525120159.0, "step": 13768 }, { "epoch": 1.7515583259127339, "ewc_loss": 0.061538003385066986, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029067302239127457, "grad_norm": 7.328483581542969, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8688005805015564, "num_tokens": 525159174.0, "step": 13769 }, { "epoch": 1.7516855361913244, "ewc_loss": 0.061667464673519135, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000291967618977651, "grad_norm": 7.355807304382324, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8615570664405823, "num_tokens": 525198373.0, "step": 13770 }, { "epoch": 1.751812746469915, "ewc_loss": 0.0616692379117012, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029198534321039915, "grad_norm": 7.309741020202637, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8698298335075378, "num_tokens": 525240618.0, "step": 13771 }, { "epoch": 1.7519399567485054, "ewc_loss": 0.06168666481971741, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029215961694717407, "grad_norm": 7.389434814453125, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8788942694664001, "num_tokens": 525274899.0, "step": 13772 }, { "epoch": 1.7520671670270958, "ewc_loss": 0.06164764612913132, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002917694509960711, "grad_norm": 7.375975131988525, "learning_rate": 1e-06, "loss": 0.538, "mean_token_accuracy": 0.8433446884155273, "num_tokens": 525310144.0, "step": 13773 }, { "epoch": 1.7521943773056863, "ewc_loss": 0.06167782098054886, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029207117040641606, "grad_norm": 7.3474249839782715, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.873292088508606, "num_tokens": 525348565.0, "step": 13774 }, { "epoch": 1.7523215875842768, "ewc_loss": 0.06163548678159714, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002916478260885924, "grad_norm": 7.359001636505127, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8702664375305176, "num_tokens": 525383709.0, "step": 13775 }, { "epoch": 1.7524487978628673, "ewc_loss": 0.0616346038877964, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000291639007627964, "grad_norm": 7.403487682342529, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.857650876045227, "num_tokens": 525423983.0, "step": 13776 }, { "epoch": 1.7525760081414579, "ewc_loss": 0.06169360131025314, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029222897137515247, "grad_norm": 7.370829105377197, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8707330226898193, "num_tokens": 525457357.0, "step": 13777 }, { "epoch": 1.7527032184200484, "ewc_loss": 0.06151772662997246, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029047022690065205, "grad_norm": 7.331115245819092, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8622610569000244, "num_tokens": 525497021.0, "step": 13778 }, { "epoch": 1.7528304286986387, "ewc_loss": 0.06171071529388428, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029240010189823806, "grad_norm": 7.337507724761963, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8806450366973877, "num_tokens": 525533592.0, "step": 13779 }, { "epoch": 1.7529576389772292, "ewc_loss": 0.061663996428251266, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002919329272117466, "grad_norm": 7.373837471008301, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8558133840560913, "num_tokens": 525572747.0, "step": 13780 }, { "epoch": 1.7530848492558198, "ewc_loss": 0.061682116240262985, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002921141276601702, "grad_norm": 7.352906227111816, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8691283464431763, "num_tokens": 525613578.0, "step": 13781 }, { "epoch": 1.7532120595344103, "ewc_loss": 0.0616907924413681, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002922009152825922, "grad_norm": 7.326436996459961, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8709437847137451, "num_tokens": 525655047.0, "step": 13782 }, { "epoch": 1.7533392698130008, "ewc_loss": 0.06160769239068031, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002913698845077306, "grad_norm": 7.364863395690918, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8605830669403076, "num_tokens": 525689561.0, "step": 13783 }, { "epoch": 1.7534664800915913, "ewc_loss": 0.061740659177303314, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029269958031363785, "grad_norm": 7.3328070640563965, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8612961173057556, "num_tokens": 525726946.0, "step": 13784 }, { "epoch": 1.7535936903701819, "ewc_loss": 0.06165262311697006, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002918192185461521, "grad_norm": 7.366448402404785, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8562756180763245, "num_tokens": 525762089.0, "step": 13785 }, { "epoch": 1.7537209006487724, "ewc_loss": 0.06162947788834572, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002915877557825297, "grad_norm": 7.377639293670654, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8686853647232056, "num_tokens": 525796081.0, "step": 13786 }, { "epoch": 1.753848110927363, "ewc_loss": 0.061664558947086334, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002919385442510247, "grad_norm": 7.369356632232666, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8522157669067383, "num_tokens": 525829945.0, "step": 13787 }, { "epoch": 1.7539753212059535, "ewc_loss": 0.06167430803179741, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002920360420830548, "grad_norm": 7.3103861808776855, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8721820116043091, "num_tokens": 525871289.0, "step": 13788 }, { "epoch": 1.754102531484544, "ewc_loss": 0.061655860394239426, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029185158200562, "grad_norm": 7.3701629638671875, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8652143478393555, "num_tokens": 525908455.0, "step": 13789 }, { "epoch": 1.7542297417631345, "ewc_loss": 0.06165458261966705, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002918388054240495, "grad_norm": 7.352921009063721, "learning_rate": 1e-06, "loss": 0.3891, "mean_token_accuracy": 0.8845429420471191, "num_tokens": 525943734.0, "step": 13790 }, { "epoch": 1.754356952041725, "ewc_loss": 0.0617218092083931, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029251104569993913, "grad_norm": 7.35878849029541, "learning_rate": 1e-06, "loss": 0.3875, "mean_token_accuracy": 0.8878156542778015, "num_tokens": 525988070.0, "step": 13791 }, { "epoch": 1.7544841623203156, "ewc_loss": 0.06162188574671745, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002915118238888681, "grad_norm": 7.36460018157959, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8784632086753845, "num_tokens": 526027940.0, "step": 13792 }, { "epoch": 1.754611372598906, "ewc_loss": 0.06160339340567589, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000291326898150146, "grad_norm": 7.397126197814941, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8740171790122986, "num_tokens": 526061357.0, "step": 13793 }, { "epoch": 1.7547385828774966, "ewc_loss": 0.06160842627286911, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029137724777683616, "grad_norm": 7.352993488311768, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.866647481918335, "num_tokens": 526102157.0, "step": 13794 }, { "epoch": 1.7548657931560872, "ewc_loss": 0.061681851744651794, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002921115083154291, "grad_norm": 7.451257228851318, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8555197715759277, "num_tokens": 526137241.0, "step": 13795 }, { "epoch": 1.7549930034346777, "ewc_loss": 0.06149783357977867, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002902713022194803, "grad_norm": 7.361311912536621, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8566816449165344, "num_tokens": 526169079.0, "step": 13796 }, { "epoch": 1.755120213713268, "ewc_loss": 0.061408769339323044, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002918220707215369, "grad_norm": 7.338939666748047, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8686701059341431, "num_tokens": 526212212.0, "step": 13797 }, { "epoch": 1.7552474239918585, "ewc_loss": 0.061582546681165695, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002911184274125844, "grad_norm": 7.3917155265808105, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8717020750045776, "num_tokens": 526250380.0, "step": 13798 }, { "epoch": 1.755374634270449, "ewc_loss": 0.061588481068611145, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029117779922671616, "grad_norm": 7.336937427520752, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8712202310562134, "num_tokens": 526292525.0, "step": 13799 }, { "epoch": 1.7555018445490396, "ewc_loss": 0.06164927035570145, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002917856618296355, "grad_norm": 7.394646167755127, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.865827202796936, "num_tokens": 526330427.0, "step": 13800 }, { "epoch": 1.75562905482763, "ewc_loss": 0.061588071286678314, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002911736664827913, "grad_norm": 7.4183502197265625, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8659396767616272, "num_tokens": 526362954.0, "step": 13801 }, { "epoch": 1.7557562651062206, "ewc_loss": 0.06155212223529816, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029081417596898973, "grad_norm": 7.376908302307129, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8579908609390259, "num_tokens": 526394310.0, "step": 13802 }, { "epoch": 1.755883475384811, "ewc_loss": 0.061681997030973434, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029211293440312147, "grad_norm": 7.362430095672607, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8459086418151855, "num_tokens": 526438507.0, "step": 13803 }, { "epoch": 1.7560106856634015, "ewc_loss": 0.06169762462377548, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029226922197267413, "grad_norm": 7.416685581207275, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8611372709274292, "num_tokens": 526475124.0, "step": 13804 }, { "epoch": 1.756137895941992, "ewc_loss": 0.06164967641234398, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029178973636589944, "grad_norm": 8.017078399658203, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8724597096443176, "num_tokens": 526514936.0, "step": 13805 }, { "epoch": 1.7562651062205825, "ewc_loss": 0.06100147217512131, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002877491060644388, "grad_norm": 7.255904197692871, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.858096718788147, "num_tokens": 526548649.0, "step": 13806 }, { "epoch": 1.756392316499173, "ewc_loss": 0.0619654506444931, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002949474728666246, "grad_norm": 7.4928483963012695, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8495712280273438, "num_tokens": 526589028.0, "step": 13807 }, { "epoch": 1.7565195267777636, "ewc_loss": 0.06113094463944435, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002866024151444435, "grad_norm": 7.20035982131958, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8652524352073669, "num_tokens": 526626327.0, "step": 13808 }, { "epoch": 1.756646737056354, "ewc_loss": 0.062089480459690094, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002961877908091992, "grad_norm": 7.513275623321533, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8606998920440674, "num_tokens": 526672264.0, "step": 13809 }, { "epoch": 1.7567739473349446, "ewc_loss": 0.06147439777851105, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029003695817664266, "grad_norm": 7.290914535522461, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8581047058105469, "num_tokens": 526709979.0, "step": 13810 }, { "epoch": 1.7569011576135352, "ewc_loss": 0.06181936711072922, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029348666430450976, "grad_norm": 7.416550636291504, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8774561882019043, "num_tokens": 526745002.0, "step": 13811 }, { "epoch": 1.7570283678921257, "ewc_loss": 0.06163962930440903, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029168924083933234, "grad_norm": 7.295938014984131, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8806159496307373, "num_tokens": 526785684.0, "step": 13812 }, { "epoch": 1.7571555781707162, "ewc_loss": 0.061794720590114594, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002932401839643717, "grad_norm": 7.386904239654541, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.8838413953781128, "num_tokens": 526820246.0, "step": 13813 }, { "epoch": 1.7572827884493067, "ewc_loss": 0.06170642003417015, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002923571737483144, "grad_norm": 8.054488182067871, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8684281706809998, "num_tokens": 526854815.0, "step": 13814 }, { "epoch": 1.7574099987278973, "ewc_loss": 0.06131678447127342, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002884608111344278, "grad_norm": 7.194169044494629, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8705422878265381, "num_tokens": 526896336.0, "step": 13815 }, { "epoch": 1.7575372090064878, "ewc_loss": 0.06215275451540947, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002968205080833286, "grad_norm": 7.460215091705322, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8753795623779297, "num_tokens": 526939586.0, "step": 13816 }, { "epoch": 1.7576644192850783, "ewc_loss": 0.06126803904771805, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00028797335107810795, "grad_norm": 7.23407506942749, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8694905042648315, "num_tokens": 526975027.0, "step": 13817 }, { "epoch": 1.7577916295636689, "ewc_loss": 0.0622221864759922, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002975148381665349, "grad_norm": 7.505224227905273, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8677161931991577, "num_tokens": 527017820.0, "step": 13818 }, { "epoch": 1.7579188398422594, "ewc_loss": 0.06152046471834183, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029049761360511184, "grad_norm": 7.332179069519043, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8760815262794495, "num_tokens": 527051647.0, "step": 13819 }, { "epoch": 1.75804605012085, "ewc_loss": 0.06194903329014778, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029478329815901816, "grad_norm": 7.4987640380859375, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8693934082984924, "num_tokens": 527089590.0, "step": 13820 }, { "epoch": 1.7581732603994404, "ewc_loss": 0.06155826896429062, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029087564325891435, "grad_norm": 7.395977020263672, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8631066083908081, "num_tokens": 527126696.0, "step": 13821 }, { "epoch": 1.7583004706780307, "ewc_loss": 0.06175875663757324, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029288051882758737, "grad_norm": 7.442493438720703, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.854755163192749, "num_tokens": 527164823.0, "step": 13822 }, { "epoch": 1.7584276809566213, "ewc_loss": 0.06164902448654175, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002917831880040467, "grad_norm": 7.38181734085083, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8478599190711975, "num_tokens": 527208032.0, "step": 13823 }, { "epoch": 1.7585548912352118, "ewc_loss": 0.06161894649267197, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002914824290201068, "grad_norm": 7.446758270263672, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.866752564907074, "num_tokens": 527242773.0, "step": 13824 }, { "epoch": 1.7586821015138023, "ewc_loss": 0.06133430078625679, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029107739101164043, "grad_norm": 7.349514007568359, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8684412240982056, "num_tokens": 527277968.0, "step": 13825 }, { "epoch": 1.7588093117923929, "ewc_loss": 0.06142522394657135, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002919865946751088, "grad_norm": 7.415708541870117, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8788546323776245, "num_tokens": 527311077.0, "step": 13826 }, { "epoch": 1.7589365220709834, "ewc_loss": 0.061308860778808594, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002908229944296181, "grad_norm": 7.434151649475098, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8718394637107849, "num_tokens": 527340543.0, "step": 13827 }, { "epoch": 1.7590637323495737, "ewc_loss": 0.06137169897556305, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000291451346129179, "grad_norm": 7.373831272125244, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8632068634033203, "num_tokens": 527379596.0, "step": 13828 }, { "epoch": 1.7591909426281642, "ewc_loss": 0.06133503466844559, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029108469607308507, "grad_norm": 7.440549373626709, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8644633889198303, "num_tokens": 527417873.0, "step": 13829 }, { "epoch": 1.7593181529067548, "ewc_loss": 0.0612143948674202, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000289878313196823, "grad_norm": 7.326129913330078, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8681813478469849, "num_tokens": 527458607.0, "step": 13830 }, { "epoch": 1.7594453631853453, "ewc_loss": 0.06172415614128113, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029253450338728726, "grad_norm": 7.461831092834473, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8574331998825073, "num_tokens": 527493299.0, "step": 13831 }, { "epoch": 1.7595725734639358, "ewc_loss": 0.061259105801582336, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029032540624029934, "grad_norm": 7.34544038772583, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8525141477584839, "num_tokens": 527536065.0, "step": 13832 }, { "epoch": 1.7596997837425263, "ewc_loss": 0.06147552281618118, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000292489625280723, "grad_norm": 7.438753604888916, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8496721982955933, "num_tokens": 527576477.0, "step": 13833 }, { "epoch": 1.7598269940211169, "ewc_loss": 0.06128483638167381, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002905827423091978, "grad_norm": 7.301597595214844, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8629535436630249, "num_tokens": 527611777.0, "step": 13834 }, { "epoch": 1.7599542042997074, "ewc_loss": 0.061814963817596436, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002934426302090287, "grad_norm": 7.407986640930176, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8818779587745667, "num_tokens": 527650236.0, "step": 13835 }, { "epoch": 1.760081414578298, "ewc_loss": 0.06161724403500557, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002914654032792896, "grad_norm": 7.376289367675781, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.870287299156189, "num_tokens": 527685486.0, "step": 13836 }, { "epoch": 1.7602086248568884, "ewc_loss": 0.061804354190826416, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029333651764318347, "grad_norm": 7.44388484954834, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.852057695388794, "num_tokens": 527725386.0, "step": 13837 }, { "epoch": 1.760335835135479, "ewc_loss": 0.061346836388111115, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002912027121055871, "grad_norm": 7.446146011352539, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8530324697494507, "num_tokens": 527756213.0, "step": 13838 }, { "epoch": 1.7604630454140695, "ewc_loss": 0.06142490729689598, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029198345146141946, "grad_norm": 7.393529891967773, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8694193363189697, "num_tokens": 527793884.0, "step": 13839 }, { "epoch": 1.76059025569266, "ewc_loss": 0.06134432554244995, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002911776246037334, "grad_norm": 7.3798627853393555, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8629975914955139, "num_tokens": 527826428.0, "step": 13840 }, { "epoch": 1.7607174659712506, "ewc_loss": 0.06308017671108246, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00029144625295884907, "grad_norm": 36.027767181396484, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8581545352935791, "num_tokens": 527866691.0, "step": 13841 }, { "epoch": 1.760844676249841, "ewc_loss": 0.09182647615671158, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00059355772100389, "grad_norm": 11.244185447692871, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8745182752609253, "num_tokens": 527909482.0, "step": 13842 }, { "epoch": 1.7609718865284316, "ewc_loss": 0.060329459607601166, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00027858756948262453, "grad_norm": 6.181969165802002, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8532054424285889, "num_tokens": 527947818.0, "step": 13843 }, { "epoch": 1.7610990968070221, "ewc_loss": 0.07855510711669922, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0004608440212905407, "grad_norm": 12.774763107299805, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8668186664581299, "num_tokens": 527986760.0, "step": 13844 }, { "epoch": 1.7612263070856127, "ewc_loss": 0.09382008016109467, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0006134937284514308, "grad_norm": 11.475728034973145, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8594369888305664, "num_tokens": 528024742.0, "step": 13845 }, { "epoch": 1.761353517364203, "ewc_loss": 0.06858830899000168, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00036117606214247644, "grad_norm": 7.331780433654785, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8491528630256653, "num_tokens": 528060781.0, "step": 13846 }, { "epoch": 1.7614807276427935, "ewc_loss": 0.0727211982011795, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0004025049856863916, "grad_norm": 9.509521484375, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8667827248573303, "num_tokens": 528099051.0, "step": 13847 }, { "epoch": 1.761607937921384, "ewc_loss": 0.07878367602825165, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00046312977792695165, "grad_norm": 9.409163475036621, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8586837649345398, "num_tokens": 528138325.0, "step": 13848 }, { "epoch": 1.7617351481999746, "ewc_loss": 0.06849775463342667, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00036027049645781517, "grad_norm": 7.825201034545898, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8560831546783447, "num_tokens": 528177997.0, "step": 13849 }, { "epoch": 1.761862358478565, "ewc_loss": 0.06872499734163284, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00036254292353987694, "grad_norm": 8.642834663391113, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8616119027137756, "num_tokens": 528216360.0, "step": 13850 }, { "epoch": 1.7619895687571556, "ewc_loss": 0.07028283923864365, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00037812135997228324, "grad_norm": 8.268665313720703, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8811818957328796, "num_tokens": 528251100.0, "step": 13851 }, { "epoch": 1.762116779035746, "ewc_loss": 0.06628430634737015, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003381360147614032, "grad_norm": 8.20863151550293, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8690153956413269, "num_tokens": 528286298.0, "step": 13852 }, { "epoch": 1.7622439893143365, "ewc_loss": 0.066273033618927, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003380232665222138, "grad_norm": 7.987668037414551, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8806827068328857, "num_tokens": 528322970.0, "step": 13853 }, { "epoch": 1.762371199592927, "ewc_loss": 0.06549754738807678, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003302684344816953, "grad_norm": 7.874161243438721, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8821465969085693, "num_tokens": 528355191.0, "step": 13854 }, { "epoch": 1.7624984098715175, "ewc_loss": 0.06425482034683228, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0003202826192136854, "grad_norm": 7.842486381530762, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8644411563873291, "num_tokens": 528388975.0, "step": 13855 }, { "epoch": 1.762625620150108, "ewc_loss": 0.06409475207328796, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0003186818794347346, "grad_norm": 7.691272735595703, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8694473505020142, "num_tokens": 528432139.0, "step": 13856 }, { "epoch": 1.7627528304286986, "ewc_loss": 0.06339694559574127, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00031170385773293674, "grad_norm": 7.675187110900879, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.874792218208313, "num_tokens": 528468315.0, "step": 13857 }, { "epoch": 1.762880040707289, "ewc_loss": 0.06309471279382706, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0003086815122514963, "grad_norm": 7.598354816436768, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8544653654098511, "num_tokens": 528510368.0, "step": 13858 }, { "epoch": 1.7630072509858796, "ewc_loss": 0.06282804906368256, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00030601484468206763, "grad_norm": 7.577578067779541, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8672993779182434, "num_tokens": 528545637.0, "step": 13859 }, { "epoch": 1.7631344612644702, "ewc_loss": 0.06258554756641388, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0003035898262169212, "grad_norm": 7.522663116455078, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8701062798500061, "num_tokens": 528586267.0, "step": 13860 }, { "epoch": 1.7632616715430607, "ewc_loss": 0.0624031201004982, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0003017655690200627, "grad_norm": 7.517498970031738, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8687599897384644, "num_tokens": 528629640.0, "step": 13861 }, { "epoch": 1.7633888818216512, "ewc_loss": 0.06225268170237541, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0003002611920237541, "grad_norm": 7.51617956161499, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.850798487663269, "num_tokens": 528665373.0, "step": 13862 }, { "epoch": 1.7635160921002417, "ewc_loss": 0.06201866269111633, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002979209821205586, "grad_norm": 7.410704135894775, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8805081844329834, "num_tokens": 528703426.0, "step": 13863 }, { "epoch": 1.7636433023788323, "ewc_loss": 0.062071554362773895, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029844994423910975, "grad_norm": 7.496983528137207, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8459385633468628, "num_tokens": 528744886.0, "step": 13864 }, { "epoch": 1.7637705126574228, "ewc_loss": 0.06181131675839424, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029584753792732954, "grad_norm": 7.397636890411377, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8769069910049438, "num_tokens": 528779773.0, "step": 13865 }, { "epoch": 1.7638977229360133, "ewc_loss": 0.06190949305891991, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029682929744012654, "grad_norm": 7.377057075500488, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8677222728729248, "num_tokens": 528821878.0, "step": 13866 }, { "epoch": 1.7640249332146039, "ewc_loss": 0.06189361959695816, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029667059425264597, "grad_norm": 7.4882941246032715, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8663923740386963, "num_tokens": 528858056.0, "step": 13867 }, { "epoch": 1.7641521434931944, "ewc_loss": 0.061785899102687836, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029559334507212043, "grad_norm": 7.388021945953369, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.861478328704834, "num_tokens": 528903116.0, "step": 13868 }, { "epoch": 1.764279353771785, "ewc_loss": 0.06183939427137375, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002961283316835761, "grad_norm": 7.427957534790039, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.859396755695343, "num_tokens": 528941561.0, "step": 13869 }, { "epoch": 1.7644065640503754, "ewc_loss": 0.061717644333839417, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029491083114407957, "grad_norm": 7.364049434661865, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8811987042427063, "num_tokens": 528976215.0, "step": 13870 }, { "epoch": 1.7645337743289657, "ewc_loss": 0.06184755265712738, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029620990972034633, "grad_norm": 7.413323879241943, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8724714517593384, "num_tokens": 529019280.0, "step": 13871 }, { "epoch": 1.7646609846075563, "ewc_loss": 0.06169060245156288, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002946403983514756, "grad_norm": 7.367114543914795, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8688696026802063, "num_tokens": 529060733.0, "step": 13872 }, { "epoch": 1.7647881948861468, "ewc_loss": 0.06183350831270218, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029606945463456213, "grad_norm": 7.3937883377075195, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8674185276031494, "num_tokens": 529094948.0, "step": 13873 }, { "epoch": 1.7649154051647373, "ewc_loss": 0.06168116629123688, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029454604373313487, "grad_norm": 7.407132148742676, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8767890930175781, "num_tokens": 529131374.0, "step": 13874 }, { "epoch": 1.7650426154433279, "ewc_loss": 0.06175181642174721, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002952525392174721, "grad_norm": 7.402394771575928, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8627510666847229, "num_tokens": 529169133.0, "step": 13875 }, { "epoch": 1.7651698257219184, "ewc_loss": 0.061735600233078, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002950904017779976, "grad_norm": 7.410037517547607, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8635951280593872, "num_tokens": 529206467.0, "step": 13876 }, { "epoch": 1.7652970360005087, "ewc_loss": 0.061678238213062286, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002945167652796954, "grad_norm": 7.409002304077148, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8555659055709839, "num_tokens": 529244891.0, "step": 13877 }, { "epoch": 1.7654242462790992, "ewc_loss": 0.06176489591598511, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002953833609353751, "grad_norm": 7.468023777008057, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8669455051422119, "num_tokens": 529277384.0, "step": 13878 }, { "epoch": 1.7655514565576897, "ewc_loss": 0.06159196048974991, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029365395312197506, "grad_norm": 7.3595290184021, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8681495785713196, "num_tokens": 529322530.0, "step": 13879 }, { "epoch": 1.7656786668362803, "ewc_loss": 0.0617535337805748, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000295269739581272, "grad_norm": 7.493622779846191, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8589887619018555, "num_tokens": 529356345.0, "step": 13880 }, { "epoch": 1.7658058771148708, "ewc_loss": 0.06146742030978203, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029240857111290097, "grad_norm": 7.357542514801025, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8672908544540405, "num_tokens": 529391877.0, "step": 13881 }, { "epoch": 1.7659330873934613, "ewc_loss": 0.06179874390363693, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002957218384835869, "grad_norm": 7.52761173248291, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8720647692680359, "num_tokens": 529425460.0, "step": 13882 }, { "epoch": 1.7660602976720519, "ewc_loss": 0.06142280250787735, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029196240939199924, "grad_norm": 7.355506420135498, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8541468381881714, "num_tokens": 529459660.0, "step": 13883 }, { "epoch": 1.7661875079506424, "ewc_loss": 0.06180807948112488, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002958151453640312, "grad_norm": 7.4329047203063965, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8576352596282959, "num_tokens": 529501839.0, "step": 13884 }, { "epoch": 1.766314718229233, "ewc_loss": 0.061574019491672516, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029347455711103976, "grad_norm": 7.412936210632324, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8629312515258789, "num_tokens": 529540828.0, "step": 13885 }, { "epoch": 1.7664419285078234, "ewc_loss": 0.06157388538122177, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029347321833483875, "grad_norm": 7.356302261352539, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8651731610298157, "num_tokens": 529581578.0, "step": 13886 }, { "epoch": 1.766569138786414, "ewc_loss": 0.06170922517776489, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029482663376256824, "grad_norm": 7.470336437225342, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.865462064743042, "num_tokens": 529624363.0, "step": 13887 }, { "epoch": 1.7666963490650045, "ewc_loss": 0.06147036701440811, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002924380241893232, "grad_norm": 7.3992533683776855, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8725731372833252, "num_tokens": 529665286.0, "step": 13888 }, { "epoch": 1.766823559343595, "ewc_loss": 0.06171911209821701, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029492549947462976, "grad_norm": 7.453662872314453, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8614757657051086, "num_tokens": 529704150.0, "step": 13889 }, { "epoch": 1.7669507696221856, "ewc_loss": 0.06140052527189255, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029173961956985295, "grad_norm": 7.3641767501831055, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8668427467346191, "num_tokens": 529742110.0, "step": 13890 }, { "epoch": 1.767077979900776, "ewc_loss": 0.061721935868263245, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029495375929400325, "grad_norm": 7.465062618255615, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8643718957901001, "num_tokens": 529776158.0, "step": 13891 }, { "epoch": 1.7672051901793666, "ewc_loss": 0.061696477234363556, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002922577259596437, "grad_norm": 7.4195556640625, "learning_rate": 1e-06, "loss": 0.5215, "mean_token_accuracy": 0.8421710133552551, "num_tokens": 529814002.0, "step": 13892 }, { "epoch": 1.7673324004579571, "ewc_loss": 0.06161220371723175, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002938563993666321, "grad_norm": 7.441312313079834, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8675955533981323, "num_tokens": 529846696.0, "step": 13893 }, { "epoch": 1.7674596107365477, "ewc_loss": 0.061597175896167755, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029370610718615353, "grad_norm": 7.4657721519470215, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8615045547485352, "num_tokens": 529881557.0, "step": 13894 }, { "epoch": 1.767586821015138, "ewc_loss": 0.061407916247844696, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002918135141953826, "grad_norm": 7.367844581604004, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8520714640617371, "num_tokens": 529924337.0, "step": 13895 }, { "epoch": 1.7677140312937285, "ewc_loss": 0.0615985170006752, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002937195240519941, "grad_norm": 7.367762088775635, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8620771169662476, "num_tokens": 529967195.0, "step": 13896 }, { "epoch": 1.767841241572319, "ewc_loss": 0.061581965535879135, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002935540396720171, "grad_norm": 7.444651126861572, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8454198837280273, "num_tokens": 530003454.0, "step": 13897 }, { "epoch": 1.7679684518509096, "ewc_loss": 0.0614919550716877, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029265391640365124, "grad_norm": 7.374523162841797, "learning_rate": 1e-06, "loss": 0.5354, "mean_token_accuracy": 0.839078426361084, "num_tokens": 530042577.0, "step": 13898 }, { "epoch": 1.7680956621295, "ewc_loss": 0.06173645332455635, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000295098900096491, "grad_norm": 7.432024955749512, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8491758108139038, "num_tokens": 530084903.0, "step": 13899 }, { "epoch": 1.7682228724080906, "ewc_loss": 0.06160658597946167, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029380025807768106, "grad_norm": 7.431907653808594, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.861365795135498, "num_tokens": 530117598.0, "step": 13900 }, { "epoch": 1.768350082686681, "ewc_loss": 0.06187133118510246, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002940062840934843, "grad_norm": 7.407494068145752, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8670640587806702, "num_tokens": 530151891.0, "step": 13901 }, { "epoch": 1.7684772929652715, "ewc_loss": 0.061884067952632904, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029413364245556295, "grad_norm": 7.377937316894531, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8561705350875854, "num_tokens": 530191263.0, "step": 13902 }, { "epoch": 1.768604503243862, "ewc_loss": 0.06194648891687393, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002947578323073685, "grad_norm": 7.445160865783691, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8603670001029968, "num_tokens": 530231156.0, "step": 13903 }, { "epoch": 1.7687317135224525, "ewc_loss": 0.061811380088329315, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029340674518607557, "grad_norm": 7.355975151062012, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8787373304367065, "num_tokens": 530270089.0, "step": 13904 }, { "epoch": 1.768858923801043, "ewc_loss": 0.06195877492427826, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002948807377833873, "grad_norm": 7.406919479370117, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8681825995445251, "num_tokens": 530309808.0, "step": 13905 }, { "epoch": 1.7689861340796336, "ewc_loss": 0.061819083988666534, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000293483812129125, "grad_norm": 7.326298713684082, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.852344274520874, "num_tokens": 530357723.0, "step": 13906 }, { "epoch": 1.769113344358224, "ewc_loss": 0.06205511838197708, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029584416188299656, "grad_norm": 7.501278400421143, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8518767356872559, "num_tokens": 530389586.0, "step": 13907 }, { "epoch": 1.7692405546368146, "ewc_loss": 0.061757199466228485, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029286497738212347, "grad_norm": 7.35809850692749, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8538728356361389, "num_tokens": 530430356.0, "step": 13908 }, { "epoch": 1.7693677649154052, "ewc_loss": 0.06199098378419876, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002952028007712215, "grad_norm": 7.424330234527588, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8699195384979248, "num_tokens": 530472859.0, "step": 13909 }, { "epoch": 1.7694949751939957, "ewc_loss": 0.06178745999932289, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029316756990738213, "grad_norm": 7.361903667449951, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.85103440284729, "num_tokens": 530515422.0, "step": 13910 }, { "epoch": 1.7696221854725862, "ewc_loss": 0.061740271747112274, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029513711342588067, "grad_norm": 7.427889823913574, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8637704849243164, "num_tokens": 530553842.0, "step": 13911 }, { "epoch": 1.7697493957511767, "ewc_loss": 0.06161312758922577, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002938656252808869, "grad_norm": 7.427161693572998, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8637096881866455, "num_tokens": 530592593.0, "step": 13912 }, { "epoch": 1.7698766060297673, "ewc_loss": 0.06178750842809677, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002931680646724999, "grad_norm": 7.433803081512451, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8588998913764954, "num_tokens": 530632309.0, "step": 13913 }, { "epoch": 1.7700038163083578, "ewc_loss": 0.06154564023017883, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002931907947640866, "grad_norm": 7.42812967300415, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8600184321403503, "num_tokens": 530667415.0, "step": 13914 }, { "epoch": 1.7701310265869483, "ewc_loss": 0.0614335760474205, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002920701226685196, "grad_norm": 7.359013080596924, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8614304065704346, "num_tokens": 530705736.0, "step": 13915 }, { "epoch": 1.7702582368655388, "ewc_loss": 0.06164129450917244, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002941473212558776, "grad_norm": 7.429027080535889, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8720850348472595, "num_tokens": 530744931.0, "step": 13916 }, { "epoch": 1.7703854471441294, "ewc_loss": 0.061438873410224915, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029212312074378133, "grad_norm": 7.347104549407959, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8768664598464966, "num_tokens": 530784700.0, "step": 13917 }, { "epoch": 1.77051265742272, "ewc_loss": 0.06161195784807205, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029385395464487374, "grad_norm": 7.4019951820373535, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8800779581069946, "num_tokens": 530821097.0, "step": 13918 }, { "epoch": 1.7706398677013102, "ewc_loss": 0.061561986804008484, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002933542418759316, "grad_norm": 7.427478313446045, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8577479124069214, "num_tokens": 530860444.0, "step": 13919 }, { "epoch": 1.7707670779799007, "ewc_loss": 0.06161440163850784, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002938784018624574, "grad_norm": 7.355822563171387, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8615580797195435, "num_tokens": 530907304.0, "step": 13920 }, { "epoch": 1.7708942882584913, "ewc_loss": 0.06142910569906235, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002944668522104621, "grad_norm": 7.44451904296875, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8557520508766174, "num_tokens": 530945692.0, "step": 13921 }, { "epoch": 1.7710214985370818, "ewc_loss": 0.061232443898916245, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002925002190750092, "grad_norm": 7.3591485023498535, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.873130202293396, "num_tokens": 530986695.0, "step": 13922 }, { "epoch": 1.7711487088156723, "ewc_loss": 0.06150491535663605, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00029522491968236864, "grad_norm": 7.418859004974365, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8754782676696777, "num_tokens": 531022656.0, "step": 13923 }, { "epoch": 1.7712759190942629, "ewc_loss": 0.06138041615486145, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002939799160230905, "grad_norm": 7.417642593383789, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8717854022979736, "num_tokens": 531056896.0, "step": 13924 }, { "epoch": 1.7714031293728534, "ewc_loss": 0.06136106699705124, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002937864337582141, "grad_norm": 7.4619598388671875, "learning_rate": 1e-06, "loss": 0.5443, "mean_token_accuracy": 0.838119387626648, "num_tokens": 531097286.0, "step": 13925 }, { "epoch": 1.7715303396514437, "ewc_loss": 0.061417900025844574, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002943547733593732, "grad_norm": 7.3606791496276855, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8716259002685547, "num_tokens": 531136430.0, "step": 13926 }, { "epoch": 1.7716575499300342, "ewc_loss": 0.06145860254764557, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002947618195321411, "grad_norm": 7.409216403961182, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8554251194000244, "num_tokens": 531176698.0, "step": 13927 }, { "epoch": 1.7717847602086247, "ewc_loss": 0.06140138581395149, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002941896382253617, "grad_norm": 7.396601676940918, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8626047968864441, "num_tokens": 531214343.0, "step": 13928 }, { "epoch": 1.7719119704872153, "ewc_loss": 0.06145313382148743, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00029470710433088243, "grad_norm": 7.360562801361084, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8760508894920349, "num_tokens": 531252325.0, "step": 13929 }, { "epoch": 1.7720391807658058, "ewc_loss": 0.061492882668972015, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002951046044472605, "grad_norm": 7.421385288238525, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.851621687412262, "num_tokens": 531290378.0, "step": 13930 }, { "epoch": 1.7721663910443963, "ewc_loss": 0.061650797724723816, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002942423743661493, "grad_norm": 7.353066921234131, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.853905200958252, "num_tokens": 531330406.0, "step": 13931 }, { "epoch": 1.7722936013229869, "ewc_loss": 0.06151805818080902, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.0002953563816845417, "grad_norm": 7.362109661102295, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8633190393447876, "num_tokens": 531367425.0, "step": 13932 }, { "epoch": 1.7724208116015774, "ewc_loss": 0.061756640672683716, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029530079336836934, "grad_norm": 7.368044853210449, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8834480047225952, "num_tokens": 531406465.0, "step": 13933 }, { "epoch": 1.772548021880168, "ewc_loss": 0.06178048998117447, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002955392701551318, "grad_norm": 7.400383949279785, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8534177541732788, "num_tokens": 531445356.0, "step": 13934 }, { "epoch": 1.7726752321587584, "ewc_loss": 0.06180643290281296, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029579870169982314, "grad_norm": 7.36906623840332, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8605408668518066, "num_tokens": 531484737.0, "step": 13935 }, { "epoch": 1.772802442437349, "ewc_loss": 0.06159898638725281, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00029616564279422164, "grad_norm": 7.416594982147217, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8605979681015015, "num_tokens": 531523453.0, "step": 13936 }, { "epoch": 1.7729296527159395, "ewc_loss": 0.06157282739877701, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00029590402846224606, "grad_norm": 7.385097980499268, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8720176219940186, "num_tokens": 531559627.0, "step": 13937 }, { "epoch": 1.77305686299453, "ewc_loss": 0.061895355582237244, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002966879110317677, "grad_norm": 7.440330505371094, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8686329126358032, "num_tokens": 531597677.0, "step": 13938 }, { "epoch": 1.7731840732731206, "ewc_loss": 0.06176988035440445, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029543315758928657, "grad_norm": 7.359124660491943, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8724700212478638, "num_tokens": 531642753.0, "step": 13939 }, { "epoch": 1.773311283551711, "ewc_loss": 0.06170374155044556, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00029721317696385086, "grad_norm": 7.448965549468994, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8573545217514038, "num_tokens": 531688249.0, "step": 13940 }, { "epoch": 1.7734384938303016, "ewc_loss": 0.06144671514630318, "ewc_loss_diag": 3.1948089599609375e-05, "ewc_loss_parallel": 0.00029464293038472533, "grad_norm": 7.3496551513671875, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8773962259292603, "num_tokens": 531727994.0, "step": 13941 }, { "epoch": 1.7735657041088921, "ewc_loss": 0.06196818873286247, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029741626349277794, "grad_norm": 7.425319194793701, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8664905428886414, "num_tokens": 531764428.0, "step": 13942 }, { "epoch": 1.7736929143874827, "ewc_loss": 0.061659738421440125, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029433175222948194, "grad_norm": 7.358294486999512, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8586773872375488, "num_tokens": 531806604.0, "step": 13943 }, { "epoch": 1.773820124666073, "ewc_loss": 0.06191840395331383, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029691841336898506, "grad_norm": 7.392125606536865, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8829094171524048, "num_tokens": 531845504.0, "step": 13944 }, { "epoch": 1.7739473349446635, "ewc_loss": 0.06181345880031586, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029586898745037615, "grad_norm": 7.388062477111816, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.872527003288269, "num_tokens": 531880962.0, "step": 13945 }, { "epoch": 1.774074545223254, "ewc_loss": 0.06183817982673645, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029611619538627565, "grad_norm": 7.41031551361084, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.861481249332428, "num_tokens": 531918182.0, "step": 13946 }, { "epoch": 1.7742017555018446, "ewc_loss": 0.0617741197347641, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000295475561870262, "grad_norm": 7.3961968421936035, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8658086657524109, "num_tokens": 531958340.0, "step": 13947 }, { "epoch": 1.774328965780435, "ewc_loss": 0.06186676770448685, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002964020532090217, "grad_norm": 7.4133620262146, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8526549935340881, "num_tokens": 531996951.0, "step": 13948 }, { "epoch": 1.7744561760590256, "ewc_loss": 0.0617801770567894, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002955361269414425, "grad_norm": 7.358703136444092, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.86807781457901, "num_tokens": 532036963.0, "step": 13949 }, { "epoch": 1.774583386337616, "ewc_loss": 0.061836179345846176, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002960961719509214, "grad_norm": 7.427060127258301, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8580156564712524, "num_tokens": 532070937.0, "step": 13950 }, { "epoch": 1.7747105966162064, "ewc_loss": 0.06176704168319702, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029540478135459125, "grad_norm": 7.391382217407227, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8499853014945984, "num_tokens": 532107866.0, "step": 13951 }, { "epoch": 1.774837806894797, "ewc_loss": 0.06189078092575073, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029664215981028974, "grad_norm": 7.373281002044678, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8564543128013611, "num_tokens": 532151485.0, "step": 13952 }, { "epoch": 1.7749650171733875, "ewc_loss": 0.061804067343473434, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002957750402856618, "grad_norm": 7.392525672912598, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8707846999168396, "num_tokens": 532191209.0, "step": 13953 }, { "epoch": 1.775092227451978, "ewc_loss": 0.06192569434642792, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002969913184642792, "grad_norm": 7.35821533203125, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.858144998550415, "num_tokens": 532232219.0, "step": 13954 }, { "epoch": 1.7752194377305686, "ewc_loss": 0.061857931315898895, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029631369397975504, "grad_norm": 7.468670845031738, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8520726561546326, "num_tokens": 532265330.0, "step": 13955 }, { "epoch": 1.775346648009159, "ewc_loss": 0.06188351660966873, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029656951664946973, "grad_norm": 7.508515357971191, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8609414100646973, "num_tokens": 532295964.0, "step": 13956 }, { "epoch": 1.7754738582877496, "ewc_loss": 0.06187693774700165, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029650377109646797, "grad_norm": 7.400019645690918, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8665201663970947, "num_tokens": 532333804.0, "step": 13957 }, { "epoch": 1.7756010685663401, "ewc_loss": 0.06184828653931618, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029621724388562143, "grad_norm": 7.470682144165039, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8678783178329468, "num_tokens": 532373088.0, "step": 13958 }, { "epoch": 1.7757282788449307, "ewc_loss": 0.061655040830373764, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029428477864712477, "grad_norm": 7.3790507316589355, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.861503005027771, "num_tokens": 532409473.0, "step": 13959 }, { "epoch": 1.7758554891235212, "ewc_loss": 0.06188187003135681, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002965530729852617, "grad_norm": 7.4177565574646, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8764005303382874, "num_tokens": 532447883.0, "step": 13960 }, { "epoch": 1.7759826994021117, "ewc_loss": 0.061712831258773804, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029486269340850413, "grad_norm": 7.430079460144043, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8688836693763733, "num_tokens": 532485783.0, "step": 13961 }, { "epoch": 1.7761099096807023, "ewc_loss": 0.06177794188261032, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002955138043034822, "grad_norm": 7.4923505783081055, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8696507215499878, "num_tokens": 532511082.0, "step": 13962 }, { "epoch": 1.7762371199592928, "ewc_loss": 0.06166205555200577, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029435494798235595, "grad_norm": 7.361681938171387, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8821672201156616, "num_tokens": 532555516.0, "step": 13963 }, { "epoch": 1.7763643302378833, "ewc_loss": 0.06175163388252258, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002952507056761533, "grad_norm": 7.473164081573486, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8485884666442871, "num_tokens": 532599200.0, "step": 13964 }, { "epoch": 1.7764915405164738, "ewc_loss": 0.061654672026634216, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029428108246065676, "grad_norm": 7.358091831207275, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.874393105506897, "num_tokens": 532636335.0, "step": 13965 }, { "epoch": 1.7766187507950644, "ewc_loss": 0.06212133914232254, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002965063613373786, "grad_norm": 7.421509265899658, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.876274049282074, "num_tokens": 532678337.0, "step": 13966 }, { "epoch": 1.776745961073655, "ewc_loss": 0.061674073338508606, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029447508859448135, "grad_norm": 7.391173362731934, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8606255054473877, "num_tokens": 532715051.0, "step": 13967 }, { "epoch": 1.7768731713522452, "ewc_loss": 0.06216435506939888, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029693651595152915, "grad_norm": 7.431765556335449, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8748064041137695, "num_tokens": 532750836.0, "step": 13968 }, { "epoch": 1.7770003816308357, "ewc_loss": 0.06201126053929329, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002954055671580136, "grad_norm": 7.409636497497559, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8701140284538269, "num_tokens": 532790331.0, "step": 13969 }, { "epoch": 1.7771275919094263, "ewc_loss": 0.06176096573472023, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029534404166042805, "grad_norm": 7.401320934295654, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8601157069206238, "num_tokens": 532830169.0, "step": 13970 }, { "epoch": 1.7772548021880168, "ewc_loss": 0.061955567449331284, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002948486362583935, "grad_norm": 7.479463577270508, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8580820560455322, "num_tokens": 532865957.0, "step": 13971 }, { "epoch": 1.7773820124666073, "ewc_loss": 0.06163008138537407, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002940351841971278, "grad_norm": 7.339609622955322, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8602887392044067, "num_tokens": 532906112.0, "step": 13972 }, { "epoch": 1.7775092227451978, "ewc_loss": 0.06190788745880127, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002968132321257144, "grad_norm": 7.5248589515686035, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8708353042602539, "num_tokens": 532939676.0, "step": 13973 }, { "epoch": 1.7776364330237884, "ewc_loss": 0.061504386365413666, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002927782479673624, "grad_norm": 7.382288455963135, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8459900617599487, "num_tokens": 532977668.0, "step": 13974 }, { "epoch": 1.7777636433023787, "ewc_loss": 0.06188349425792694, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000296569342026487, "grad_norm": 7.476898670196533, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8610565662384033, "num_tokens": 533013862.0, "step": 13975 }, { "epoch": 1.7778908535809692, "ewc_loss": 0.06158501282334328, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002935845113825053, "grad_norm": 7.366949081420898, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8529852628707886, "num_tokens": 533051222.0, "step": 13976 }, { "epoch": 1.7780180638595597, "ewc_loss": 0.06172961741685867, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002950305351987481, "grad_norm": 7.417738914489746, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8621374368667603, "num_tokens": 533090182.0, "step": 13977 }, { "epoch": 1.7781452741381503, "ewc_loss": 0.06156554073095322, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029338980675674975, "grad_norm": 7.329357624053955, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8767460584640503, "num_tokens": 533129600.0, "step": 13978 }, { "epoch": 1.7782724844167408, "ewc_loss": 0.06181826442480087, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029591700877062976, "grad_norm": 7.414956569671631, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8640478253364563, "num_tokens": 533172351.0, "step": 13979 }, { "epoch": 1.7783996946953313, "ewc_loss": 0.06163863092660904, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029412066214717925, "grad_norm": 7.41872501373291, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.861325740814209, "num_tokens": 533211583.0, "step": 13980 }, { "epoch": 1.7785269049739219, "ewc_loss": 0.06173309311270714, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002950653142761439, "grad_norm": 7.358942985534668, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8707638382911682, "num_tokens": 533249960.0, "step": 13981 }, { "epoch": 1.7786541152525124, "ewc_loss": 0.06181296706199646, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002958640397991985, "grad_norm": 7.4168829917907715, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.871126651763916, "num_tokens": 533287328.0, "step": 13982 }, { "epoch": 1.778781325531103, "ewc_loss": 0.0617830716073513, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029556508525274694, "grad_norm": 7.377915859222412, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8647872805595398, "num_tokens": 533327772.0, "step": 13983 }, { "epoch": 1.7789085358096934, "ewc_loss": 0.06180591136217117, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002957934921141714, "grad_norm": 7.3673505783081055, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8585960865020752, "num_tokens": 533372004.0, "step": 13984 }, { "epoch": 1.779035746088284, "ewc_loss": 0.06183499097824097, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002960842684842646, "grad_norm": 7.451810836791992, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8608110547065735, "num_tokens": 533409502.0, "step": 13985 }, { "epoch": 1.7791629563668745, "ewc_loss": 0.06194569915533066, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002947499742731452, "grad_norm": 7.37873649597168, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8520991802215576, "num_tokens": 533451160.0, "step": 13986 }, { "epoch": 1.779290166645465, "ewc_loss": 0.06181847304105759, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029591910424642265, "grad_norm": 7.435391902923584, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8715705275535583, "num_tokens": 533480413.0, "step": 13987 }, { "epoch": 1.7794173769240555, "ewc_loss": 0.06171524524688721, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002948868495877832, "grad_norm": 7.437536239624023, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8515569567680359, "num_tokens": 533518826.0, "step": 13988 }, { "epoch": 1.779544587202646, "ewc_loss": 0.061785172671079636, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002955860982183367, "grad_norm": 7.327833652496338, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8538191914558411, "num_tokens": 533563760.0, "step": 13989 }, { "epoch": 1.7796717974812366, "ewc_loss": 0.06189016252756119, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002966359897982329, "grad_norm": 7.440042495727539, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8636876940727234, "num_tokens": 533601499.0, "step": 13990 }, { "epoch": 1.7797990077598271, "ewc_loss": 0.061645183712244034, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002941862039733678, "grad_norm": 7.37420654296875, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8683558702468872, "num_tokens": 533640757.0, "step": 13991 }, { "epoch": 1.7799262180384177, "ewc_loss": 0.0619005411863327, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002967398031614721, "grad_norm": 7.377618312835693, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.866459310054779, "num_tokens": 533678047.0, "step": 13992 }, { "epoch": 1.780053428317008, "ewc_loss": 0.0617791973054409, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000295526348054409, "grad_norm": 7.396839141845703, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8634693622589111, "num_tokens": 533720903.0, "step": 13993 }, { "epoch": 1.7801806385955985, "ewc_loss": 0.0618053674697876, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.000295788049697876, "grad_norm": 7.387017726898193, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8663369417190552, "num_tokens": 533759720.0, "step": 13994 }, { "epoch": 1.780307848874189, "ewc_loss": 0.061981696635484695, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002951099304482341, "grad_norm": 7.4274797439575195, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8521621227264404, "num_tokens": 533796122.0, "step": 13995 }, { "epoch": 1.7804350591527796, "ewc_loss": 0.061799656599760056, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029573094798251987, "grad_norm": 7.36895227432251, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8699990510940552, "num_tokens": 533836326.0, "step": 13996 }, { "epoch": 1.78056226943137, "ewc_loss": 0.061769016087055206, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002954245137516409, "grad_norm": 7.391796112060547, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8745142221450806, "num_tokens": 533867874.0, "step": 13997 }, { "epoch": 1.7806894797099606, "ewc_loss": 0.06186595559120178, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002963939041364938, "grad_norm": 7.38199520111084, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8754823207855225, "num_tokens": 533902877.0, "step": 13998 }, { "epoch": 1.780816689988551, "ewc_loss": 0.06176996976137161, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002954340889118612, "grad_norm": 7.3728485107421875, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8685462474822998, "num_tokens": 533937440.0, "step": 13999 }, { "epoch": 1.7809439002671414, "ewc_loss": 0.06187184900045395, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002964528393931687, "grad_norm": 7.4400200843811035, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8734698295593262, "num_tokens": 533972308.0, "step": 14000 }, { "epoch": 1.781071110545732, "ewc_loss": 0.06173459440469742, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002950803027488291, "grad_norm": 7.361507892608643, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8511543273925781, "num_tokens": 534011913.0, "step": 14001 }, { "epoch": 1.7811983208243225, "ewc_loss": 0.06204722821712494, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002957652322947979, "grad_norm": 7.358832359313965, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8746547102928162, "num_tokens": 534057235.0, "step": 14002 }, { "epoch": 1.781325531102913, "ewc_loss": 0.06182805448770523, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002960149140562862, "grad_norm": 7.386473178863525, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8643392324447632, "num_tokens": 534092850.0, "step": 14003 }, { "epoch": 1.7814527413815036, "ewc_loss": 0.06178203970193863, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029555478249676526, "grad_norm": 7.334102630615234, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8688417077064514, "num_tokens": 534135310.0, "step": 14004 }, { "epoch": 1.781579951660094, "ewc_loss": 0.06188080087304115, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029654239187948406, "grad_norm": 7.3962483406066895, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8632837533950806, "num_tokens": 534176008.0, "step": 14005 }, { "epoch": 1.7817071619386846, "ewc_loss": 0.06180571764707565, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029579157126136124, "grad_norm": 7.418722629547119, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8634649515151978, "num_tokens": 534208217.0, "step": 14006 }, { "epoch": 1.7818343722172751, "ewc_loss": 0.06178031861782074, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029553755302913487, "grad_norm": 7.3600172996521, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8675047159194946, "num_tokens": 534247564.0, "step": 14007 }, { "epoch": 1.7819615824958657, "ewc_loss": 0.06189636141061783, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002966980100609362, "grad_norm": 7.426466941833496, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.8466458320617676, "num_tokens": 534281144.0, "step": 14008 }, { "epoch": 1.7820887927744562, "ewc_loss": 0.061839256435632706, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.0002961269346997142, "grad_norm": 7.3947272300720215, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.853493869304657, "num_tokens": 534317893.0, "step": 14009 }, { "epoch": 1.7822160030530467, "ewc_loss": 0.061881229281425476, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029654664103873074, "grad_norm": 7.358182430267334, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8618640303611755, "num_tokens": 534356116.0, "step": 14010 }, { "epoch": 1.7823432133316373, "ewc_loss": 0.061955682933330536, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029729120433330536, "grad_norm": 7.43747615814209, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8377837538719177, "num_tokens": 534390499.0, "step": 14011 }, { "epoch": 1.7824704236102278, "ewc_loss": 0.061797693371772766, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029571130289696157, "grad_norm": 7.394506454467773, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.859619140625, "num_tokens": 534424242.0, "step": 14012 }, { "epoch": 1.7825976338888183, "ewc_loss": 0.06189153715968132, "ewc_loss_diag": 3.218650817871094e-05, "ewc_loss_parallel": 0.00029664975591003895, "grad_norm": 7.433498859405518, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8558841943740845, "num_tokens": 534467465.0, "step": 14013 }, { "epoch": 1.7827248441674088, "ewc_loss": 0.062073227018117905, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029602524591609836, "grad_norm": 7.398187637329102, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8583633303642273, "num_tokens": 534502959.0, "step": 14014 }, { "epoch": 1.7828520544459994, "ewc_loss": 0.062076643109321594, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029605941381305456, "grad_norm": 7.2972731590271, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8652040362358093, "num_tokens": 534549888.0, "step": 14015 }, { "epoch": 1.78297926472459, "ewc_loss": 0.06212987005710602, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029659169376827776, "grad_norm": 7.406355381011963, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8661845326423645, "num_tokens": 534591686.0, "step": 14016 }, { "epoch": 1.7831064750031802, "ewc_loss": 0.062011830508708954, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002954112715087831, "grad_norm": 7.386515140533447, "learning_rate": 1e-06, "loss": 0.4066, "mean_token_accuracy": 0.8809076547622681, "num_tokens": 534628584.0, "step": 14017 }, { "epoch": 1.7832336852817707, "ewc_loss": 0.062124282121658325, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029653581441380084, "grad_norm": 7.39544153213501, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8533294200897217, "num_tokens": 534665684.0, "step": 14018 }, { "epoch": 1.7833608955603613, "ewc_loss": 0.06197638809680939, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029505687416531146, "grad_norm": 7.364558219909668, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8670274019241333, "num_tokens": 534701604.0, "step": 14019 }, { "epoch": 1.7834881058389518, "ewc_loss": 0.062049418687820435, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002957871765829623, "grad_norm": 7.396557331085205, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.849324107170105, "num_tokens": 534741765.0, "step": 14020 }, { "epoch": 1.7836153161175423, "ewc_loss": 0.062031228095293045, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029560524853877723, "grad_norm": 7.317258358001709, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8730921745300293, "num_tokens": 534780331.0, "step": 14021 }, { "epoch": 1.7837425263961328, "ewc_loss": 0.062153447419404984, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002968274347949773, "grad_norm": 7.450717926025391, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8516924977302551, "num_tokens": 534816254.0, "step": 14022 }, { "epoch": 1.7838697366747234, "ewc_loss": 0.061978649348020554, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002950794587377459, "grad_norm": 7.354989051818848, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.861807107925415, "num_tokens": 534851367.0, "step": 14023 }, { "epoch": 1.7839969469533137, "ewc_loss": 0.062221843749284744, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000297511403914541, "grad_norm": 7.37473201751709, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8616620302200317, "num_tokens": 534889077.0, "step": 14024 }, { "epoch": 1.7841241572319042, "ewc_loss": 0.062398761510849, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029683919274248183, "grad_norm": 7.406083106994629, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8532601594924927, "num_tokens": 534924249.0, "step": 14025 }, { "epoch": 1.7842513675104947, "ewc_loss": 0.062297068536281586, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029582224669866264, "grad_norm": 7.376315593719482, "learning_rate": 1e-06, "loss": 0.5085, "mean_token_accuracy": 0.8478666543960571, "num_tokens": 534966370.0, "step": 14026 }, { "epoch": 1.7843785777890853, "ewc_loss": 0.062162019312381744, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029691317467950284, "grad_norm": 7.364670276641846, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8692623972892761, "num_tokens": 535005869.0, "step": 14027 }, { "epoch": 1.7845057880676758, "ewc_loss": 0.062040064483881, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002956936077680439, "grad_norm": 7.368776321411133, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8610221743583679, "num_tokens": 535042057.0, "step": 14028 }, { "epoch": 1.7846329983462663, "ewc_loss": 0.06236521154642105, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002965036837849766, "grad_norm": 7.817480564117432, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8577028512954712, "num_tokens": 535074410.0, "step": 14029 }, { "epoch": 1.7847602086248568, "ewc_loss": 0.06171686202287674, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029246159829199314, "grad_norm": 7.217393398284912, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8545233607292175, "num_tokens": 535120356.0, "step": 14030 }, { "epoch": 1.7848874189034474, "ewc_loss": 0.06256765872240067, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000298528146231547, "grad_norm": 7.530706405639648, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8536902666091919, "num_tokens": 535156939.0, "step": 14031 }, { "epoch": 1.785014629182038, "ewc_loss": 0.06191791221499443, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029203068697825074, "grad_norm": 7.225858211517334, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8660095930099487, "num_tokens": 535195553.0, "step": 14032 }, { "epoch": 1.7851418394606284, "ewc_loss": 0.0627341940999031, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003001935256179422, "grad_norm": 7.490532875061035, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8689014911651611, "num_tokens": 535235692.0, "step": 14033 }, { "epoch": 1.785269049739219, "ewc_loss": 0.06208978593349457, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029374941368587315, "grad_norm": 7.329460620880127, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8614676594734192, "num_tokens": 535269265.0, "step": 14034 }, { "epoch": 1.7853962600178095, "ewc_loss": 0.06257718801498413, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002986234030686319, "grad_norm": 7.433353900909424, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8792990446090698, "num_tokens": 535303825.0, "step": 14035 }, { "epoch": 1.7855234702964, "ewc_loss": 0.06226338446140289, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002954853989649564, "grad_norm": 7.352799892425537, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8565832376480103, "num_tokens": 535338802.0, "step": 14036 }, { "epoch": 1.7856506805749905, "ewc_loss": 0.06241990998387337, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029705066117458045, "grad_norm": 7.421676158905029, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8695461750030518, "num_tokens": 535376592.0, "step": 14037 }, { "epoch": 1.785777890853581, "ewc_loss": 0.06223390996456146, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029519066447392106, "grad_norm": 7.328109264373779, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8598967790603638, "num_tokens": 535416528.0, "step": 14038 }, { "epoch": 1.7859051011321716, "ewc_loss": 0.06236116588115692, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029646322946064174, "grad_norm": 7.417510509490967, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.859505832195282, "num_tokens": 535450892.0, "step": 14039 }, { "epoch": 1.7860323114107621, "ewc_loss": 0.062225550413131714, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029510704916901886, "grad_norm": 7.379944801330566, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8657707571983337, "num_tokens": 535488198.0, "step": 14040 }, { "epoch": 1.7861595216893527, "ewc_loss": 0.06229691207408905, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000295820675091818, "grad_norm": 7.337113857269287, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.861538827419281, "num_tokens": 535526020.0, "step": 14041 }, { "epoch": 1.786286731967943, "ewc_loss": 0.06230790913105011, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002959306293632835, "grad_norm": 7.3824076652526855, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8687779903411865, "num_tokens": 535560400.0, "step": 14042 }, { "epoch": 1.7864139422465335, "ewc_loss": 0.062235087156295776, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029520245152525604, "grad_norm": 7.374166488647461, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.856437623500824, "num_tokens": 535594699.0, "step": 14043 }, { "epoch": 1.786541152525124, "ewc_loss": 0.06232666224241257, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029611820355057716, "grad_norm": 7.381769180297852, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8680713176727295, "num_tokens": 535627019.0, "step": 14044 }, { "epoch": 1.7866683628037145, "ewc_loss": 0.06218899413943291, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002947415050584823, "grad_norm": 7.323258876800537, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.872684121131897, "num_tokens": 535667479.0, "step": 14045 }, { "epoch": 1.786795573082305, "ewc_loss": 0.062313757836818695, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000295989157166332, "grad_norm": 7.390633583068848, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8578382730484009, "num_tokens": 535705211.0, "step": 14046 }, { "epoch": 1.7869227833608956, "ewc_loss": 0.062198705971241, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002948386245407164, "grad_norm": 7.336174964904785, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8439416289329529, "num_tokens": 535744907.0, "step": 14047 }, { "epoch": 1.787049993639486, "ewc_loss": 0.062363073229789734, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002964822924695909, "grad_norm": 7.434834003448486, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8609603047370911, "num_tokens": 535782526.0, "step": 14048 }, { "epoch": 1.7871772039180764, "ewc_loss": 0.06223948299884796, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002952463983092457, "grad_norm": 7.313523292541504, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8789490461349487, "num_tokens": 535824153.0, "step": 14049 }, { "epoch": 1.787304414196667, "ewc_loss": 0.0623578280210495, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029642984736710787, "grad_norm": 7.459341526031494, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8748356103897095, "num_tokens": 535859019.0, "step": 14050 }, { "epoch": 1.7874316244752575, "ewc_loss": 0.062215425074100494, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002950057969428599, "grad_norm": 7.3423967361450195, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8756506443023682, "num_tokens": 535896866.0, "step": 14051 }, { "epoch": 1.787558834753848, "ewc_loss": 0.0624287873506546, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029713945696130395, "grad_norm": 7.457920074462891, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8644392490386963, "num_tokens": 535937128.0, "step": 14052 }, { "epoch": 1.7876860450324386, "ewc_loss": 0.06202460080385208, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002930975751951337, "grad_norm": 7.307374954223633, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8805553913116455, "num_tokens": 535972727.0, "step": 14053 }, { "epoch": 1.787813255311029, "ewc_loss": 0.06250923871994019, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029794397414661944, "grad_norm": 7.460699558258057, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8683399558067322, "num_tokens": 536011491.0, "step": 14054 }, { "epoch": 1.7879404655896196, "ewc_loss": 0.06202484667301178, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002931000490207225, "grad_norm": 7.382816791534424, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8681696653366089, "num_tokens": 536048074.0, "step": 14055 }, { "epoch": 1.7880676758682101, "ewc_loss": 0.0623147077858448, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002959986450150609, "grad_norm": 7.470679759979248, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8719638586044312, "num_tokens": 536089554.0, "step": 14056 }, { "epoch": 1.7881948861468007, "ewc_loss": 0.062073126435279846, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002935828233603388, "grad_norm": 7.284254550933838, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8735867738723755, "num_tokens": 536127672.0, "step": 14057 }, { "epoch": 1.7883220964253912, "ewc_loss": 0.06238479167222977, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002966994943562895, "grad_norm": 7.459634304046631, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8643324971199036, "num_tokens": 536157517.0, "step": 14058 }, { "epoch": 1.7884493067039817, "ewc_loss": 0.06209387630224228, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029379030456766486, "grad_norm": 7.352840900421143, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8769688606262207, "num_tokens": 536192855.0, "step": 14059 }, { "epoch": 1.7885765169825723, "ewc_loss": 0.062378283590078354, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002966343890875578, "grad_norm": 7.420081615447998, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8653699159622192, "num_tokens": 536231727.0, "step": 14060 }, { "epoch": 1.7887037272611628, "ewc_loss": 0.06216863542795181, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000294537894660607, "grad_norm": 7.306813716888428, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8775277137756348, "num_tokens": 536270738.0, "step": 14061 }, { "epoch": 1.7888309375397533, "ewc_loss": 0.06225203722715378, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002978133561555296, "grad_norm": 7.417741775512695, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8819514513015747, "num_tokens": 536313006.0, "step": 14062 }, { "epoch": 1.7889581478183438, "ewc_loss": 0.0619814395904541, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029510734020732343, "grad_norm": 7.406753063201904, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8585048317909241, "num_tokens": 536350697.0, "step": 14063 }, { "epoch": 1.7890853580969344, "ewc_loss": 0.062103062868118286, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002963236183859408, "grad_norm": 7.505467414855957, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8719987273216248, "num_tokens": 536394252.0, "step": 14064 }, { "epoch": 1.789212568375525, "ewc_loss": 0.0621642991900444, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002944945590570569, "grad_norm": 7.514711856842041, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8623867034912109, "num_tokens": 536426591.0, "step": 14065 }, { "epoch": 1.7893397786541152, "ewc_loss": 0.06194237619638443, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029471670859493315, "grad_norm": 7.415822505950928, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8518111705780029, "num_tokens": 536468567.0, "step": 14066 }, { "epoch": 1.7894669889327057, "ewc_loss": 0.0620114728808403, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002954076917376369, "grad_norm": 7.421656131744385, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8546947836875916, "num_tokens": 536506352.0, "step": 14067 }, { "epoch": 1.7895941992112963, "ewc_loss": 0.061900824308395386, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029430119320750237, "grad_norm": 7.470337867736816, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8575787544250488, "num_tokens": 536538122.0, "step": 14068 }, { "epoch": 1.7897214094898868, "ewc_loss": 0.06201484799385071, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002954414230771363, "grad_norm": 7.507051467895508, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8558211326599121, "num_tokens": 536574617.0, "step": 14069 }, { "epoch": 1.7898486197684773, "ewc_loss": 0.061858564615249634, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029387863469310105, "grad_norm": 7.344719409942627, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8766130208969116, "num_tokens": 536611895.0, "step": 14070 }, { "epoch": 1.7899758300470678, "ewc_loss": 0.06194235011935234, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002947164757642895, "grad_norm": 7.3929009437561035, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8572001457214355, "num_tokens": 536651336.0, "step": 14071 }, { "epoch": 1.7901030403256584, "ewc_loss": 0.061934083700180054, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002946337917819619, "grad_norm": 7.379376411437988, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.859626293182373, "num_tokens": 536692566.0, "step": 14072 }, { "epoch": 1.7902302506042487, "ewc_loss": 0.062284886837005615, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002957004471682012, "grad_norm": 7.432309150695801, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8700240850448608, "num_tokens": 536730855.0, "step": 14073 }, { "epoch": 1.7903574608828392, "ewc_loss": 0.062136732041835785, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029421885847114027, "grad_norm": 7.3995585441589355, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8611576557159424, "num_tokens": 536766478.0, "step": 14074 }, { "epoch": 1.7904846711614297, "ewc_loss": 0.062295712530612946, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029580871341750026, "grad_norm": 7.465941905975342, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8580120801925659, "num_tokens": 536799900.0, "step": 14075 }, { "epoch": 1.7906118814400203, "ewc_loss": 0.062182895839214325, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000294680503429845, "grad_norm": 7.3967413902282715, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8709211945533752, "num_tokens": 536843597.0, "step": 14076 }, { "epoch": 1.7907390917186108, "ewc_loss": 0.06225743517279625, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029542591073550284, "grad_norm": 7.467372417449951, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8754357695579529, "num_tokens": 536879189.0, "step": 14077 }, { "epoch": 1.7908663019972013, "ewc_loss": 0.06221771240234375, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002950286725535989, "grad_norm": 7.388055324554443, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8685799241065979, "num_tokens": 536918099.0, "step": 14078 }, { "epoch": 1.7909935122757918, "ewc_loss": 0.06227724254131317, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002956239623017609, "grad_norm": 7.466967582702637, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8545373678207397, "num_tokens": 536954410.0, "step": 14079 }, { "epoch": 1.7911207225543824, "ewc_loss": 0.06187063083052635, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002939992700703442, "grad_norm": 7.353323936462402, "learning_rate": 1e-06, "loss": 0.3913, "mean_token_accuracy": 0.8858097791671753, "num_tokens": 536986688.0, "step": 14080 }, { "epoch": 1.791247932832973, "ewc_loss": 0.06245432049036026, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002973947557620704, "grad_norm": 7.474864959716797, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8654348850250244, "num_tokens": 537024181.0, "step": 14081 }, { "epoch": 1.7913751431115634, "ewc_loss": 0.06205665320158005, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002934180956799537, "grad_norm": 7.336164951324463, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8682547807693481, "num_tokens": 537064174.0, "step": 14082 }, { "epoch": 1.791502353390154, "ewc_loss": 0.062474727630615234, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002975988609250635, "grad_norm": 7.46303653717041, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8566514253616333, "num_tokens": 537100279.0, "step": 14083 }, { "epoch": 1.7916295636687445, "ewc_loss": 0.062069058418273926, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002935421362053603, "grad_norm": 7.427866458892822, "learning_rate": 1e-06, "loss": 0.526, "mean_token_accuracy": 0.8414575457572937, "num_tokens": 537135339.0, "step": 14084 }, { "epoch": 1.791756773947335, "ewc_loss": 0.06232722848653793, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002961238205898553, "grad_norm": 7.427887439727783, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8574682474136353, "num_tokens": 537169608.0, "step": 14085 }, { "epoch": 1.7918839842259255, "ewc_loss": 0.062208376824855804, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029493533656932414, "grad_norm": 7.395771503448486, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8732671737670898, "num_tokens": 537205560.0, "step": 14086 }, { "epoch": 1.792011194504516, "ewc_loss": 0.06223692744970322, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002952208451461047, "grad_norm": 7.461910724639893, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8526296019554138, "num_tokens": 537238697.0, "step": 14087 }, { "epoch": 1.7921384047831066, "ewc_loss": 0.062216706573963165, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029501860262826085, "grad_norm": 7.360860347747803, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8599615097045898, "num_tokens": 537282365.0, "step": 14088 }, { "epoch": 1.7922656150616971, "ewc_loss": 0.06229540705680847, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002958056575153023, "grad_norm": 7.4487104415893555, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8721668720245361, "num_tokens": 537320805.0, "step": 14089 }, { "epoch": 1.7923928253402877, "ewc_loss": 0.062186114490032196, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002947127213701606, "grad_norm": 7.441089630126953, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8670352697372437, "num_tokens": 537354607.0, "step": 14090 }, { "epoch": 1.792520035618878, "ewc_loss": 0.06228933483362198, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029574488871730864, "grad_norm": 7.512077331542969, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8756335973739624, "num_tokens": 537386672.0, "step": 14091 }, { "epoch": 1.7926472458974685, "ewc_loss": 0.062190763652324677, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002947592001874, "grad_norm": 7.4304914474487305, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.866644024848938, "num_tokens": 537430882.0, "step": 14092 }, { "epoch": 1.792774456176059, "ewc_loss": 0.062173739075660706, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029458897188305855, "grad_norm": 7.43468713760376, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8668937683105469, "num_tokens": 537465114.0, "step": 14093 }, { "epoch": 1.7929016664546495, "ewc_loss": 0.06218908354640007, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029474240727722645, "grad_norm": 7.42970085144043, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8584300875663757, "num_tokens": 537502820.0, "step": 14094 }, { "epoch": 1.79302887673324, "ewc_loss": 0.06220608204603195, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029491240275092423, "grad_norm": 7.436978340148926, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8724620938301086, "num_tokens": 537537422.0, "step": 14095 }, { "epoch": 1.7931560870118306, "ewc_loss": 0.062093593180179596, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000293787510599941, "grad_norm": 7.472362995147705, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8691287636756897, "num_tokens": 537566828.0, "step": 14096 }, { "epoch": 1.793283297290421, "ewc_loss": 0.06220758706331253, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002949274203274399, "grad_norm": 7.4507060050964355, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8624475002288818, "num_tokens": 537606102.0, "step": 14097 }, { "epoch": 1.7934105075690114, "ewc_loss": 0.062057070434093475, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029342228663153946, "grad_norm": 7.3743205070495605, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8709793090820312, "num_tokens": 537638527.0, "step": 14098 }, { "epoch": 1.793537717847602, "ewc_loss": 0.062050752341747284, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002933591022156179, "grad_norm": 7.290339946746826, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8633310198783875, "num_tokens": 537683375.0, "step": 14099 }, { "epoch": 1.7936649281261925, "ewc_loss": 0.06229216232895851, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002957731776405126, "grad_norm": 7.389465808868408, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8595341444015503, "num_tokens": 537731354.0, "step": 14100 }, { "epoch": 1.793792138404783, "ewc_loss": 0.06205757334828377, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000293427292490378, "grad_norm": 7.396485328674316, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8606271743774414, "num_tokens": 537770055.0, "step": 14101 }, { "epoch": 1.7939193486833735, "ewc_loss": 0.06230074167251587, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029585897573269904, "grad_norm": 7.462925910949707, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8428975939750671, "num_tokens": 537807977.0, "step": 14102 }, { "epoch": 1.794046558961964, "ewc_loss": 0.06208972632884979, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029374880250543356, "grad_norm": 7.376260280609131, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8593476414680481, "num_tokens": 537847809.0, "step": 14103 }, { "epoch": 1.7941737692405546, "ewc_loss": 0.062299929559230804, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002958508557640016, "grad_norm": 7.399952411651611, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.845229983329773, "num_tokens": 537885506.0, "step": 14104 }, { "epoch": 1.7943009795191451, "ewc_loss": 0.062134962528944016, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029420119244605303, "grad_norm": 7.471884250640869, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8592564463615417, "num_tokens": 537918936.0, "step": 14105 }, { "epoch": 1.7944281897977357, "ewc_loss": 0.06225758418440819, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029542739503085613, "grad_norm": 7.4167633056640625, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8685874938964844, "num_tokens": 537954205.0, "step": 14106 }, { "epoch": 1.7945554000763262, "ewc_loss": 0.06219060346484184, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029475759947672486, "grad_norm": 7.38191032409668, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8679483532905579, "num_tokens": 537989190.0, "step": 14107 }, { "epoch": 1.7946826103549167, "ewc_loss": 0.062168676406145096, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029453833121806383, "grad_norm": 7.40097713470459, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.855605959892273, "num_tokens": 538028852.0, "step": 14108 }, { "epoch": 1.7948098206335072, "ewc_loss": 0.06222955137491226, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002951470669358969, "grad_norm": 7.4249114990234375, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8698431849479675, "num_tokens": 538061245.0, "step": 14109 }, { "epoch": 1.7949370309120978, "ewc_loss": 0.062149323523044586, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000294344819849357, "grad_norm": 7.35081672668457, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8773025870323181, "num_tokens": 538103238.0, "step": 14110 }, { "epoch": 1.7950642411906883, "ewc_loss": 0.062239013612270355, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029524171259254217, "grad_norm": 7.448986053466797, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8509472012519836, "num_tokens": 538142692.0, "step": 14111 }, { "epoch": 1.7951914514692788, "ewc_loss": 0.062192972749471664, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029478128999471664, "grad_norm": 7.3551740646362305, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8623749017715454, "num_tokens": 538183155.0, "step": 14112 }, { "epoch": 1.7953186617478694, "ewc_loss": 0.062433741986751556, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002971889916807413, "grad_norm": 7.471632957458496, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8624606132507324, "num_tokens": 538222063.0, "step": 14113 }, { "epoch": 1.7954458720264599, "ewc_loss": 0.06217425316572189, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029459409415721893, "grad_norm": 7.395676612854004, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8653943538665771, "num_tokens": 538259847.0, "step": 14114 }, { "epoch": 1.7955730823050502, "ewc_loss": 0.06287213414907455, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.000296690093819052, "grad_norm": 7.453349590301514, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8701279759407043, "num_tokens": 538298557.0, "step": 14115 }, { "epoch": 1.7957002925836407, "ewc_loss": 0.06220386177301407, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002948901674244553, "grad_norm": 7.3754706382751465, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8608333468437195, "num_tokens": 538337926.0, "step": 14116 }, { "epoch": 1.7958275028622313, "ewc_loss": 0.0623607262969017, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002964588056784123, "grad_norm": 7.363958358764648, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8690720200538635, "num_tokens": 538382755.0, "step": 14117 }, { "epoch": 1.7959547131408218, "ewc_loss": 0.06232413277029991, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002960928832180798, "grad_norm": 7.429880142211914, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8432374000549316, "num_tokens": 538417639.0, "step": 14118 }, { "epoch": 1.7960819234194123, "ewc_loss": 0.06224354729056358, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002952870272565633, "grad_norm": 7.380119800567627, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8588269948959351, "num_tokens": 538458444.0, "step": 14119 }, { "epoch": 1.7962091336980028, "ewc_loss": 0.06233487278223038, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029620027635246515, "grad_norm": 7.407076358795166, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8660691380500793, "num_tokens": 538496580.0, "step": 14120 }, { "epoch": 1.7963363439765934, "ewc_loss": 0.06227600574493408, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029561162227764726, "grad_norm": 7.388660430908203, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.873327910900116, "num_tokens": 538538474.0, "step": 14121 }, { "epoch": 1.7964635542551837, "ewc_loss": 0.06232905015349388, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002961420686915517, "grad_norm": 7.405869007110596, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8659440279006958, "num_tokens": 538576828.0, "step": 14122 }, { "epoch": 1.7965907645337742, "ewc_loss": 0.062393203377723694, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000296783575322479, "grad_norm": 7.407883644104004, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8712416887283325, "num_tokens": 538611630.0, "step": 14123 }, { "epoch": 1.7967179748123647, "ewc_loss": 0.06206410750746727, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029593403451144695, "grad_norm": 7.393320560455322, "learning_rate": 1e-06, "loss": 0.5481, "mean_token_accuracy": 0.8401871919631958, "num_tokens": 538659537.0, "step": 14124 }, { "epoch": 1.7968451850909553, "ewc_loss": 0.062124624848365784, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002965391904581338, "grad_norm": 7.450153350830078, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8724339008331299, "num_tokens": 538700216.0, "step": 14125 }, { "epoch": 1.7969723953695458, "ewc_loss": 0.06230735033750534, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002959250414278358, "grad_norm": 7.456882953643799, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8626018166542053, "num_tokens": 538737719.0, "step": 14126 }, { "epoch": 1.7970996056481363, "ewc_loss": 0.06204702705144882, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029576325323432684, "grad_norm": 7.387950420379639, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8715436458587646, "num_tokens": 538774723.0, "step": 14127 }, { "epoch": 1.7972268159267268, "ewc_loss": 0.06236732006072998, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002965247549582273, "grad_norm": 7.440047740936279, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8734645843505859, "num_tokens": 538815288.0, "step": 14128 }, { "epoch": 1.7973540262053174, "ewc_loss": 0.0621679425239563, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029453099705278873, "grad_norm": 7.3513383865356445, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8637728691101074, "num_tokens": 538858239.0, "step": 14129 }, { "epoch": 1.797481236483908, "ewc_loss": 0.06238473951816559, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029669894138351083, "grad_norm": 7.494196891784668, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8667431473731995, "num_tokens": 538896947.0, "step": 14130 }, { "epoch": 1.7976084467624984, "ewc_loss": 0.062078431248664856, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000293635850539431, "grad_norm": 7.356269359588623, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8617867231369019, "num_tokens": 538936511.0, "step": 14131 }, { "epoch": 1.797735657041089, "ewc_loss": 0.06247210130095482, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029757258016616106, "grad_norm": 7.542481899261475, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8570431470870972, "num_tokens": 538969198.0, "step": 14132 }, { "epoch": 1.7978628673196795, "ewc_loss": 0.062010522931814194, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029295679996721447, "grad_norm": 7.339238166809082, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8751660585403442, "num_tokens": 539002258.0, "step": 14133 }, { "epoch": 1.79799007759827, "ewc_loss": 0.06254604458808899, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002983119629789144, "grad_norm": 7.487530708312988, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8505298495292664, "num_tokens": 539039993.0, "step": 14134 }, { "epoch": 1.7981172878768605, "ewc_loss": 0.06216142326593399, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002944657753687352, "grad_norm": 7.405827045440674, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.860855221748352, "num_tokens": 539080713.0, "step": 14135 }, { "epoch": 1.798244498155451, "ewc_loss": 0.06240345537662506, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002968861081171781, "grad_norm": 7.434080600738525, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8750039935112, "num_tokens": 539121115.0, "step": 14136 }, { "epoch": 1.7983717084340416, "ewc_loss": 0.06224021315574646, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029525370337069035, "grad_norm": 7.3701910972595215, "learning_rate": 1e-06, "loss": 0.4051, "mean_token_accuracy": 0.8830102682113647, "num_tokens": 539159889.0, "step": 14137 }, { "epoch": 1.7984989187126321, "ewc_loss": 0.06244642660021782, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029731582617387176, "grad_norm": 7.524448394775391, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8635123372077942, "num_tokens": 539201508.0, "step": 14138 }, { "epoch": 1.7986261289912227, "ewc_loss": 0.06212172284722328, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029406879912130535, "grad_norm": 7.374305248260498, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.846670389175415, "num_tokens": 539238640.0, "step": 14139 }, { "epoch": 1.798753339269813, "ewc_loss": 0.06237895041704178, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002966410538647324, "grad_norm": 7.484066486358643, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8597784042358398, "num_tokens": 539274797.0, "step": 14140 }, { "epoch": 1.7988805495484035, "ewc_loss": 0.06219342723488808, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002947858301922679, "grad_norm": 7.426388263702393, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8525888919830322, "num_tokens": 539308410.0, "step": 14141 }, { "epoch": 1.799007759826994, "ewc_loss": 0.06241174787282944, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002969690249301493, "grad_norm": 7.46311616897583, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8578747510910034, "num_tokens": 539345578.0, "step": 14142 }, { "epoch": 1.7991349701055845, "ewc_loss": 0.06221553683280945, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002950069319922477, "grad_norm": 7.461766242980957, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8490780591964722, "num_tokens": 539378946.0, "step": 14143 }, { "epoch": 1.799262180384175, "ewc_loss": 0.06222255900502205, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002950771595351398, "grad_norm": 7.392282485961914, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8773325681686401, "num_tokens": 539417663.0, "step": 14144 }, { "epoch": 1.7993893906627656, "ewc_loss": 0.06226687133312225, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000295520294457674, "grad_norm": 7.395452976226807, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8788001537322998, "num_tokens": 539459597.0, "step": 14145 }, { "epoch": 1.799516600941356, "ewc_loss": 0.0622287392616272, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029513894696719944, "grad_norm": 7.4000725746154785, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8665440678596497, "num_tokens": 539499579.0, "step": 14146 }, { "epoch": 1.7996438112199464, "ewc_loss": 0.06242331489920616, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029708471265621483, "grad_norm": 7.471121311187744, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8746671676635742, "num_tokens": 539538505.0, "step": 14147 }, { "epoch": 1.799771021498537, "ewc_loss": 0.062222301959991455, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002950745983980596, "grad_norm": 7.403642654418945, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8575516939163208, "num_tokens": 539574824.0, "step": 14148 }, { "epoch": 1.7998982317771275, "ewc_loss": 0.06241067871451378, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002969583438243717, "grad_norm": 7.471311569213867, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8519412875175476, "num_tokens": 539614181.0, "step": 14149 }, { "epoch": 1.800025442055718, "ewc_loss": 0.06223123520612717, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002951639180537313, "grad_norm": 7.375679969787598, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8519600629806519, "num_tokens": 539656713.0, "step": 14150 }, { "epoch": 1.8001526523343085, "ewc_loss": 0.06249696761369705, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002978212432935834, "grad_norm": 7.481364727020264, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8626289963722229, "num_tokens": 539695436.0, "step": 14151 }, { "epoch": 1.800279862612899, "ewc_loss": 0.062190569937229156, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002947572502307594, "grad_norm": 7.346264362335205, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8666839003562927, "num_tokens": 539736549.0, "step": 14152 }, { "epoch": 1.8004070728914896, "ewc_loss": 0.06251447647809982, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002979963319376111, "grad_norm": 7.460068702697754, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8689344525337219, "num_tokens": 539776326.0, "step": 14153 }, { "epoch": 1.8005342831700801, "ewc_loss": 0.06219039857387543, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002947555622085929, "grad_norm": 7.358310222625732, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8715813159942627, "num_tokens": 539819665.0, "step": 14154 }, { "epoch": 1.8006614934486707, "ewc_loss": 0.06253127008676529, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002981642901431769, "grad_norm": 7.497615814208984, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8639781475067139, "num_tokens": 539858175.0, "step": 14155 }, { "epoch": 1.8007887037272612, "ewc_loss": 0.0621749609708786, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002946011954918504, "grad_norm": 7.369567394256592, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8637948036193848, "num_tokens": 539895152.0, "step": 14156 }, { "epoch": 1.8009159140058517, "ewc_loss": 0.06253570318222046, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029820858617313206, "grad_norm": 7.460910797119141, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8689335584640503, "num_tokens": 539941033.0, "step": 14157 }, { "epoch": 1.8010431242844422, "ewc_loss": 0.06227511167526245, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002956026582978666, "grad_norm": 7.391439914703369, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8663275837898254, "num_tokens": 539977959.0, "step": 14158 }, { "epoch": 1.8011703345630328, "ewc_loss": 0.062459878623485565, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002974503440782428, "grad_norm": 7.556149005889893, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.86285400390625, "num_tokens": 540014857.0, "step": 14159 }, { "epoch": 1.8012975448416233, "ewc_loss": 0.06219074875116348, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002947590546682477, "grad_norm": 7.373044967651367, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8615315556526184, "num_tokens": 540055775.0, "step": 14160 }, { "epoch": 1.8014247551202138, "ewc_loss": 0.06232200562953949, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029851304134353995, "grad_norm": 7.5248870849609375, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.872217059135437, "num_tokens": 540090287.0, "step": 14161 }, { "epoch": 1.8015519653988044, "ewc_loss": 0.06191025674343109, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029439551872201264, "grad_norm": 7.345942497253418, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8721250295639038, "num_tokens": 540129139.0, "step": 14162 }, { "epoch": 1.8016791756773949, "ewc_loss": 0.06227739155292511, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002980669087264687, "grad_norm": 7.5527873039245605, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8639670610427856, "num_tokens": 540165187.0, "step": 14163 }, { "epoch": 1.8018063859559852, "ewc_loss": 0.06194434314966202, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002947363827843219, "grad_norm": 7.335014343261719, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8760521411895752, "num_tokens": 540201113.0, "step": 14164 }, { "epoch": 1.8019335962345757, "ewc_loss": 0.06258032470941544, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029865483520552516, "grad_norm": 7.495057582855225, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8651295304298401, "num_tokens": 540237448.0, "step": 14165 }, { "epoch": 1.8020608065131662, "ewc_loss": 0.06198061630129814, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029509913292713463, "grad_norm": 7.428608417510986, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8573132157325745, "num_tokens": 540279703.0, "step": 14166 }, { "epoch": 1.8021880167917568, "ewc_loss": 0.06221770867705345, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000297470047371462, "grad_norm": 7.486110210418701, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8443514108657837, "num_tokens": 540313071.0, "step": 14167 }, { "epoch": 1.8023152270703473, "ewc_loss": 0.062008220702409744, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029537518275901675, "grad_norm": 7.380021095275879, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8686026930809021, "num_tokens": 540353940.0, "step": 14168 }, { "epoch": 1.8024424373489378, "ewc_loss": 0.06219920516014099, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002972850052174181, "grad_norm": 7.506014823913574, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8600908517837524, "num_tokens": 540391241.0, "step": 14169 }, { "epoch": 1.8025696476275284, "ewc_loss": 0.062095604836940765, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002962490252684802, "grad_norm": 7.4899468421936035, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8763635158538818, "num_tokens": 540420205.0, "step": 14170 }, { "epoch": 1.8026968579061187, "ewc_loss": 0.062022969126701355, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000295522651867941, "grad_norm": 7.405207633972168, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8686699271202087, "num_tokens": 540452121.0, "step": 14171 }, { "epoch": 1.8028240681847092, "ewc_loss": 0.06248589605093002, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002977105032186955, "grad_norm": 7.462797164916992, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8645830750465393, "num_tokens": 540487415.0, "step": 14172 }, { "epoch": 1.8029512784632997, "ewc_loss": 0.062014803290367126, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029544098651967943, "grad_norm": 7.448598384857178, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8539040088653564, "num_tokens": 540522293.0, "step": 14173 }, { "epoch": 1.8030784887418903, "ewc_loss": 0.06242549419403076, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002971065114252269, "grad_norm": 7.464915752410889, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8761035799980164, "num_tokens": 540560345.0, "step": 14174 }, { "epoch": 1.8032056990204808, "ewc_loss": 0.06233559548854828, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002962075232062489, "grad_norm": 7.477080345153809, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8728748559951782, "num_tokens": 540598299.0, "step": 14175 }, { "epoch": 1.8033329092990713, "ewc_loss": 0.06229528412222862, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029580440605059266, "grad_norm": 7.378572940826416, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8754788041114807, "num_tokens": 540634480.0, "step": 14176 }, { "epoch": 1.8034601195776618, "ewc_loss": 0.06234000623226166, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029625161550939083, "grad_norm": 7.422576427459717, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8560534119606018, "num_tokens": 540679473.0, "step": 14177 }, { "epoch": 1.8035873298562524, "ewc_loss": 0.06229616701602936, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002958132536150515, "grad_norm": 7.384359836578369, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8580789566040039, "num_tokens": 540722259.0, "step": 14178 }, { "epoch": 1.803714540134843, "ewc_loss": 0.06244821101427078, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029733366682194173, "grad_norm": 7.498107433319092, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8686743378639221, "num_tokens": 540758058.0, "step": 14179 }, { "epoch": 1.8038417504134334, "ewc_loss": 0.062166035175323486, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002945119049400091, "grad_norm": 7.372840881347656, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8590116500854492, "num_tokens": 540798535.0, "step": 14180 }, { "epoch": 1.803968960692024, "ewc_loss": 0.06247255578637123, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002975771203637123, "grad_norm": 7.410104751586914, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8640397787094116, "num_tokens": 540839423.0, "step": 14181 }, { "epoch": 1.8040961709706145, "ewc_loss": 0.06227736175060272, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029562515555880964, "grad_norm": 7.38309907913208, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8699103593826294, "num_tokens": 540887512.0, "step": 14182 }, { "epoch": 1.804223381249205, "ewc_loss": 0.06242350488901138, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002970866044051945, "grad_norm": 7.509230136871338, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8677345514297485, "num_tokens": 540922292.0, "step": 14183 }, { "epoch": 1.8043505915277955, "ewc_loss": 0.06222156435251236, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002950672060251236, "grad_norm": 7.2947916984558105, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8713998198509216, "num_tokens": 540964105.0, "step": 14184 }, { "epoch": 1.804477801806386, "ewc_loss": 0.062376804649829865, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002990610373672098, "grad_norm": 7.539859771728516, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8631128072738647, "num_tokens": 541001793.0, "step": 14185 }, { "epoch": 1.8046050120849766, "ewc_loss": 0.06229995936155319, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029585117590613663, "grad_norm": 7.453775405883789, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8700994253158569, "num_tokens": 541041104.0, "step": 14186 }, { "epoch": 1.8047322223635671, "ewc_loss": 0.06255127489566803, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002983643498737365, "grad_norm": 7.9164137840271, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8586568832397461, "num_tokens": 541079643.0, "step": 14187 }, { "epoch": 1.8048594326421576, "ewc_loss": 0.06210581958293915, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029390977579168975, "grad_norm": 7.692181587219238, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8700401782989502, "num_tokens": 541118554.0, "step": 14188 }, { "epoch": 1.804986642920748, "ewc_loss": 0.061741650104522705, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002927094465121627, "grad_norm": 7.4468183517456055, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8527361154556274, "num_tokens": 541156893.0, "step": 14189 }, { "epoch": 1.8051138531993385, "ewc_loss": 0.06165628880262375, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002918558311648667, "grad_norm": 7.435923099517822, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8811945915222168, "num_tokens": 541197283.0, "step": 14190 }, { "epoch": 1.805241063477929, "ewc_loss": 0.061860594898462296, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002938989200629294, "grad_norm": 7.670461654663086, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8784958124160767, "num_tokens": 541232343.0, "step": 14191 }, { "epoch": 1.8053682737565195, "ewc_loss": 0.06153424456715584, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029063542024232447, "grad_norm": 7.34112024307251, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8562387824058533, "num_tokens": 541270274.0, "step": 14192 }, { "epoch": 1.80549548403511, "ewc_loss": 0.062065690755844116, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029594989609904587, "grad_norm": 7.621993541717529, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8594121336936951, "num_tokens": 541306018.0, "step": 14193 }, { "epoch": 1.8056226943137006, "ewc_loss": 0.06181348115205765, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029098635422997177, "grad_norm": 7.337869167327881, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8648361563682556, "num_tokens": 541344974.0, "step": 14194 }, { "epoch": 1.805749904592291, "ewc_loss": 0.06228554993867874, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002981484867632389, "grad_norm": 7.576868057250977, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8743992447853088, "num_tokens": 541378590.0, "step": 14195 }, { "epoch": 1.8058771148708814, "ewc_loss": 0.061941102147102356, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029226255719549954, "grad_norm": 7.322849750518799, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8670183420181274, "num_tokens": 541415945.0, "step": 14196 }, { "epoch": 1.806004325149472, "ewc_loss": 0.06255039572715759, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029835556051693857, "grad_norm": 7.539309501647949, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8618896007537842, "num_tokens": 541459577.0, "step": 14197 }, { "epoch": 1.8061315354280625, "ewc_loss": 0.06208362430334091, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002936878299806267, "grad_norm": 7.35006856918335, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8649570941925049, "num_tokens": 541501041.0, "step": 14198 }, { "epoch": 1.806258745706653, "ewc_loss": 0.06247029826045036, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002975545357912779, "grad_norm": 7.532994270324707, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8582005500793457, "num_tokens": 541538587.0, "step": 14199 }, { "epoch": 1.8063859559852435, "ewc_loss": 0.061889197677373886, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029418495250865817, "grad_norm": 7.335122585296631, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8509721755981445, "num_tokens": 541582225.0, "step": 14200 }, { "epoch": 1.806513166263834, "ewc_loss": 0.06240663677453995, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029935932252556086, "grad_norm": 7.5259857177734375, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.873738706111908, "num_tokens": 541616956.0, "step": 14201 }, { "epoch": 1.8066403765424246, "ewc_loss": 0.06215883791446686, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002944399311672896, "grad_norm": 7.356368064880371, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8542617559432983, "num_tokens": 541650814.0, "step": 14202 }, { "epoch": 1.8067675868210151, "ewc_loss": 0.06263194978237152, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002991710207425058, "grad_norm": 7.539252758026123, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8736832141876221, "num_tokens": 541692167.0, "step": 14203 }, { "epoch": 1.8068947970996057, "ewc_loss": 0.06229673698544502, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029581892886199057, "grad_norm": 7.436790943145752, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8545326590538025, "num_tokens": 541732427.0, "step": 14204 }, { "epoch": 1.8070220073781962, "ewc_loss": 0.0625191330909729, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002980428689625114, "grad_norm": 7.447743892669678, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8646210432052612, "num_tokens": 541773256.0, "step": 14205 }, { "epoch": 1.8071492176567867, "ewc_loss": 0.062455832958221436, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002974098897539079, "grad_norm": 7.399333477020264, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8706472516059875, "num_tokens": 541815008.0, "step": 14206 }, { "epoch": 1.8072764279353772, "ewc_loss": 0.06256857514381409, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029853731393814087, "grad_norm": 7.590062141418457, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8674888610839844, "num_tokens": 541853462.0, "step": 14207 }, { "epoch": 1.8074036382139678, "ewc_loss": 0.062436915934085846, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029722071485593915, "grad_norm": 7.473474979400635, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8602781891822815, "num_tokens": 541890487.0, "step": 14208 }, { "epoch": 1.8075308484925583, "ewc_loss": 0.06251755356788635, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002980270655825734, "grad_norm": 7.468821048736572, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8685320019721985, "num_tokens": 541929857.0, "step": 14209 }, { "epoch": 1.8076580587711488, "ewc_loss": 0.062468208372592926, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029753363924100995, "grad_norm": 7.4480204582214355, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8570394515991211, "num_tokens": 541969923.0, "step": 14210 }, { "epoch": 1.8077852690497394, "ewc_loss": 0.06254437565803528, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029829528648406267, "grad_norm": 7.513406276702881, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8740662932395935, "num_tokens": 542008802.0, "step": 14211 }, { "epoch": 1.8079124793283299, "ewc_loss": 0.06246485561132431, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029750014073215425, "grad_norm": 7.443898677825928, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8785215020179749, "num_tokens": 542049276.0, "step": 14212 }, { "epoch": 1.8080396896069202, "ewc_loss": 0.06248058006167412, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002976573596242815, "grad_norm": 7.449365139007568, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8579438328742981, "num_tokens": 542086921.0, "step": 14213 }, { "epoch": 1.8081668998855107, "ewc_loss": 0.06246456503868103, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002974972012452781, "grad_norm": 7.433717250823975, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8645214438438416, "num_tokens": 542124291.0, "step": 14214 }, { "epoch": 1.8082941101641012, "ewc_loss": 0.06251455843448639, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002979971468448639, "grad_norm": 7.503028869628906, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8530964851379395, "num_tokens": 542167600.0, "step": 14215 }, { "epoch": 1.8084213204426918, "ewc_loss": 0.06238764524459839, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002967280161101371, "grad_norm": 7.511531829833984, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8585397601127625, "num_tokens": 542201134.0, "step": 14216 }, { "epoch": 1.8085485307212823, "ewc_loss": 0.062214307487010956, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002974360540974885, "grad_norm": 7.453639507293701, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8518109917640686, "num_tokens": 542242590.0, "step": 14217 }, { "epoch": 1.8086757409998728, "ewc_loss": 0.06240314990282059, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002968830813188106, "grad_norm": 7.41896390914917, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.863702654838562, "num_tokens": 542281945.0, "step": 14218 }, { "epoch": 1.8088029512784631, "ewc_loss": 0.062457963824272156, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029743119375780225, "grad_norm": 7.471315860748291, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8732249736785889, "num_tokens": 542319389.0, "step": 14219 }, { "epoch": 1.8089301615570537, "ewc_loss": 0.06221972405910492, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029749018722213805, "grad_norm": 7.4486775398254395, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8518518209457397, "num_tokens": 542356912.0, "step": 14220 }, { "epoch": 1.8090573718356442, "ewc_loss": 0.06251710653305054, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029802261269651353, "grad_norm": 7.426449775695801, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8574951887130737, "num_tokens": 542394054.0, "step": 14221 }, { "epoch": 1.8091845821142347, "ewc_loss": 0.06245269253849983, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002973784867208451, "grad_norm": 7.453307151794434, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8698914051055908, "num_tokens": 542431063.0, "step": 14222 }, { "epoch": 1.8093117923928252, "ewc_loss": 0.06241406500339508, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002969921915791929, "grad_norm": 7.389569282531738, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8527434468269348, "num_tokens": 542471398.0, "step": 14223 }, { "epoch": 1.8094390026714158, "ewc_loss": 0.06255040317773819, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000298355589620769, "grad_norm": 7.459676265716553, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8498094081878662, "num_tokens": 542516288.0, "step": 14224 }, { "epoch": 1.8095662129500063, "ewc_loss": 0.06243310496211052, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002971826179418713, "grad_norm": 7.423678398132324, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8629990816116333, "num_tokens": 542552292.0, "step": 14225 }, { "epoch": 1.8096934232285968, "ewc_loss": 0.06259245425462723, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029877611086703837, "grad_norm": 7.475597381591797, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8673751354217529, "num_tokens": 542592380.0, "step": 14226 }, { "epoch": 1.8098206335071874, "ewc_loss": 0.06243589520454407, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002972105285152793, "grad_norm": 7.397414684295654, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8563488721847534, "num_tokens": 542632698.0, "step": 14227 }, { "epoch": 1.8099478437857779, "ewc_loss": 0.06264888495206833, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000299340405035764, "grad_norm": 7.555588722229004, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8536666035652161, "num_tokens": 542668234.0, "step": 14228 }, { "epoch": 1.8100750540643684, "ewc_loss": 0.062399618327617645, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002968477492686361, "grad_norm": 7.426006317138672, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8625783920288086, "num_tokens": 542706205.0, "step": 14229 }, { "epoch": 1.810202264342959, "ewc_loss": 0.06268040835857391, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002996555995196104, "grad_norm": 7.635220050811768, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8612937927246094, "num_tokens": 542737068.0, "step": 14230 }, { "epoch": 1.8103294746215495, "ewc_loss": 0.062216199934482574, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002950135385617614, "grad_norm": 7.340733528137207, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8762643337249756, "num_tokens": 542769776.0, "step": 14231 }, { "epoch": 1.81045668490014, "ewc_loss": 0.06279529631137848, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030080456053838134, "grad_norm": 7.581421375274658, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8771257996559143, "num_tokens": 542802440.0, "step": 14232 }, { "epoch": 1.8105838951787305, "ewc_loss": 0.062202148139476776, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002948730543721467, "grad_norm": 7.35477876663208, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8753863573074341, "num_tokens": 542841158.0, "step": 14233 }, { "epoch": 1.810711105457321, "ewc_loss": 0.06277531385421753, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030060470453463495, "grad_norm": 7.58484411239624, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.859947681427002, "num_tokens": 542876637.0, "step": 14234 }, { "epoch": 1.8108383157359116, "ewc_loss": 0.0622476190328598, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029532777261920273, "grad_norm": 7.3453803062438965, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8705794811248779, "num_tokens": 542915087.0, "step": 14235 }, { "epoch": 1.8109655260145021, "ewc_loss": 0.06263889372348785, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000299240491585806, "grad_norm": 7.481247901916504, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.856236457824707, "num_tokens": 542950635.0, "step": 14236 }, { "epoch": 1.8110927362930926, "ewc_loss": 0.062416911125183105, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029702068422921, "grad_norm": 7.423527717590332, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8599774837493896, "num_tokens": 542990078.0, "step": 14237 }, { "epoch": 1.811219946571683, "ewc_loss": 0.06268827617168427, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029973432538099587, "grad_norm": 7.521756172180176, "learning_rate": 1e-06, "loss": 0.5375, "mean_token_accuracy": 0.8382095098495483, "num_tokens": 543030333.0, "step": 14238 }, { "epoch": 1.8113471568502735, "ewc_loss": 0.062415506690740585, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002970066270790994, "grad_norm": 7.37193489074707, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8582316637039185, "num_tokens": 543070537.0, "step": 14239 }, { "epoch": 1.811474367128864, "ewc_loss": 0.06262481957674026, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002990997745655477, "grad_norm": 7.440306186676025, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8623120784759521, "num_tokens": 543109137.0, "step": 14240 }, { "epoch": 1.8116015774074545, "ewc_loss": 0.06250660121440887, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002979175769723952, "grad_norm": 7.412095546722412, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8565588593482971, "num_tokens": 543146616.0, "step": 14241 }, { "epoch": 1.811728787686045, "ewc_loss": 0.062457554042339325, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029742709011770785, "grad_norm": 7.431936264038086, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8783875703811646, "num_tokens": 543178433.0, "step": 14242 }, { "epoch": 1.8118559979646356, "ewc_loss": 0.06255361437797546, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002983877493534237, "grad_norm": 7.453579902648926, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8768135905265808, "num_tokens": 543215086.0, "step": 14243 }, { "epoch": 1.811983208243226, "ewc_loss": 0.062444016337394714, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002972917282022536, "grad_norm": 7.449954986572266, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8476369976997375, "num_tokens": 543252111.0, "step": 14244 }, { "epoch": 1.8121104185218164, "ewc_loss": 0.06248689815402031, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002977205440402031, "grad_norm": 7.3880839347839355, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8714168071746826, "num_tokens": 543291938.0, "step": 14245 }, { "epoch": 1.812237628800407, "ewc_loss": 0.0625562071800232, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029841362265869975, "grad_norm": 7.404236793518066, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8679838180541992, "num_tokens": 543335141.0, "step": 14246 }, { "epoch": 1.8123648390789975, "ewc_loss": 0.06256689131259918, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029852052102796733, "grad_norm": 7.428207874298096, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8639964461326599, "num_tokens": 543371305.0, "step": 14247 }, { "epoch": 1.812492049357588, "ewc_loss": 0.06259319186210632, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002987835032399744, "grad_norm": 7.464978218078613, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8758166432380676, "num_tokens": 543407431.0, "step": 14248 }, { "epoch": 1.8126192596361785, "ewc_loss": 0.06247902661561966, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002976418472826481, "grad_norm": 7.351864337921143, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8630521297454834, "num_tokens": 543450287.0, "step": 14249 }, { "epoch": 1.812746469914769, "ewc_loss": 0.06271979957818985, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030004954896867275, "grad_norm": 7.466145038604736, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.877876877784729, "num_tokens": 543486521.0, "step": 14250 }, { "epoch": 1.8128736801933596, "ewc_loss": 0.06240508705377579, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002969024353660643, "grad_norm": 7.387718200683594, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8649088144302368, "num_tokens": 543522157.0, "step": 14251 }, { "epoch": 1.8130008904719501, "ewc_loss": 0.0627094954252243, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029994649230502546, "grad_norm": 7.4600443840026855, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8672384023666382, "num_tokens": 543560503.0, "step": 14252 }, { "epoch": 1.8131281007505406, "ewc_loss": 0.06243187189102173, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002971703070215881, "grad_norm": 7.395692825317383, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.853081226348877, "num_tokens": 543598943.0, "step": 14253 }, { "epoch": 1.8132553110291312, "ewc_loss": 0.06258565932512283, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002987081534229219, "grad_norm": 7.48651123046875, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8656517267227173, "num_tokens": 543640466.0, "step": 14254 }, { "epoch": 1.8133825213077217, "ewc_loss": 0.06251199543476105, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002979715063702315, "grad_norm": 7.39471435546875, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8653467893600464, "num_tokens": 543677581.0, "step": 14255 }, { "epoch": 1.8135097315863122, "ewc_loss": 0.06253823637962341, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002982339356094599, "grad_norm": 7.456187725067139, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8565952181816101, "num_tokens": 543711220.0, "step": 14256 }, { "epoch": 1.8136369418649028, "ewc_loss": 0.06250792741775513, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029793079011142254, "grad_norm": 7.453972339630127, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8592987060546875, "num_tokens": 543750109.0, "step": 14257 }, { "epoch": 1.8137641521434933, "ewc_loss": 0.062492601573467255, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002977775875478983, "grad_norm": 7.388369560241699, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8684647083282471, "num_tokens": 543788040.0, "step": 14258 }, { "epoch": 1.8138913624220838, "ewc_loss": 0.06252314150333405, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029808294493705034, "grad_norm": 7.507628440856934, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8644840717315674, "num_tokens": 543821037.0, "step": 14259 }, { "epoch": 1.8140185727006743, "ewc_loss": 0.06241285055875778, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002969800552818924, "grad_norm": 7.445686340332031, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8762481212615967, "num_tokens": 543856331.0, "step": 14260 }, { "epoch": 1.8141457829792649, "ewc_loss": 0.06266321241855621, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029948371229693294, "grad_norm": 7.452548503875732, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8690243363380432, "num_tokens": 543892898.0, "step": 14261 }, { "epoch": 1.8142729932578552, "ewc_loss": 0.06245770305395126, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029742857441306114, "grad_norm": 7.442173480987549, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8599374890327454, "num_tokens": 543932858.0, "step": 14262 }, { "epoch": 1.8144002035364457, "ewc_loss": 0.06256968528032303, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002985484024975449, "grad_norm": 7.4556884765625, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8720799684524536, "num_tokens": 543968145.0, "step": 14263 }, { "epoch": 1.8145274138150362, "ewc_loss": 0.062468525022268295, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029753681155852973, "grad_norm": 7.43910026550293, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8630246520042419, "num_tokens": 544000495.0, "step": 14264 }, { "epoch": 1.8146546240936268, "ewc_loss": 0.06251437962055206, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029799537151120603, "grad_norm": 7.452259540557861, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8583917617797852, "num_tokens": 544041345.0, "step": 14265 }, { "epoch": 1.8147818343722173, "ewc_loss": 0.06242711842060089, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029712272225879133, "grad_norm": 7.456857681274414, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8699681758880615, "num_tokens": 544080084.0, "step": 14266 }, { "epoch": 1.8149090446508078, "ewc_loss": 0.06233061105012894, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029615769744850695, "grad_norm": 7.458499431610107, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.856356143951416, "num_tokens": 544115828.0, "step": 14267 }, { "epoch": 1.8150362549293981, "ewc_loss": 0.06237928941845894, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002966444590128958, "grad_norm": 7.410355091094971, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8613600730895996, "num_tokens": 544153213.0, "step": 14268 }, { "epoch": 1.8151634652079887, "ewc_loss": 0.06238912045955658, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029674277175217867, "grad_norm": 7.384974956512451, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8675460815429688, "num_tokens": 544192383.0, "step": 14269 }, { "epoch": 1.8152906754865792, "ewc_loss": 0.0623699352145195, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002965509193018079, "grad_norm": 7.4176740646362305, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8662601709365845, "num_tokens": 544232034.0, "step": 14270 }, { "epoch": 1.8154178857651697, "ewc_loss": 0.06242994964122772, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029715104028582573, "grad_norm": 7.49064826965332, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8474106788635254, "num_tokens": 544266242.0, "step": 14271 }, { "epoch": 1.8155450960437602, "ewc_loss": 0.06233265995979309, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029617815744131804, "grad_norm": 7.39652681350708, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8542455434799194, "num_tokens": 544304995.0, "step": 14272 }, { "epoch": 1.8156723063223508, "ewc_loss": 0.0625152438879013, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002980040153488517, "grad_norm": 7.450697898864746, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8637425899505615, "num_tokens": 544341950.0, "step": 14273 }, { "epoch": 1.8157995166009413, "ewc_loss": 0.062332525849342346, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000296176818665117, "grad_norm": 7.410126686096191, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8595700263977051, "num_tokens": 544379535.0, "step": 14274 }, { "epoch": 1.8159267268795318, "ewc_loss": 0.06256160140037537, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029846758116036654, "grad_norm": 7.445457458496094, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8628267049789429, "num_tokens": 544416939.0, "step": 14275 }, { "epoch": 1.8160539371581224, "ewc_loss": 0.0623629130423069, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002964806917589158, "grad_norm": 7.407179355621338, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8760870695114136, "num_tokens": 544461476.0, "step": 14276 }, { "epoch": 1.8161811474367129, "ewc_loss": 0.06244494765996933, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002973010123241693, "grad_norm": 7.395900249481201, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8653093576431274, "num_tokens": 544505051.0, "step": 14277 }, { "epoch": 1.8163083577153034, "ewc_loss": 0.0624711699783802, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002975632669404149, "grad_norm": 7.345104694366455, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8641732931137085, "num_tokens": 544550408.0, "step": 14278 }, { "epoch": 1.816435567993894, "ewc_loss": 0.06258795410394669, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029873111634515226, "grad_norm": 7.481870651245117, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8634769320487976, "num_tokens": 544586645.0, "step": 14279 }, { "epoch": 1.8165627782724845, "ewc_loss": 0.06241857260465622, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029703727341257036, "grad_norm": 7.379990100860596, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8568146228790283, "num_tokens": 544628087.0, "step": 14280 }, { "epoch": 1.816689988551075, "ewc_loss": 0.06266600638628006, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029951162287034094, "grad_norm": 7.486445426940918, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.878487765789032, "num_tokens": 544667961.0, "step": 14281 }, { "epoch": 1.8168171988296655, "ewc_loss": 0.062424980103969574, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002971013600472361, "grad_norm": 7.444838047027588, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8610948324203491, "num_tokens": 544704919.0, "step": 14282 }, { "epoch": 1.816944409108256, "ewc_loss": 0.06264661252498627, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002993176458403468, "grad_norm": 7.498532295227051, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8686797618865967, "num_tokens": 544741421.0, "step": 14283 }, { "epoch": 1.8170716193868466, "ewc_loss": 0.062496110796928406, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002978126867674291, "grad_norm": 7.4132795333862305, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8685626983642578, "num_tokens": 544776414.0, "step": 14284 }, { "epoch": 1.817198829665437, "ewc_loss": 0.06266078352928162, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029945941059850156, "grad_norm": 7.460134029388428, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8571673035621643, "num_tokens": 544814964.0, "step": 14285 }, { "epoch": 1.8173260399440276, "ewc_loss": 0.06250172108411789, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029786876984871924, "grad_norm": 7.4161763191223145, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8505725264549255, "num_tokens": 544853381.0, "step": 14286 }, { "epoch": 1.817453250222618, "ewc_loss": 0.06259883940219879, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002988399937748909, "grad_norm": 7.472979545593262, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8667657375335693, "num_tokens": 544892474.0, "step": 14287 }, { "epoch": 1.8175804605012085, "ewc_loss": 0.0624818280339241, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000297669816063717, "grad_norm": 7.432554721832275, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8710194826126099, "num_tokens": 544927667.0, "step": 14288 }, { "epoch": 1.817707670779799, "ewc_loss": 0.06266507506370544, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029950233874842525, "grad_norm": 7.478724956512451, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8700685501098633, "num_tokens": 544959780.0, "step": 14289 }, { "epoch": 1.8178348810583895, "ewc_loss": 0.06245213747024536, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002973729569930583, "grad_norm": 7.402690887451172, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.856139600276947, "num_tokens": 545001832.0, "step": 14290 }, { "epoch": 1.81796209133698, "ewc_loss": 0.06271180510520935, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002999696007464081, "grad_norm": 7.529196262359619, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8726252913475037, "num_tokens": 545038489.0, "step": 14291 }, { "epoch": 1.8180893016155706, "ewc_loss": 0.06234985962510109, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029635016107931733, "grad_norm": 7.445141792297363, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8440287113189697, "num_tokens": 545069086.0, "step": 14292 }, { "epoch": 1.818216511894161, "ewc_loss": 0.06260429322719574, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002988944761455059, "grad_norm": 7.461601734161377, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8672176003456116, "num_tokens": 545104564.0, "step": 14293 }, { "epoch": 1.8183437221727514, "ewc_loss": 0.06250196695327759, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029787118546664715, "grad_norm": 7.365314483642578, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.869148850440979, "num_tokens": 545149036.0, "step": 14294 }, { "epoch": 1.818470932451342, "ewc_loss": 0.0626264438033104, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029911601450294256, "grad_norm": 7.435083866119385, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8747508525848389, "num_tokens": 545183825.0, "step": 14295 }, { "epoch": 1.8185981427299325, "ewc_loss": 0.06256469339132309, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002984984894283116, "grad_norm": 7.475615978240967, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8679248094558716, "num_tokens": 545221274.0, "step": 14296 }, { "epoch": 1.818725353008523, "ewc_loss": 0.06255701929330826, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029842177173122764, "grad_norm": 7.43904972076416, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8708091974258423, "num_tokens": 545259242.0, "step": 14297 }, { "epoch": 1.8188525632871135, "ewc_loss": 0.06258226931095123, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029867427656427026, "grad_norm": 7.5645623207092285, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.849407434463501, "num_tokens": 545289380.0, "step": 14298 }, { "epoch": 1.818979773565704, "ewc_loss": 0.062419675290584564, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029704830376431346, "grad_norm": 7.373814582824707, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8677465319633484, "num_tokens": 545321935.0, "step": 14299 }, { "epoch": 1.8191069838442946, "ewc_loss": 0.06272333115339279, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030008485191501677, "grad_norm": 7.470639228820801, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.87103670835495, "num_tokens": 545365714.0, "step": 14300 }, { "epoch": 1.8192341941228851, "ewc_loss": 0.06241919845342636, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029704353073611856, "grad_norm": 7.417489528656006, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8540947437286377, "num_tokens": 545406838.0, "step": 14301 }, { "epoch": 1.8193614044014756, "ewc_loss": 0.0626266673207283, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002991182263940573, "grad_norm": 7.4776458740234375, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8603084087371826, "num_tokens": 545443265.0, "step": 14302 }, { "epoch": 1.8194886146800662, "ewc_loss": 0.06244610995054245, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029731268296018243, "grad_norm": 7.396053791046143, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8741218447685242, "num_tokens": 545479903.0, "step": 14303 }, { "epoch": 1.8196158249586567, "ewc_loss": 0.06296057999134064, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003000159631483257, "grad_norm": 7.531393051147461, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8710111379623413, "num_tokens": 545513767.0, "step": 14304 }, { "epoch": 1.8197430352372472, "ewc_loss": 0.062470853328704834, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002975600946228951, "grad_norm": 7.464219093322754, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8491804599761963, "num_tokens": 545550541.0, "step": 14305 }, { "epoch": 1.8198702455158378, "ewc_loss": 0.06266386061906815, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029949017334729433, "grad_norm": 7.4967122077941895, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8656695485115051, "num_tokens": 545590739.0, "step": 14306 }, { "epoch": 1.8199974557944283, "ewc_loss": 0.06246907263994217, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002975422830786556, "grad_norm": 7.428165435791016, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8498369455337524, "num_tokens": 545627127.0, "step": 14307 }, { "epoch": 1.8201246660730188, "ewc_loss": 0.06258480250835419, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029869956779293716, "grad_norm": 7.448864936828613, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8685664534568787, "num_tokens": 545665473.0, "step": 14308 }, { "epoch": 1.8202518763516093, "ewc_loss": 0.06250564754009247, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029790806001983583, "grad_norm": 7.468349933624268, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8798645734786987, "num_tokens": 545708018.0, "step": 14309 }, { "epoch": 1.8203790866301999, "ewc_loss": 0.06248445808887482, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029769615503028035, "grad_norm": 7.440554618835449, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8688918352127075, "num_tokens": 545745635.0, "step": 14310 }, { "epoch": 1.8205062969087902, "ewc_loss": 0.062392935156822205, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029678092687390745, "grad_norm": 7.394210338592529, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8535239696502686, "num_tokens": 545785260.0, "step": 14311 }, { "epoch": 1.8206335071873807, "ewc_loss": 0.06259114295244217, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029876301414333284, "grad_norm": 7.459615707397461, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8615612387657166, "num_tokens": 545821696.0, "step": 14312 }, { "epoch": 1.8207607174659712, "ewc_loss": 0.06240727752447128, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002969243214465678, "grad_norm": 7.440545082092285, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.877590537071228, "num_tokens": 545857971.0, "step": 14313 }, { "epoch": 1.8208879277445618, "ewc_loss": 0.062474191188812256, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000297593476716429, "grad_norm": 7.479113578796387, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8536134362220764, "num_tokens": 545891000.0, "step": 14314 }, { "epoch": 1.8210151380231523, "ewc_loss": 0.0625365823507309, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029821737552993, "grad_norm": 7.402845859527588, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.849480390548706, "num_tokens": 545932825.0, "step": 14315 }, { "epoch": 1.8211423483017428, "ewc_loss": 0.06243211403489113, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029717269353568554, "grad_norm": 7.409750461578369, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8543354868888855, "num_tokens": 545972849.0, "step": 14316 }, { "epoch": 1.8212695585803331, "ewc_loss": 0.06257147341966629, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002985663013532758, "grad_norm": 7.431567192077637, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8769580125808716, "num_tokens": 546011492.0, "step": 14317 }, { "epoch": 1.8213967688589237, "ewc_loss": 0.06252752244472504, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002981268335133791, "grad_norm": 7.48342227935791, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.860016942024231, "num_tokens": 546048461.0, "step": 14318 }, { "epoch": 1.8215239791375142, "ewc_loss": 0.06247682124376297, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002976197865791619, "grad_norm": 7.3772783279418945, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8739302158355713, "num_tokens": 546087113.0, "step": 14319 }, { "epoch": 1.8216511894161047, "ewc_loss": 0.06256195157766342, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002984711027238518, "grad_norm": 7.492680072784424, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8674240112304688, "num_tokens": 546124413.0, "step": 14320 }, { "epoch": 1.8217783996946952, "ewc_loss": 0.06242372468113899, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029708881629630923, "grad_norm": 7.365816116333008, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8746790289878845, "num_tokens": 546165763.0, "step": 14321 }, { "epoch": 1.8219056099732858, "ewc_loss": 0.06262017786502838, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029905338305979967, "grad_norm": 7.437790393829346, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8657088279724121, "num_tokens": 546203218.0, "step": 14322 }, { "epoch": 1.8220328202518763, "ewc_loss": 0.062467098236083984, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002975225215777755, "grad_norm": 7.42065954208374, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8546794652938843, "num_tokens": 546247610.0, "step": 14323 }, { "epoch": 1.8221600305304668, "ewc_loss": 0.06256423145532608, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029849386191926897, "grad_norm": 7.462775230407715, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8571853041648865, "num_tokens": 546288952.0, "step": 14324 }, { "epoch": 1.8222872408090574, "ewc_loss": 0.06241548806428909, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002970064524561167, "grad_norm": 7.384671688079834, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8755202889442444, "num_tokens": 546322449.0, "step": 14325 }, { "epoch": 1.8224144510876479, "ewc_loss": 0.06260466575622559, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002988982596434653, "grad_norm": 7.508993625640869, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8694586753845215, "num_tokens": 546362692.0, "step": 14326 }, { "epoch": 1.8225416613662384, "ewc_loss": 0.06214883178472519, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002967813052237034, "grad_norm": 7.413102626800537, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8486158847808838, "num_tokens": 546398856.0, "step": 14327 }, { "epoch": 1.822668871644829, "ewc_loss": 0.06234433501958847, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000298736325930804, "grad_norm": 7.406017303466797, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8576281666755676, "num_tokens": 546439633.0, "step": 14328 }, { "epoch": 1.8227960819234195, "ewc_loss": 0.06261394172906876, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029899095534347, "grad_norm": 7.4372968673706055, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8690154552459717, "num_tokens": 546477282.0, "step": 14329 }, { "epoch": 1.82292329220201, "ewc_loss": 0.06250284612178802, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000297880033031106, "grad_norm": 7.411206245422363, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8754011988639832, "num_tokens": 546514584.0, "step": 14330 }, { "epoch": 1.8230505024806005, "ewc_loss": 0.06263507902622223, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002992023073602468, "grad_norm": 7.483001232147217, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8662834167480469, "num_tokens": 546550761.0, "step": 14331 }, { "epoch": 1.823177712759191, "ewc_loss": 0.06248902529478073, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002977417898364365, "grad_norm": 7.382747650146484, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8496314287185669, "num_tokens": 546594157.0, "step": 14332 }, { "epoch": 1.8233049230377816, "ewc_loss": 0.0627424567937851, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030027609318494797, "grad_norm": 7.557706356048584, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.874080240726471, "num_tokens": 546634671.0, "step": 14333 }, { "epoch": 1.823432133316372, "ewc_loss": 0.06231239438056946, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002959755074698478, "grad_norm": 7.356263637542725, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8645344972610474, "num_tokens": 546672084.0, "step": 14334 }, { "epoch": 1.8235593435949626, "ewc_loss": 0.06280113756656647, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030086294282227755, "grad_norm": 7.605717658996582, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8574777841567993, "num_tokens": 546701920.0, "step": 14335 }, { "epoch": 1.823686553873553, "ewc_loss": 0.06225210428237915, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029537262162193656, "grad_norm": 7.362891674041748, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8676712512969971, "num_tokens": 546735776.0, "step": 14336 }, { "epoch": 1.8238137641521435, "ewc_loss": 0.06273654103279114, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003002170124091208, "grad_norm": 7.493474006652832, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8699065446853638, "num_tokens": 546772991.0, "step": 14337 }, { "epoch": 1.823940974430734, "ewc_loss": 0.06238018721342087, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002966534229926765, "grad_norm": 7.515997409820557, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8585699200630188, "num_tokens": 546805310.0, "step": 14338 }, { "epoch": 1.8240681847093245, "ewc_loss": 0.062480535358190536, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002976569230668247, "grad_norm": 7.439966678619385, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8702539205551147, "num_tokens": 546838240.0, "step": 14339 }, { "epoch": 1.824195394987915, "ewc_loss": 0.06256380677223206, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002984896127600223, "grad_norm": 7.451303005218506, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8596847057342529, "num_tokens": 546878484.0, "step": 14340 }, { "epoch": 1.8243226052665056, "ewc_loss": 0.06238045543432236, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002966561005450785, "grad_norm": 7.445550441741943, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8800643682479858, "num_tokens": 546919747.0, "step": 14341 }, { "epoch": 1.8244498155450959, "ewc_loss": 0.06244690343737602, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029732059920206666, "grad_norm": 7.418824195861816, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8645855188369751, "num_tokens": 546962614.0, "step": 14342 }, { "epoch": 1.8245770258236864, "ewc_loss": 0.06243336945772171, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029718526639044285, "grad_norm": 7.460885047912598, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8577964901924133, "num_tokens": 547000059.0, "step": 14343 }, { "epoch": 1.824704236102277, "ewc_loss": 0.06232734024524689, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002961249556392431, "grad_norm": 7.421650409698486, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.866952121257782, "num_tokens": 547037372.0, "step": 14344 }, { "epoch": 1.8248314463808675, "ewc_loss": 0.06247524917125702, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029760407051071525, "grad_norm": 7.562842845916748, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8511956930160522, "num_tokens": 547071840.0, "step": 14345 }, { "epoch": 1.824958656659458, "ewc_loss": 0.06233585625886917, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029621011344715953, "grad_norm": 7.517794609069824, "learning_rate": 1e-06, "loss": 0.5422, "mean_token_accuracy": 0.8392202854156494, "num_tokens": 547108916.0, "step": 14346 }, { "epoch": 1.8250858669380485, "ewc_loss": 0.06227635219693184, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002956150856334716, "grad_norm": 7.439363479614258, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8830394744873047, "num_tokens": 547146991.0, "step": 14347 }, { "epoch": 1.825213077216639, "ewc_loss": 0.06238113343715668, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029666288173757493, "grad_norm": 7.46124792098999, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8747694492340088, "num_tokens": 547184093.0, "step": 14348 }, { "epoch": 1.8253402874952296, "ewc_loss": 0.06218466907739639, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002946982567664236, "grad_norm": 7.471223831176758, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8709919452667236, "num_tokens": 547217570.0, "step": 14349 }, { "epoch": 1.8254674977738201, "ewc_loss": 0.06224001944065094, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002952517825178802, "grad_norm": 7.404239654541016, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8625402450561523, "num_tokens": 547259498.0, "step": 14350 }, { "epoch": 1.8255947080524106, "ewc_loss": 0.06234515458345413, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002963031001854688, "grad_norm": 7.437201023101807, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8638210296630859, "num_tokens": 547298419.0, "step": 14351 }, { "epoch": 1.8257219183310012, "ewc_loss": 0.06257116794586182, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0002961218124255538, "grad_norm": 7.418637752532959, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8721530437469482, "num_tokens": 547342984.0, "step": 14352 }, { "epoch": 1.8258491286095917, "ewc_loss": 0.06226751208305359, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002955266973003745, "grad_norm": 7.418229579925537, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8648805618286133, "num_tokens": 547383162.0, "step": 14353 }, { "epoch": 1.8259763388881822, "ewc_loss": 0.06235935539007187, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002964450977742672, "grad_norm": 7.495779991149902, "learning_rate": 1e-06, "loss": 0.39, "mean_token_accuracy": 0.8854466676712036, "num_tokens": 547416967.0, "step": 14354 }, { "epoch": 1.8261035491667728, "ewc_loss": 0.062161438167095184, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029446594999171793, "grad_norm": 7.383800029754639, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8679267764091492, "num_tokens": 547454695.0, "step": 14355 }, { "epoch": 1.8262307594453633, "ewc_loss": 0.06238534674048424, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002967050240840763, "grad_norm": 7.455991744995117, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8737268447875977, "num_tokens": 547494258.0, "step": 14356 }, { "epoch": 1.8263579697239538, "ewc_loss": 0.06230583041906357, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029590987833216786, "grad_norm": 7.415239334106445, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.85933518409729, "num_tokens": 547531404.0, "step": 14357 }, { "epoch": 1.8264851800025443, "ewc_loss": 0.0623573400080204, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029642495792359114, "grad_norm": 7.445018291473389, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8691496849060059, "num_tokens": 547566553.0, "step": 14358 }, { "epoch": 1.8266123902811349, "ewc_loss": 0.06226225942373276, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029547413578256965, "grad_norm": 7.393538951873779, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8569332361221313, "num_tokens": 547605505.0, "step": 14359 }, { "epoch": 1.8267396005597252, "ewc_loss": 0.062449175864458084, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029734332929365337, "grad_norm": 7.442703723907471, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8722670078277588, "num_tokens": 547643146.0, "step": 14360 }, { "epoch": 1.8268668108383157, "ewc_loss": 0.06235384941101074, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002963900624308735, "grad_norm": 7.408304691314697, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.861802339553833, "num_tokens": 547683373.0, "step": 14361 }, { "epoch": 1.8269940211169062, "ewc_loss": 0.0625152736902237, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029800430638715625, "grad_norm": 7.4435038566589355, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8590778112411499, "num_tokens": 547716762.0, "step": 14362 }, { "epoch": 1.8271212313954968, "ewc_loss": 0.06232859194278717, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029613749939017, "grad_norm": 7.379201412200928, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.871888279914856, "num_tokens": 547756907.0, "step": 14363 }, { "epoch": 1.8272484416740873, "ewc_loss": 0.0625549703836441, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002984012826345861, "grad_norm": 7.524285316467285, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8604001402854919, "num_tokens": 547793545.0, "step": 14364 }, { "epoch": 1.8273756519526778, "ewc_loss": 0.06231720745563507, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002960236161015928, "grad_norm": 7.422922134399414, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8553165197372437, "num_tokens": 547836845.0, "step": 14365 }, { "epoch": 1.8275028622312681, "ewc_loss": 0.062472350895404816, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002975750539917499, "grad_norm": 7.438374042510986, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8712902665138245, "num_tokens": 547875849.0, "step": 14366 }, { "epoch": 1.8276300725098586, "ewc_loss": 0.062360476702451706, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002964563318528235, "grad_norm": 7.386516571044922, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8586044311523438, "num_tokens": 547918672.0, "step": 14367 }, { "epoch": 1.8277572827884492, "ewc_loss": 0.062479011714458466, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002976417017634958, "grad_norm": 7.520175933837891, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8517462015151978, "num_tokens": 547958681.0, "step": 14368 }, { "epoch": 1.8278844930670397, "ewc_loss": 0.062311943620443344, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000295970996376127, "grad_norm": 7.394573211669922, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8590362071990967, "num_tokens": 547996940.0, "step": 14369 }, { "epoch": 1.8280117033456302, "ewc_loss": 0.06259721517562866, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002988237247336656, "grad_norm": 7.538773059844971, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8740434646606445, "num_tokens": 548030188.0, "step": 14370 }, { "epoch": 1.8281389136242208, "ewc_loss": 0.0621861070394516, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002947126340586692, "grad_norm": 7.340132713317871, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8799229860305786, "num_tokens": 548066891.0, "step": 14371 }, { "epoch": 1.8282661239028113, "ewc_loss": 0.06273055076599121, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030015702941454947, "grad_norm": 7.550983428955078, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8697245121002197, "num_tokens": 548104664.0, "step": 14372 }, { "epoch": 1.8283933341814018, "ewc_loss": 0.06223667785525322, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002952183422166854, "grad_norm": 7.391308784484863, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8603257536888123, "num_tokens": 548143938.0, "step": 14373 }, { "epoch": 1.8285205444599923, "ewc_loss": 0.06274130195379257, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030026459717191756, "grad_norm": 7.497294902801514, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8633424639701843, "num_tokens": 548183213.0, "step": 14374 }, { "epoch": 1.8286477547385829, "ewc_loss": 0.06234801560640335, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029633170925080776, "grad_norm": 7.423987865447998, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8486915826797485, "num_tokens": 548221304.0, "step": 14375 }, { "epoch": 1.8287749650171734, "ewc_loss": 0.06261958181858063, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029904741677455604, "grad_norm": 7.488706588745117, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8617337942123413, "num_tokens": 548261123.0, "step": 14376 }, { "epoch": 1.828902175295764, "ewc_loss": 0.062439993023872375, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002972515067085624, "grad_norm": 7.463908672332764, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.843725323677063, "num_tokens": 548305040.0, "step": 14377 }, { "epoch": 1.8290293855743545, "ewc_loss": 0.06257496774196625, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002986012550536543, "grad_norm": 7.524580478668213, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8723040819168091, "num_tokens": 548338538.0, "step": 14378 }, { "epoch": 1.829156595852945, "ewc_loss": 0.06253768503665924, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029822837677784264, "grad_norm": 7.500858783721924, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.868482768535614, "num_tokens": 548378510.0, "step": 14379 }, { "epoch": 1.8292838061315355, "ewc_loss": 0.06251522153615952, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000298003782518208, "grad_norm": 7.508756637573242, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8576997518539429, "num_tokens": 548415790.0, "step": 14380 }, { "epoch": 1.829411016410126, "ewc_loss": 0.062410540878772736, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029695694684050977, "grad_norm": 7.483335018157959, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8555334806442261, "num_tokens": 548447294.0, "step": 14381 }, { "epoch": 1.8295382266887166, "ewc_loss": 0.06250764429569244, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029792802524752915, "grad_norm": 7.532846450805664, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.872174859046936, "num_tokens": 548484617.0, "step": 14382 }, { "epoch": 1.829665436967307, "ewc_loss": 0.06238003820180893, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002966519386973232, "grad_norm": 7.432737350463867, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8591137528419495, "num_tokens": 548527552.0, "step": 14383 }, { "epoch": 1.8297926472458976, "ewc_loss": 0.06255769729614258, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029842849471606314, "grad_norm": 7.525809288024902, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.8819330930709839, "num_tokens": 548561322.0, "step": 14384 }, { "epoch": 1.829919857524488, "ewc_loss": 0.062355056405067444, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029640214052051306, "grad_norm": 7.456734657287598, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.874538004398346, "num_tokens": 548594810.0, "step": 14385 }, { "epoch": 1.8300470678030785, "ewc_loss": 0.062461040914058685, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029746198561042547, "grad_norm": 7.509693622589111, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.857603132724762, "num_tokens": 548631094.0, "step": 14386 }, { "epoch": 1.830174278081669, "ewc_loss": 0.06226367503404617, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002954883093480021, "grad_norm": 7.4181013107299805, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8648189306259155, "num_tokens": 548669162.0, "step": 14387 }, { "epoch": 1.8303014883602595, "ewc_loss": 0.06255683302879333, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029841993818990886, "grad_norm": 7.514101028442383, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8652290105819702, "num_tokens": 548704555.0, "step": 14388 }, { "epoch": 1.83042869863885, "ewc_loss": 0.06238080561161041, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002966595930047333, "grad_norm": 7.384263515472412, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8785505890846252, "num_tokens": 548741852.0, "step": 14389 }, { "epoch": 1.8305559089174406, "ewc_loss": 0.06270483881235123, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029989995528012514, "grad_norm": 7.482632637023926, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8522738218307495, "num_tokens": 548788455.0, "step": 14390 }, { "epoch": 1.8306831191960309, "ewc_loss": 0.06247634440660477, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000297615013550967, "grad_norm": 7.416139602661133, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8722864389419556, "num_tokens": 548826347.0, "step": 14391 }, { "epoch": 1.8308103294746214, "ewc_loss": 0.06267710775136948, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002996226539835334, "grad_norm": 7.545657157897949, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8581142425537109, "num_tokens": 548865428.0, "step": 14392 }, { "epoch": 1.830937539753212, "ewc_loss": 0.06245121359825134, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002973636728711426, "grad_norm": 7.469240665435791, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8510051965713501, "num_tokens": 548901671.0, "step": 14393 }, { "epoch": 1.8310647500318025, "ewc_loss": 0.06269866228103638, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002998381678480655, "grad_norm": 7.498592376708984, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.861565113067627, "num_tokens": 548939234.0, "step": 14394 }, { "epoch": 1.831191960310393, "ewc_loss": 0.06246793270111084, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000297530903480947, "grad_norm": 7.4530863761901855, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8683042526245117, "num_tokens": 548975334.0, "step": 14395 }, { "epoch": 1.8313191705889835, "ewc_loss": 0.06257203966379166, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002985719474963844, "grad_norm": 7.422354221343994, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8594924807548523, "num_tokens": 549016372.0, "step": 14396 }, { "epoch": 1.831446380867574, "ewc_loss": 0.06261765956878662, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002990281209349632, "grad_norm": 7.437541484832764, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.870833158493042, "num_tokens": 549058049.0, "step": 14397 }, { "epoch": 1.8315735911461646, "ewc_loss": 0.06263737380504608, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002992252993863076, "grad_norm": 7.463643550872803, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8526253700256348, "num_tokens": 549097855.0, "step": 14398 }, { "epoch": 1.831700801424755, "ewc_loss": 0.06267353892326355, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029958694358356297, "grad_norm": 7.509272575378418, "learning_rate": 1e-06, "loss": 0.3914, "mean_token_accuracy": 0.8842899799346924, "num_tokens": 549130906.0, "step": 14399 }, { "epoch": 1.8318280117033456, "ewc_loss": 0.06255321204662323, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029838367481715977, "grad_norm": 7.496153831481934, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8454070091247559, "num_tokens": 549176574.0, "step": 14400 }, { "epoch": 1.8319552219819362, "ewc_loss": 0.06266240030527115, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029947556322440505, "grad_norm": 7.525909423828125, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8623281717300415, "num_tokens": 549213945.0, "step": 14401 }, { "epoch": 1.8320824322605267, "ewc_loss": 0.06256134808063507, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029846507823094726, "grad_norm": 7.4782891273498535, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8566009998321533, "num_tokens": 549255843.0, "step": 14402 }, { "epoch": 1.8322096425391172, "ewc_loss": 0.06226594001054764, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029795238515362144, "grad_norm": 7.53526496887207, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8681752681732178, "num_tokens": 549292573.0, "step": 14403 }, { "epoch": 1.8323368528177078, "ewc_loss": 0.06231461465358734, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002984390885103494, "grad_norm": 7.429692268371582, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.877101719379425, "num_tokens": 549335291.0, "step": 14404 }, { "epoch": 1.8324640630962983, "ewc_loss": 0.06255977600812912, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029844933305867016, "grad_norm": 7.489974021911621, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8522297739982605, "num_tokens": 549381337.0, "step": 14405 }, { "epoch": 1.8325912733748888, "ewc_loss": 0.06252771615982056, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002981287252623588, "grad_norm": 7.457542896270752, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.875879168510437, "num_tokens": 549418180.0, "step": 14406 }, { "epoch": 1.8327184836534793, "ewc_loss": 0.06252726912498474, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002981242723762989, "grad_norm": 7.442994594573975, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8729372620582581, "num_tokens": 549461623.0, "step": 14407 }, { "epoch": 1.8328456939320699, "ewc_loss": 0.06263794004917145, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002992309455294162, "grad_norm": 7.493618488311768, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8675126433372498, "num_tokens": 549500791.0, "step": 14408 }, { "epoch": 1.8329729042106602, "ewc_loss": 0.06219368055462837, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002972297661472112, "grad_norm": 7.4517974853515625, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8706550598144531, "num_tokens": 549533343.0, "step": 14409 }, { "epoch": 1.8331001144892507, "ewc_loss": 0.06238231062889099, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002991160727106035, "grad_norm": 7.473844528198242, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8561297655105591, "num_tokens": 549572707.0, "step": 14410 }, { "epoch": 1.8332273247678412, "ewc_loss": 0.06237253174185753, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029901828384026885, "grad_norm": 7.478634834289551, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.871057391166687, "num_tokens": 549609772.0, "step": 14411 }, { "epoch": 1.8333545350464318, "ewc_loss": 0.06262777745723724, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029912934405729175, "grad_norm": 7.489035606384277, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8530522584915161, "num_tokens": 549648626.0, "step": 14412 }, { "epoch": 1.8334817453250223, "ewc_loss": 0.0625501424074173, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029835294117219746, "grad_norm": 7.4273295402526855, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8530803918838501, "num_tokens": 549693211.0, "step": 14413 }, { "epoch": 1.8336089556036128, "ewc_loss": 0.062479518353939056, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030008816975168884, "grad_norm": 7.540899753570557, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8591307401657104, "num_tokens": 549729463.0, "step": 14414 }, { "epoch": 1.8337361658822031, "ewc_loss": 0.06250374764204025, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002978890261147171, "grad_norm": 7.423674583435059, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8706868886947632, "num_tokens": 549769139.0, "step": 14415 }, { "epoch": 1.8338633761607936, "ewc_loss": 0.062494296580553055, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003002359298989177, "grad_norm": 7.423630237579346, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8681893348693848, "num_tokens": 549807067.0, "step": 14416 }, { "epoch": 1.8339905864393842, "ewc_loss": 0.06265658885240555, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029941744287498295, "grad_norm": 7.458107948303223, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8636995553970337, "num_tokens": 549842689.0, "step": 14417 }, { "epoch": 1.8341177967179747, "ewc_loss": 0.06275098025798798, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030036139651201665, "grad_norm": 7.50252103805542, "learning_rate": 1e-06, "loss": 0.391, "mean_token_accuracy": 0.8833322525024414, "num_tokens": 549876108.0, "step": 14418 }, { "epoch": 1.8342450069965652, "ewc_loss": 0.06258665025234222, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002987181069329381, "grad_norm": 7.421523571014404, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8787758350372314, "num_tokens": 549909656.0, "step": 14419 }, { "epoch": 1.8343722172751558, "ewc_loss": 0.06247308477759361, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000300023821182549, "grad_norm": 7.450232028961182, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8567674160003662, "num_tokens": 549949156.0, "step": 14420 }, { "epoch": 1.8344994275537463, "ewc_loss": 0.06248600780963898, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003001530421897769, "grad_norm": 7.412025451660156, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8716386556625366, "num_tokens": 549990256.0, "step": 14421 }, { "epoch": 1.8346266378323368, "ewc_loss": 0.06252545118331909, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030054751550778747, "grad_norm": 7.514023303985596, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8729234933853149, "num_tokens": 550023557.0, "step": 14422 }, { "epoch": 1.8347538481109273, "ewc_loss": 0.06264765560626984, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029932812321931124, "grad_norm": 7.382509231567383, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8615531325340271, "num_tokens": 550069402.0, "step": 14423 }, { "epoch": 1.8348810583895179, "ewc_loss": 0.0628822073340416, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030167363001964986, "grad_norm": 7.467519283294678, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8723016977310181, "num_tokens": 550110860.0, "step": 14424 }, { "epoch": 1.8350082686681084, "ewc_loss": 0.06273961812257767, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030024771695025265, "grad_norm": 7.531219005584717, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8498382568359375, "num_tokens": 550151736.0, "step": 14425 }, { "epoch": 1.835135478946699, "ewc_loss": 0.062431737780570984, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002996103430632502, "grad_norm": 7.451933860778809, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8663682341575623, "num_tokens": 550187731.0, "step": 14426 }, { "epoch": 1.8352626892252895, "ewc_loss": 0.06250384449958801, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003003314486704767, "grad_norm": 7.5107903480529785, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8664330244064331, "num_tokens": 550220545.0, "step": 14427 }, { "epoch": 1.83538989950388, "ewc_loss": 0.062424421310424805, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029709580121561885, "grad_norm": 7.455860137939453, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.86742103099823, "num_tokens": 550254755.0, "step": 14428 }, { "epoch": 1.8355171097824705, "ewc_loss": 0.06282561272382736, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003011077060364187, "grad_norm": 7.467896938323975, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8709108829498291, "num_tokens": 550295191.0, "step": 14429 }, { "epoch": 1.835644320061061, "ewc_loss": 0.062330715358257294, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029860014910809696, "grad_norm": 7.4910502433776855, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.857729971408844, "num_tokens": 550332984.0, "step": 14430 }, { "epoch": 1.8357715303396516, "ewc_loss": 0.06246217340230942, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029991468181833625, "grad_norm": 7.510632038116455, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8689213395118713, "num_tokens": 550377470.0, "step": 14431 }, { "epoch": 1.835898740618242, "ewc_loss": 0.06229298189282417, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029822278884239495, "grad_norm": 7.396475315093994, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8693881034851074, "num_tokens": 550416123.0, "step": 14432 }, { "epoch": 1.8360259508968326, "ewc_loss": 0.062484607100486755, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030013901414349675, "grad_norm": 7.463405609130859, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8699570298194885, "num_tokens": 550458062.0, "step": 14433 }, { "epoch": 1.836153161175423, "ewc_loss": 0.06239087134599686, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002992016961798072, "grad_norm": 7.470979690551758, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8580163717269897, "num_tokens": 550496485.0, "step": 14434 }, { "epoch": 1.8362803714540135, "ewc_loss": 0.0624903179705143, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003001961449626833, "grad_norm": 7.5008134841918945, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8663272857666016, "num_tokens": 550536500.0, "step": 14435 }, { "epoch": 1.836407581732604, "ewc_loss": 0.06244798004627228, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002997727715410292, "grad_norm": 7.500818252563477, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8587349653244019, "num_tokens": 550575611.0, "step": 14436 }, { "epoch": 1.8365347920111945, "ewc_loss": 0.062464967370033264, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002999426214955747, "grad_norm": 7.516921043395996, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8561278581619263, "num_tokens": 550616431.0, "step": 14437 }, { "epoch": 1.836662002289785, "ewc_loss": 0.06270851939916611, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029993674252182245, "grad_norm": 7.496031761169434, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8736771941184998, "num_tokens": 550653626.0, "step": 14438 }, { "epoch": 1.8367892125683756, "ewc_loss": 0.06249835342168808, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003002765297424048, "grad_norm": 7.541844367980957, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8592737317085266, "num_tokens": 550690899.0, "step": 14439 }, { "epoch": 1.8369164228469659, "ewc_loss": 0.062392111867666245, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029921409441158175, "grad_norm": 7.47060489654541, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8646004796028137, "num_tokens": 550734408.0, "step": 14440 }, { "epoch": 1.8370436331255564, "ewc_loss": 0.06244852766394615, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002997782430611551, "grad_norm": 7.5313496589660645, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8533730506896973, "num_tokens": 550767888.0, "step": 14441 }, { "epoch": 1.837170843404147, "ewc_loss": 0.06242547929286957, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002995477698277682, "grad_norm": 7.449040412902832, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.866683840751648, "num_tokens": 550807192.0, "step": 14442 }, { "epoch": 1.8372980536827375, "ewc_loss": 0.06255029886960983, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030079594580456614, "grad_norm": 7.517460823059082, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8726401329040527, "num_tokens": 550848006.0, "step": 14443 }, { "epoch": 1.837425263961328, "ewc_loss": 0.062475789338350296, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003000508586410433, "grad_norm": 7.43031644821167, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8658223748207092, "num_tokens": 550888350.0, "step": 14444 }, { "epoch": 1.8375524742399185, "ewc_loss": 0.06258648633956909, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003011577937286347, "grad_norm": 7.539410591125488, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8707498908042908, "num_tokens": 550925389.0, "step": 14445 }, { "epoch": 1.837679684518509, "ewc_loss": 0.062425583600997925, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002995487884618342, "grad_norm": 7.4372239112854, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8693609237670898, "num_tokens": 550962610.0, "step": 14446 }, { "epoch": 1.8378068947970996, "ewc_loss": 0.06261943280696869, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030148730729706585, "grad_norm": 7.505682945251465, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.862575888633728, "num_tokens": 551001018.0, "step": 14447 }, { "epoch": 1.83793410507569, "ewc_loss": 0.06243455410003662, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002996385155711323, "grad_norm": 7.468377590179443, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8666186332702637, "num_tokens": 551038505.0, "step": 14448 }, { "epoch": 1.8380613153542806, "ewc_loss": 0.06283894926309586, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000301241030683741, "grad_norm": 7.484621524810791, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8773840069770813, "num_tokens": 551072235.0, "step": 14449 }, { "epoch": 1.8381885256328712, "ewc_loss": 0.06280938535928726, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030094539397396147, "grad_norm": 7.468948841094971, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8776542544364929, "num_tokens": 551110129.0, "step": 14450 }, { "epoch": 1.8383157359114617, "ewc_loss": 0.06257705390453339, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003010634973179549, "grad_norm": 7.443781852722168, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.865723729133606, "num_tokens": 551152380.0, "step": 14451 }, { "epoch": 1.8384429461900522, "ewc_loss": 0.06271165609359741, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030240954947657883, "grad_norm": 7.50656795501709, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8753101825714111, "num_tokens": 551195942.0, "step": 14452 }, { "epoch": 1.8385701564686427, "ewc_loss": 0.06256292015314102, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000300922169117257, "grad_norm": 7.46048641204834, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8576512336730957, "num_tokens": 551238502.0, "step": 14453 }, { "epoch": 1.8386973667472333, "ewc_loss": 0.06262469291687012, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030153992702253163, "grad_norm": 7.519134998321533, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8749414086341858, "num_tokens": 551278091.0, "step": 14454 }, { "epoch": 1.8388245770258238, "ewc_loss": 0.06247572600841522, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003000502474606037, "grad_norm": 7.509768486022949, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8443523645401001, "num_tokens": 551314940.0, "step": 14455 }, { "epoch": 1.8389517873044143, "ewc_loss": 0.0626179501414299, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003014724643435329, "grad_norm": 7.485830307006836, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.851059079170227, "num_tokens": 551356427.0, "step": 14456 }, { "epoch": 1.8390789975830049, "ewc_loss": 0.06254199892282486, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000300712970783934, "grad_norm": 7.495306015014648, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8692262768745422, "num_tokens": 551400436.0, "step": 14457 }, { "epoch": 1.8392062078615952, "ewc_loss": 0.06247788667678833, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030007181339897215, "grad_norm": 7.482248783111572, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8629862070083618, "num_tokens": 551438335.0, "step": 14458 }, { "epoch": 1.8393334181401857, "ewc_loss": 0.06253474205732346, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003006403858307749, "grad_norm": 7.544017791748047, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8666091561317444, "num_tokens": 551471214.0, "step": 14459 }, { "epoch": 1.8394606284187762, "ewc_loss": 0.06241852045059204, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029947818256914616, "grad_norm": 7.455219745635986, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8523463010787964, "num_tokens": 551504071.0, "step": 14460 }, { "epoch": 1.8395878386973668, "ewc_loss": 0.06263740360736847, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003016670234501362, "grad_norm": 7.523194313049316, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8597640991210938, "num_tokens": 551538456.0, "step": 14461 }, { "epoch": 1.8397150489759573, "ewc_loss": 0.06238472834229469, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000299140257993713, "grad_norm": 7.426392078399658, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8686298727989197, "num_tokens": 551575334.0, "step": 14462 }, { "epoch": 1.8398422592545478, "ewc_loss": 0.06268759071826935, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030216891900636256, "grad_norm": 7.530335903167725, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8736401796340942, "num_tokens": 551617127.0, "step": 14463 }, { "epoch": 1.8399694695331381, "ewc_loss": 0.06268367171287537, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002996882831212133, "grad_norm": 7.411327362060547, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.870255172252655, "num_tokens": 551655548.0, "step": 14464 }, { "epoch": 1.8400966798117286, "ewc_loss": 0.0629105418920517, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030195695580914617, "grad_norm": 7.531808376312256, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8618887662887573, "num_tokens": 551691807.0, "step": 14465 }, { "epoch": 1.8402238900903192, "ewc_loss": 0.06268184632062912, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002996700059156865, "grad_norm": 7.409505367279053, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8730226755142212, "num_tokens": 551729805.0, "step": 14466 }, { "epoch": 1.8403511003689097, "ewc_loss": 0.06304488331079483, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000303300388623029, "grad_norm": 7.543731212615967, "learning_rate": 1e-06, "loss": 0.5408, "mean_token_accuracy": 0.8416478633880615, "num_tokens": 551775220.0, "step": 14467 }, { "epoch": 1.8404783106475002, "ewc_loss": 0.06271418184041977, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029999337857589126, "grad_norm": 7.474081516265869, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8568115830421448, "num_tokens": 551816242.0, "step": 14468 }, { "epoch": 1.8406055209260908, "ewc_loss": 0.06292498856782913, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030210145632736385, "grad_norm": 7.539167404174805, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8538239002227783, "num_tokens": 551854929.0, "step": 14469 }, { "epoch": 1.8407327312046813, "ewc_loss": 0.06273122131824493, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000300163752399385, "grad_norm": 7.456953525543213, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8634351491928101, "num_tokens": 551898877.0, "step": 14470 }, { "epoch": 1.8408599414832718, "ewc_loss": 0.06283529102802277, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030120450537651777, "grad_norm": 7.468848705291748, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8809943199157715, "num_tokens": 551934252.0, "step": 14471 }, { "epoch": 1.8409871517618623, "ewc_loss": 0.06282839179039001, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030113544198684394, "grad_norm": 7.489110469818115, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8764415383338928, "num_tokens": 551974779.0, "step": 14472 }, { "epoch": 1.8411143620404529, "ewc_loss": 0.06281794607639313, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003010309883393347, "grad_norm": 7.52160120010376, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8646746873855591, "num_tokens": 552011627.0, "step": 14473 }, { "epoch": 1.8412415723190434, "ewc_loss": 0.06277024745941162, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030055406386964023, "grad_norm": 7.540458679199219, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8440504670143127, "num_tokens": 552045333.0, "step": 14474 }, { "epoch": 1.841368782597634, "ewc_loss": 0.06298916786909103, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003003018500749022, "grad_norm": 7.490701198577881, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8584549427032471, "num_tokens": 552079974.0, "step": 14475 }, { "epoch": 1.8414959928762245, "ewc_loss": 0.06282390654087067, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003010906220879406, "grad_norm": 7.496457099914551, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8720913529396057, "num_tokens": 552117507.0, "step": 14476 }, { "epoch": 1.841623203154815, "ewc_loss": 0.06268056482076645, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002996572002302855, "grad_norm": 7.457385063171387, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8637225031852722, "num_tokens": 552156009.0, "step": 14477 }, { "epoch": 1.8417504134334055, "ewc_loss": 0.0628131851553917, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003009834326803684, "grad_norm": 7.527254581451416, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8727763295173645, "num_tokens": 552189092.0, "step": 14478 }, { "epoch": 1.841877623711996, "ewc_loss": 0.06263307482004166, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029918228392489254, "grad_norm": 7.424836158752441, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8623527884483337, "num_tokens": 552228305.0, "step": 14479 }, { "epoch": 1.8420048339905866, "ewc_loss": 0.06277591735124588, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003006107290275395, "grad_norm": 7.478342056274414, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8659175634384155, "num_tokens": 552269402.0, "step": 14480 }, { "epoch": 1.842132044269177, "ewc_loss": 0.0627041682600975, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002998932613991201, "grad_norm": 7.469518661499023, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8608541488647461, "num_tokens": 552304870.0, "step": 14481 }, { "epoch": 1.8422592545477676, "ewc_loss": 0.06275491416454315, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003004006575793028, "grad_norm": 7.488533973693848, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8600092530250549, "num_tokens": 552336802.0, "step": 14482 }, { "epoch": 1.842386464826358, "ewc_loss": 0.0627402514219284, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003002540615852922, "grad_norm": 7.455657482147217, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8781216740608215, "num_tokens": 552376124.0, "step": 14483 }, { "epoch": 1.8425136751049485, "ewc_loss": 0.06285934150218964, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030144499032758176, "grad_norm": 7.57712459564209, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8558341860771179, "num_tokens": 552412285.0, "step": 14484 }, { "epoch": 1.842640885383539, "ewc_loss": 0.06265580654144287, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000299409672152251, "grad_norm": 7.451141834259033, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8651978373527527, "num_tokens": 552447527.0, "step": 14485 }, { "epoch": 1.8427680956621295, "ewc_loss": 0.06282804161310196, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003011319786310196, "grad_norm": 7.506072044372559, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8670836091041565, "num_tokens": 552486569.0, "step": 14486 }, { "epoch": 1.84289530594072, "ewc_loss": 0.06264816224575043, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029933321638964117, "grad_norm": 7.457190990447998, "learning_rate": 1e-06, "loss": 0.594, "mean_token_accuracy": 0.82914137840271, "num_tokens": 552529158.0, "step": 14487 }, { "epoch": 1.8430225162193106, "ewc_loss": 0.06285104155540466, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003013619570992887, "grad_norm": 7.664707660675049, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8637069463729858, "num_tokens": 552572183.0, "step": 14488 }, { "epoch": 1.8431497264979009, "ewc_loss": 0.06256578117609024, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002985093742609024, "grad_norm": 7.471584320068359, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8628894090652466, "num_tokens": 552609785.0, "step": 14489 }, { "epoch": 1.8432769367764914, "ewc_loss": 0.06273740530014038, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030022565624676645, "grad_norm": 7.526986122131348, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8708726763725281, "num_tokens": 552641243.0, "step": 14490 }, { "epoch": 1.843404147055082, "ewc_loss": 0.0625428706407547, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000298280268907547, "grad_norm": 7.482845306396484, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8576292991638184, "num_tokens": 552677595.0, "step": 14491 }, { "epoch": 1.8435313573336725, "ewc_loss": 0.06277864426374435, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003006380284205079, "grad_norm": 7.536444187164307, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8652536869049072, "num_tokens": 552715492.0, "step": 14492 }, { "epoch": 1.843658567612263, "ewc_loss": 0.06265737861394882, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002994253591168672, "grad_norm": 7.4035539627075195, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8642852902412415, "num_tokens": 552757410.0, "step": 14493 }, { "epoch": 1.8437857778908535, "ewc_loss": 0.06274580210447311, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003003095625899732, "grad_norm": 7.5049214363098145, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8803564310073853, "num_tokens": 552793794.0, "step": 14494 }, { "epoch": 1.843912988169444, "ewc_loss": 0.06321822106838226, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.000300150946713984, "grad_norm": 7.460700511932373, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8505354523658752, "num_tokens": 552836010.0, "step": 14495 }, { "epoch": 1.8440401984480346, "ewc_loss": 0.0628041923046112, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003008934436365962, "grad_norm": 7.480745315551758, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8679429292678833, "num_tokens": 552874136.0, "step": 14496 }, { "epoch": 1.844167408726625, "ewc_loss": 0.06286562979221344, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003015078546013683, "grad_norm": 7.522599697113037, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8614693880081177, "num_tokens": 552912314.0, "step": 14497 }, { "epoch": 1.8442946190052156, "ewc_loss": 0.06276637315750122, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030051529756747186, "grad_norm": 7.499293327331543, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8529021739959717, "num_tokens": 552953998.0, "step": 14498 }, { "epoch": 1.8444218292838062, "ewc_loss": 0.0628422424197197, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030127400532364845, "grad_norm": 7.505636215209961, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8844732046127319, "num_tokens": 552987310.0, "step": 14499 }, { "epoch": 1.8445490395623967, "ewc_loss": 0.06281176209449768, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003009691718034446, "grad_norm": 7.587223529815674, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8615065813064575, "num_tokens": 553019830.0, "step": 14500 }, { "epoch": 1.8446762498409872, "ewc_loss": 0.06257427483797073, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002985943283420056, "grad_norm": 7.416000843048096, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8638176918029785, "num_tokens": 553066695.0, "step": 14501 }, { "epoch": 1.8448034601195777, "ewc_loss": 0.06282974779605865, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003011490043718368, "grad_norm": 7.759012222290039, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8557976484298706, "num_tokens": 553098943.0, "step": 14502 }, { "epoch": 1.8449306703981683, "ewc_loss": 0.06240374222397804, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002968889893963933, "grad_norm": 7.420502662658691, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8697220683097839, "num_tokens": 553136106.0, "step": 14503 }, { "epoch": 1.8450578806767588, "ewc_loss": 0.0627993568778038, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000300845131278038, "grad_norm": 7.562375068664551, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8703403472900391, "num_tokens": 553170014.0, "step": 14504 }, { "epoch": 1.8451850909553493, "ewc_loss": 0.06241688132286072, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029702039319090545, "grad_norm": 7.407580852508545, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8585001230239868, "num_tokens": 553213297.0, "step": 14505 }, { "epoch": 1.8453123012339399, "ewc_loss": 0.06282205879688263, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000301072170259431, "grad_norm": 7.6213836669921875, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8660215139389038, "num_tokens": 553248559.0, "step": 14506 }, { "epoch": 1.8454395115125302, "ewc_loss": 0.062474802136421204, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002975995885208249, "grad_norm": 7.40421724319458, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8665581941604614, "num_tokens": 553289140.0, "step": 14507 }, { "epoch": 1.8455667217911207, "ewc_loss": 0.06289356201887131, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003017871640622616, "grad_norm": 7.6585307121276855, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8575984835624695, "num_tokens": 553330619.0, "step": 14508 }, { "epoch": 1.8456939320697112, "ewc_loss": 0.06242101639509201, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029706174973398447, "grad_norm": 7.442859649658203, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.874437153339386, "num_tokens": 553371295.0, "step": 14509 }, { "epoch": 1.8458211423483017, "ewc_loss": 0.06288900971412659, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030174164567142725, "grad_norm": 7.535073280334473, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8607539534568787, "num_tokens": 553410237.0, "step": 14510 }, { "epoch": 1.8459483526268923, "ewc_loss": 0.0625256597995758, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002981081488542259, "grad_norm": 7.553830623626709, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8720647096633911, "num_tokens": 553446727.0, "step": 14511 }, { "epoch": 1.8460755629054828, "ewc_loss": 0.06270013004541397, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029985286528244615, "grad_norm": 7.6040263175964355, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8704593777656555, "num_tokens": 553488626.0, "step": 14512 }, { "epoch": 1.846202773184073, "ewc_loss": 0.06243828684091568, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029723445186391473, "grad_norm": 7.472456932067871, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8568378686904907, "num_tokens": 553528294.0, "step": 14513 }, { "epoch": 1.8463299834626636, "ewc_loss": 0.062394119799137115, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029923414695076644, "grad_norm": 7.516034126281738, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.86868816614151, "num_tokens": 553563738.0, "step": 14514 }, { "epoch": 1.8464571937412542, "ewc_loss": 0.06221427023410797, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002974356757476926, "grad_norm": 7.545269012451172, "learning_rate": 1e-06, "loss": 0.5714, "mean_token_accuracy": 0.831082820892334, "num_tokens": 553603351.0, "step": 14515 }, { "epoch": 1.8465844040198447, "ewc_loss": 0.06232353299856186, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002985282917506993, "grad_norm": 7.488753795623779, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8631724119186401, "num_tokens": 553643655.0, "step": 14516 }, { "epoch": 1.8467116142984352, "ewc_loss": 0.062299080193042755, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002982837613672018, "grad_norm": 7.535848140716553, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8760855197906494, "num_tokens": 553677944.0, "step": 14517 }, { "epoch": 1.8468388245770258, "ewc_loss": 0.0624094232916832, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029694580007344484, "grad_norm": 7.46979284286499, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8595188856124878, "num_tokens": 553712120.0, "step": 14518 }, { "epoch": 1.8469660348556163, "ewc_loss": 0.06259135901927948, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002987651387229562, "grad_norm": 7.434835433959961, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8695155382156372, "num_tokens": 553751442.0, "step": 14519 }, { "epoch": 1.8470932451342068, "ewc_loss": 0.06266621500253677, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029951368924230337, "grad_norm": 7.504855155944824, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8722406625747681, "num_tokens": 553782924.0, "step": 14520 }, { "epoch": 1.8472204554127973, "ewc_loss": 0.062442515045404434, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002972767106257379, "grad_norm": 7.481784343719482, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8565032482147217, "num_tokens": 553818861.0, "step": 14521 }, { "epoch": 1.8473476656913879, "ewc_loss": 0.06266821920871735, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002995337126776576, "grad_norm": 7.474819183349609, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8623594641685486, "num_tokens": 553855593.0, "step": 14522 }, { "epoch": 1.8474748759699784, "ewc_loss": 0.06254648417234421, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002983163867611438, "grad_norm": 7.475582122802734, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8636500239372253, "num_tokens": 553896028.0, "step": 14523 }, { "epoch": 1.847602086248569, "ewc_loss": 0.062628373503685, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002991352812387049, "grad_norm": 7.469812870025635, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8689759373664856, "num_tokens": 553935212.0, "step": 14524 }, { "epoch": 1.8477292965271594, "ewc_loss": 0.06265951693058014, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002994467504322529, "grad_norm": 7.508966445922852, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8637264370918274, "num_tokens": 553971231.0, "step": 14525 }, { "epoch": 1.84785650680575, "ewc_loss": 0.06261388957500458, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029899048968218267, "grad_norm": 7.487946510314941, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8742343187332153, "num_tokens": 554006782.0, "step": 14526 }, { "epoch": 1.8479837170843405, "ewc_loss": 0.06263121217489243, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002991636865772307, "grad_norm": 7.537112712860107, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8577165007591248, "num_tokens": 554043427.0, "step": 14527 }, { "epoch": 1.848110927362931, "ewc_loss": 0.06254096329212189, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002982611767947674, "grad_norm": 7.406837463378906, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8537289500236511, "num_tokens": 554084187.0, "step": 14528 }, { "epoch": 1.8482381376415216, "ewc_loss": 0.0627589002251625, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030044055893085897, "grad_norm": 7.544015884399414, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8645216822624207, "num_tokens": 554122186.0, "step": 14529 }, { "epoch": 1.848365347920112, "ewc_loss": 0.06257347762584686, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029858632478863, "grad_norm": 7.452052116394043, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8639446496963501, "num_tokens": 554161388.0, "step": 14530 }, { "epoch": 1.8484925581987026, "ewc_loss": 0.06269419193267822, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029979352257214487, "grad_norm": 7.534550666809082, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8565301299095154, "num_tokens": 554199837.0, "step": 14531 }, { "epoch": 1.848619768477293, "ewc_loss": 0.06250470876693726, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029789868858642876, "grad_norm": 7.45382833480835, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8538527488708496, "num_tokens": 554239567.0, "step": 14532 }, { "epoch": 1.8487469787558835, "ewc_loss": 0.06273172795772552, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030016887467354536, "grad_norm": 7.519443511962891, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8679919242858887, "num_tokens": 554277040.0, "step": 14533 }, { "epoch": 1.848874189034474, "ewc_loss": 0.06259648501873016, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002988163905683905, "grad_norm": 7.438446998596191, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8701136112213135, "num_tokens": 554318994.0, "step": 14534 }, { "epoch": 1.8490013993130645, "ewc_loss": 0.06275440752506256, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030039562261663377, "grad_norm": 7.591137409210205, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8590319752693176, "num_tokens": 554355334.0, "step": 14535 }, { "epoch": 1.849128609591655, "ewc_loss": 0.06254898756742477, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029834144515916705, "grad_norm": 7.4713029861450195, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8534963130950928, "num_tokens": 554396504.0, "step": 14536 }, { "epoch": 1.8492558198702456, "ewc_loss": 0.06279940158128738, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003008455678354949, "grad_norm": 7.561177730560303, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8705115914344788, "num_tokens": 554429772.0, "step": 14537 }, { "epoch": 1.8493830301488359, "ewc_loss": 0.06259089708328247, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029876051121391356, "grad_norm": 7.4550275802612305, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8670176267623901, "num_tokens": 554470102.0, "step": 14538 }, { "epoch": 1.8495102404274264, "ewc_loss": 0.06284875422716141, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003013390814885497, "grad_norm": 7.544064998626709, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8547793626785278, "num_tokens": 554511632.0, "step": 14539 }, { "epoch": 1.849637450706017, "ewc_loss": 0.0625518336892128, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002983698796015233, "grad_norm": 7.478151798248291, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8602744340896606, "num_tokens": 554553028.0, "step": 14540 }, { "epoch": 1.8497646609846075, "ewc_loss": 0.06277995556592941, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030065112514421344, "grad_norm": 7.603166580200195, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8599798679351807, "num_tokens": 554585109.0, "step": 14541 }, { "epoch": 1.849891871263198, "ewc_loss": 0.06259094178676605, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002987610059790313, "grad_norm": 7.516193866729736, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8650816082954407, "num_tokens": 554621141.0, "step": 14542 }, { "epoch": 1.8500190815417885, "ewc_loss": 0.0626547783613205, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029939934029243886, "grad_norm": 7.517393589019775, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8637844920158386, "num_tokens": 554660838.0, "step": 14543 }, { "epoch": 1.850146291820379, "ewc_loss": 0.06259897351264954, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029884130344726145, "grad_norm": 7.615078926086426, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.864848792552948, "num_tokens": 554696338.0, "step": 14544 }, { "epoch": 1.8502735020989696, "ewc_loss": 0.06248976290225983, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000297749211313203, "grad_norm": 7.496286392211914, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8771836757659912, "num_tokens": 554729782.0, "step": 14545 }, { "epoch": 1.85040071237756, "ewc_loss": 0.0626952201128006, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029980376712046564, "grad_norm": 7.515520095825195, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8807640671730042, "num_tokens": 554767566.0, "step": 14546 }, { "epoch": 1.8505279226561506, "ewc_loss": 0.06259196251630783, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002987711923196912, "grad_norm": 7.533658981323242, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8693047165870667, "num_tokens": 554806063.0, "step": 14547 }, { "epoch": 1.8506551329347412, "ewc_loss": 0.06258558481931686, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002987074258271605, "grad_norm": 7.50058126449585, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8617544174194336, "num_tokens": 554839418.0, "step": 14548 }, { "epoch": 1.8507823432133317, "ewc_loss": 0.06237496808171272, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029904264374636114, "grad_norm": 7.466846466064453, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8602268695831299, "num_tokens": 554881888.0, "step": 14549 }, { "epoch": 1.8509095534919222, "ewc_loss": 0.06260600686073303, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002989116183016449, "grad_norm": 7.4957075119018555, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8832038640975952, "num_tokens": 554921561.0, "step": 14550 }, { "epoch": 1.8510367637705127, "ewc_loss": 0.06272993981838226, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030015091761015356, "grad_norm": 7.5640482902526855, "learning_rate": 1e-06, "loss": 0.5262, "mean_token_accuracy": 0.8410096168518066, "num_tokens": 554954380.0, "step": 14551 }, { "epoch": 1.8511639740491033, "ewc_loss": 0.06257175654172897, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029856915352866054, "grad_norm": 7.458045959472656, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8659515380859375, "num_tokens": 554994781.0, "step": 14552 }, { "epoch": 1.8512911843276938, "ewc_loss": 0.0625675618648529, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003009685897268355, "grad_norm": 7.63056755065918, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8729920387268066, "num_tokens": 555024938.0, "step": 14553 }, { "epoch": 1.8514183946062843, "ewc_loss": 0.06220629811286926, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002973559603560716, "grad_norm": 7.454865455627441, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.8529877066612244, "num_tokens": 555059785.0, "step": 14554 }, { "epoch": 1.8515456048848749, "ewc_loss": 0.0626162439584732, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003014553803950548, "grad_norm": 7.606570720672607, "learning_rate": 1e-06, "loss": 0.5344, "mean_token_accuracy": 0.8395283222198486, "num_tokens": 555097502.0, "step": 14555 }, { "epoch": 1.8516728151634652, "ewc_loss": 0.06245699152350426, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002974214730784297, "grad_norm": 7.4794921875, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8612269163131714, "num_tokens": 555139976.0, "step": 14556 }, { "epoch": 1.8518000254420557, "ewc_loss": 0.0627417117357254, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003002686717081815, "grad_norm": 7.507660865783691, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.867687463760376, "num_tokens": 555181250.0, "step": 14557 }, { "epoch": 1.8519272357206462, "ewc_loss": 0.06260900944471359, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029894165345467627, "grad_norm": 7.6299052238464355, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.846472978591919, "num_tokens": 555214927.0, "step": 14558 }, { "epoch": 1.8520544459992367, "ewc_loss": 0.06245125085115433, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000297364080324769, "grad_norm": 7.506364822387695, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8611557483673096, "num_tokens": 555255795.0, "step": 14559 }, { "epoch": 1.8521816562778273, "ewc_loss": 0.06262841075658798, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029913565958850086, "grad_norm": 7.490151405334473, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8671038150787354, "num_tokens": 555294177.0, "step": 14560 }, { "epoch": 1.8523088665564178, "ewc_loss": 0.062414348125457764, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029699504375457764, "grad_norm": 7.571390151977539, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8766148090362549, "num_tokens": 555331548.0, "step": 14561 }, { "epoch": 1.852436076835008, "ewc_loss": 0.06252609193325043, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029811245622113347, "grad_norm": 7.490859508514404, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8756216168403625, "num_tokens": 555368426.0, "step": 14562 }, { "epoch": 1.8525632871135986, "ewc_loss": 0.0626252070069313, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029910364537499845, "grad_norm": 7.49858283996582, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8810166120529175, "num_tokens": 555411594.0, "step": 14563 }, { "epoch": 1.8526904973921892, "ewc_loss": 0.062487706542015076, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002977286058012396, "grad_norm": 7.546032428741455, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8562124967575073, "num_tokens": 555450412.0, "step": 14564 }, { "epoch": 1.8528177076707797, "ewc_loss": 0.06255771219730377, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002984286693390459, "grad_norm": 7.520650863647461, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8725870847702026, "num_tokens": 555489067.0, "step": 14565 }, { "epoch": 1.8529449179493702, "ewc_loss": 0.06292001903057098, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0002971689682453871, "grad_norm": 7.581062316894531, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8706831336021423, "num_tokens": 555524552.0, "step": 14566 }, { "epoch": 1.8530721282279607, "ewc_loss": 0.06252308934926987, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029808247927576303, "grad_norm": 7.549241542816162, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8385546803474426, "num_tokens": 555563766.0, "step": 14567 }, { "epoch": 1.8531993385065513, "ewc_loss": 0.06253769248723984, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000298228464089334, "grad_norm": 7.4746174812316895, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8720971345901489, "num_tokens": 555601410.0, "step": 14568 }, { "epoch": 1.8533265487851418, "ewc_loss": 0.06255855411291122, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002984370803460479, "grad_norm": 7.776945114135742, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8633920550346375, "num_tokens": 555641346.0, "step": 14569 }, { "epoch": 1.8534537590637323, "ewc_loss": 0.062346816062927246, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002963197184726596, "grad_norm": 7.454935550689697, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8738682270050049, "num_tokens": 555676537.0, "step": 14570 }, { "epoch": 1.8535809693423229, "ewc_loss": 0.06276718527078629, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003005234175361693, "grad_norm": 7.562216281890869, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8506678342819214, "num_tokens": 555721634.0, "step": 14571 }, { "epoch": 1.8537081796209134, "ewc_loss": 0.06208790838718414, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002961720456369221, "grad_norm": 7.435510635375977, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8651491403579712, "num_tokens": 555765447.0, "step": 14572 }, { "epoch": 1.853835389899504, "ewc_loss": 0.06276100128889084, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003004616010002792, "grad_norm": 7.6877946853637695, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8637057542800903, "num_tokens": 555798598.0, "step": 14573 }, { "epoch": 1.8539626001780944, "ewc_loss": 0.06236753612756729, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002965269086416811, "grad_norm": 7.492201805114746, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8725008368492126, "num_tokens": 555829465.0, "step": 14574 }, { "epoch": 1.854089810456685, "ewc_loss": 0.06279699504375458, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003008215280715376, "grad_norm": 7.632600784301758, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8658758401870728, "num_tokens": 555866577.0, "step": 14575 }, { "epoch": 1.8542170207352755, "ewc_loss": 0.06236662343144417, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002965177991427481, "grad_norm": 7.516310214996338, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8669825792312622, "num_tokens": 555900472.0, "step": 14576 }, { "epoch": 1.854344231013866, "ewc_loss": 0.06267410516738892, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029959261883050203, "grad_norm": 7.617547512054443, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.858778715133667, "num_tokens": 555936356.0, "step": 14577 }, { "epoch": 1.8544714412924566, "ewc_loss": 0.0624953955411911, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029780552722513676, "grad_norm": 7.553727149963379, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8575059771537781, "num_tokens": 555975338.0, "step": 14578 }, { "epoch": 1.854598651571047, "ewc_loss": 0.06261669099330902, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002990185166709125, "grad_norm": 7.575851917266846, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8548728227615356, "num_tokens": 556019651.0, "step": 14579 }, { "epoch": 1.8547258618496376, "ewc_loss": 0.06267593055963516, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002996108669321984, "grad_norm": 7.599830150604248, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8602781891822815, "num_tokens": 556054530.0, "step": 14580 }, { "epoch": 1.854853072128228, "ewc_loss": 0.06251081079244614, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002979596611112356, "grad_norm": 7.612194061279297, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8600937724113464, "num_tokens": 556091985.0, "step": 14581 }, { "epoch": 1.8549802824068184, "ewc_loss": 0.06263305246829987, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002991821093019098, "grad_norm": 7.586395740509033, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8681001663208008, "num_tokens": 556128777.0, "step": 14582 }, { "epoch": 1.855107492685409, "ewc_loss": 0.06257984787225723, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002986500330734998, "grad_norm": 7.655515193939209, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8846490383148193, "num_tokens": 556164286.0, "step": 14583 }, { "epoch": 1.8552347029639995, "ewc_loss": 0.06254177540540695, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002982692967634648, "grad_norm": 7.512872695922852, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8722795844078064, "num_tokens": 556199141.0, "step": 14584 }, { "epoch": 1.85536191324259, "ewc_loss": 0.06272508203983307, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030010234331712127, "grad_norm": 7.78349494934082, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8696703910827637, "num_tokens": 556237104.0, "step": 14585 }, { "epoch": 1.8554891235211806, "ewc_loss": 0.06234312057495117, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029628275660797954, "grad_norm": 7.489654064178467, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8542317152023315, "num_tokens": 556275343.0, "step": 14586 }, { "epoch": 1.8556163337997709, "ewc_loss": 0.06278776377439499, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003007291816174984, "grad_norm": 7.673279762268066, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.881057620048523, "num_tokens": 556316539.0, "step": 14587 }, { "epoch": 1.8557435440783614, "ewc_loss": 0.062374744564294815, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002965989988297224, "grad_norm": 7.488220691680908, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8754029273986816, "num_tokens": 556351101.0, "step": 14588 }, { "epoch": 1.855870754356952, "ewc_loss": 0.06271249800920486, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002999765274580568, "grad_norm": 7.614673614501953, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8762610554695129, "num_tokens": 556382458.0, "step": 14589 }, { "epoch": 1.8559979646355425, "ewc_loss": 0.06252975761890411, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002981490979436785, "grad_norm": 7.591553211212158, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.866884171962738, "num_tokens": 556416980.0, "step": 14590 }, { "epoch": 1.856125174914133, "ewc_loss": 0.06258510053157806, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029870253638364375, "grad_norm": 7.616404056549072, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8698310256004333, "num_tokens": 556451868.0, "step": 14591 }, { "epoch": 1.8562523851927235, "ewc_loss": 0.06249682605266571, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029781984630972147, "grad_norm": 7.587128162384033, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8539422750473022, "num_tokens": 556485682.0, "step": 14592 }, { "epoch": 1.856379595471314, "ewc_loss": 0.06252735108137131, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029812505817972124, "grad_norm": 7.512299060821533, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8762357234954834, "num_tokens": 556524857.0, "step": 14593 }, { "epoch": 1.8565068057499046, "ewc_loss": 0.06264571845531464, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002993087691720575, "grad_norm": 7.557356357574463, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8693455457687378, "num_tokens": 556565763.0, "step": 14594 }, { "epoch": 1.856634016028495, "ewc_loss": 0.06261343508958817, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002989858912769705, "grad_norm": 7.5807623863220215, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8541133403778076, "num_tokens": 556606202.0, "step": 14595 }, { "epoch": 1.8567612263070856, "ewc_loss": 0.06262446194887161, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029909616569057107, "grad_norm": 7.498178005218506, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.873856246471405, "num_tokens": 556646906.0, "step": 14596 }, { "epoch": 1.8568884365856761, "ewc_loss": 0.06271792948246002, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030003083520568907, "grad_norm": 7.592089653015137, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.866760790348053, "num_tokens": 556684249.0, "step": 14597 }, { "epoch": 1.8570156468642667, "ewc_loss": 0.06256866455078125, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029853818705305457, "grad_norm": 7.59634256362915, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8535269498825073, "num_tokens": 556726840.0, "step": 14598 }, { "epoch": 1.8571428571428572, "ewc_loss": 0.06260181963443756, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029886976699344814, "grad_norm": 7.564840316772461, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.86697918176651, "num_tokens": 556764822.0, "step": 14599 }, { "epoch": 1.8572700674214477, "ewc_loss": 0.062588170170784, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002987332409247756, "grad_norm": 7.651366233825684, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8640037775039673, "num_tokens": 556800884.0, "step": 14600 }, { "epoch": 1.8573972777000383, "ewc_loss": 0.06239698827266693, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002968214685097337, "grad_norm": 7.498301029205322, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8715726137161255, "num_tokens": 556841233.0, "step": 14601 }, { "epoch": 1.8575244879786288, "ewc_loss": 0.06245475634932518, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002998405252583325, "grad_norm": 7.613681316375732, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8589056134223938, "num_tokens": 556880995.0, "step": 14602 }, { "epoch": 1.8576516982572193, "ewc_loss": 0.06239354982972145, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002967870677821338, "grad_norm": 7.478173732757568, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8693392276763916, "num_tokens": 556914367.0, "step": 14603 }, { "epoch": 1.8577789085358098, "ewc_loss": 0.06276591122150421, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003005106991622597, "grad_norm": 7.614223957061768, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8455023765563965, "num_tokens": 556955944.0, "step": 14604 }, { "epoch": 1.8579061188144002, "ewc_loss": 0.062267813831567764, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002979710989166051, "grad_norm": 7.5904741287231445, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.848649263381958, "num_tokens": 556989851.0, "step": 14605 }, { "epoch": 1.8580333290929907, "ewc_loss": 0.06229200214147568, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029821298085153103, "grad_norm": 7.487853050231934, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8634098768234253, "num_tokens": 557033662.0, "step": 14606 }, { "epoch": 1.8581605393715812, "ewc_loss": 0.06231999397277832, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029849293059669435, "grad_norm": 7.4764299392700195, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8580750823020935, "num_tokens": 557074869.0, "step": 14607 }, { "epoch": 1.8582877496501717, "ewc_loss": 0.062239356338977814, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029768652166239917, "grad_norm": 7.527519702911377, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.853884220123291, "num_tokens": 557110164.0, "step": 14608 }, { "epoch": 1.8584149599287623, "ewc_loss": 0.06234108656644821, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029870381695218384, "grad_norm": 7.529176712036133, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8647174835205078, "num_tokens": 557149309.0, "step": 14609 }, { "epoch": 1.8585421702073528, "ewc_loss": 0.06228070706129074, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00029810002888552845, "grad_norm": 7.518002033233643, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8602127432823181, "num_tokens": 557180510.0, "step": 14610 }, { "epoch": 1.858669380485943, "ewc_loss": 0.06262359768152237, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002990875509567559, "grad_norm": 7.550270080566406, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8702147006988525, "num_tokens": 557218062.0, "step": 14611 }, { "epoch": 1.8587965907645336, "ewc_loss": 0.06257468461990356, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002985983737744391, "grad_norm": 7.4933319091796875, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8562939763069153, "num_tokens": 557258963.0, "step": 14612 }, { "epoch": 1.8589238010431242, "ewc_loss": 0.06240168213844299, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0002993097878061235, "grad_norm": 7.554385185241699, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8662851452827454, "num_tokens": 557296046.0, "step": 14613 }, { "epoch": 1.8590510113217147, "ewc_loss": 0.06255555152893066, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000298407074296847, "grad_norm": 7.50799036026001, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8633334040641785, "num_tokens": 557328395.0, "step": 14614 }, { "epoch": 1.8591782216003052, "ewc_loss": 0.06268728524446487, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002997244009748101, "grad_norm": 7.519049644470215, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8674311637878418, "num_tokens": 557363888.0, "step": 14615 }, { "epoch": 1.8593054318788957, "ewc_loss": 0.06270353496074677, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029988688766025007, "grad_norm": 7.5544209480285645, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8560482263565063, "num_tokens": 557402944.0, "step": 14616 }, { "epoch": 1.8594326421574863, "ewc_loss": 0.06258570402860641, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029870858998037875, "grad_norm": 7.4705400466918945, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8701663017272949, "num_tokens": 557442247.0, "step": 14617 }, { "epoch": 1.8595598524360768, "ewc_loss": 0.06273899972438812, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003002415469381958, "grad_norm": 7.555529594421387, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8581665754318237, "num_tokens": 557483580.0, "step": 14618 }, { "epoch": 1.8596870627146673, "ewc_loss": 0.06264165043830872, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002992680238094181, "grad_norm": 7.488802909851074, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8680947422981262, "num_tokens": 557520134.0, "step": 14619 }, { "epoch": 1.8598142729932579, "ewc_loss": 0.06271237134933472, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002999752468895167, "grad_norm": 7.551521301269531, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8589339256286621, "num_tokens": 557565726.0, "step": 14620 }, { "epoch": 1.8599414832718484, "ewc_loss": 0.06261391192674637, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002989906643051654, "grad_norm": 7.507087707519531, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8631014823913574, "num_tokens": 557602871.0, "step": 14621 }, { "epoch": 1.860068693550439, "ewc_loss": 0.06280206143856049, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030087216873653233, "grad_norm": 7.538562774658203, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8575319051742554, "num_tokens": 557640873.0, "step": 14622 }, { "epoch": 1.8601959038290294, "ewc_loss": 0.06269305944442749, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002997821429744363, "grad_norm": 7.535858154296875, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8586378693580627, "num_tokens": 557684233.0, "step": 14623 }, { "epoch": 1.86032311410762, "ewc_loss": 0.06277570873498917, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030060866265557706, "grad_norm": 7.55671501159668, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.870453953742981, "num_tokens": 557716895.0, "step": 14624 }, { "epoch": 1.8604503243862105, "ewc_loss": 0.06269601732492447, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002998117415700108, "grad_norm": 7.4955925941467285, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8619280457496643, "num_tokens": 557758197.0, "step": 14625 }, { "epoch": 1.860577534664801, "ewc_loss": 0.06278157234191895, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003006672777701169, "grad_norm": 7.656390190124512, "learning_rate": 1e-06, "loss": 0.5603, "mean_token_accuracy": 0.8373169898986816, "num_tokens": 557794196.0, "step": 14626 }, { "epoch": 1.8607047449433916, "ewc_loss": 0.06250816583633423, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029793320572935045, "grad_norm": 7.485988140106201, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8480576872825623, "num_tokens": 557833856.0, "step": 14627 }, { "epoch": 1.860831955221982, "ewc_loss": 0.06291145086288452, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003019660653080791, "grad_norm": 7.632025718688965, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8514248132705688, "num_tokens": 557866304.0, "step": 14628 }, { "epoch": 1.8609591655005726, "ewc_loss": 0.06253306567668915, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029818221810273826, "grad_norm": 7.515748977661133, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8766295909881592, "num_tokens": 557907065.0, "step": 14629 }, { "epoch": 1.861086375779163, "ewc_loss": 0.06287510693073273, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030160267488099635, "grad_norm": 7.553145885467529, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.859696626663208, "num_tokens": 557947326.0, "step": 14630 }, { "epoch": 1.8612135860577534, "ewc_loss": 0.06313179433345795, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00029928673757240176, "grad_norm": 7.49080228805542, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.863010048866272, "num_tokens": 557990059.0, "step": 14631 }, { "epoch": 1.861340796336344, "ewc_loss": 0.06277267634868622, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003005783073604107, "grad_norm": 7.66914176940918, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8661230802536011, "num_tokens": 558028406.0, "step": 14632 }, { "epoch": 1.8614680066149345, "ewc_loss": 0.06250686198472977, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002979201963171363, "grad_norm": 7.460242748260498, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.86107337474823, "num_tokens": 558070239.0, "step": 14633 }, { "epoch": 1.861595216893525, "ewc_loss": 0.06292908638715744, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030214243452064693, "grad_norm": 7.5877814292907715, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8683987855911255, "num_tokens": 558107685.0, "step": 14634 }, { "epoch": 1.8617224271721156, "ewc_loss": 0.06265488266944885, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002994003880303353, "grad_norm": 7.5787835121154785, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8575267791748047, "num_tokens": 558143185.0, "step": 14635 }, { "epoch": 1.8618496374507059, "ewc_loss": 0.06284980475902557, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003013495879713446, "grad_norm": 7.567241191864014, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8725435137748718, "num_tokens": 558176207.0, "step": 14636 }, { "epoch": 1.8619768477292964, "ewc_loss": 0.06272879242897034, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030013947980478406, "grad_norm": 7.596147060394287, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8808674812316895, "num_tokens": 558212215.0, "step": 14637 }, { "epoch": 1.862104058007887, "ewc_loss": 0.06269420683383942, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029979366809129715, "grad_norm": 7.5536675453186035, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8731970191001892, "num_tokens": 558248471.0, "step": 14638 }, { "epoch": 1.8622312682864774, "ewc_loss": 0.06270167231559753, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002998682903125882, "grad_norm": 7.543004512786865, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8504500389099121, "num_tokens": 558285211.0, "step": 14639 }, { "epoch": 1.862358478565068, "ewc_loss": 0.06257744133472443, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002986259351018816, "grad_norm": 7.626095771789551, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8570057153701782, "num_tokens": 558314434.0, "step": 14640 }, { "epoch": 1.8624856888436585, "ewc_loss": 0.06262005865573883, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029905213159509003, "grad_norm": 7.515465259552002, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8717237114906311, "num_tokens": 558353710.0, "step": 14641 }, { "epoch": 1.862612899122249, "ewc_loss": 0.06260792911052704, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029893082682974637, "grad_norm": 7.55305290222168, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8527100086212158, "num_tokens": 558390476.0, "step": 14642 }, { "epoch": 1.8627401094008396, "ewc_loss": 0.06254427880048752, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002982943260576576, "grad_norm": 7.451089382171631, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8728539943695068, "num_tokens": 558424917.0, "step": 14643 }, { "epoch": 1.86286731967943, "ewc_loss": 0.0627971813082695, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030082339071668684, "grad_norm": 7.598124027252197, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8569995164871216, "num_tokens": 558459831.0, "step": 14644 }, { "epoch": 1.8629945299580206, "ewc_loss": 0.06247928738594055, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029764443752355874, "grad_norm": 7.477702617645264, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8791334629058838, "num_tokens": 558498600.0, "step": 14645 }, { "epoch": 1.8631217402366111, "ewc_loss": 0.06287049502134323, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030155651620589197, "grad_norm": 7.547488212585449, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8556928634643555, "num_tokens": 558537465.0, "step": 14646 }, { "epoch": 1.8632489505152017, "ewc_loss": 0.06261952221393585, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029904674738645554, "grad_norm": 7.5132646560668945, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8832613825798035, "num_tokens": 558575357.0, "step": 14647 }, { "epoch": 1.8633761607937922, "ewc_loss": 0.06278570741415024, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003007086634170264, "grad_norm": 7.531780242919922, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8611525297164917, "num_tokens": 558619422.0, "step": 14648 }, { "epoch": 1.8635033710723827, "ewc_loss": 0.06257803738117218, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002986319304909557, "grad_norm": 7.558366775512695, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8553258180618286, "num_tokens": 558657677.0, "step": 14649 }, { "epoch": 1.8636305813509733, "ewc_loss": 0.06269925832748413, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029984410502947867, "grad_norm": 7.5976948738098145, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8726952075958252, "num_tokens": 558692912.0, "step": 14650 }, { "epoch": 1.8637577916295638, "ewc_loss": 0.0625448152422905, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002982997102662921, "grad_norm": 7.475195407867432, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8761327862739563, "num_tokens": 558733470.0, "step": 14651 }, { "epoch": 1.8638850019081543, "ewc_loss": 0.06275438517332077, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003003954188898206, "grad_norm": 7.595673561096191, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8603947162628174, "num_tokens": 558771899.0, "step": 14652 }, { "epoch": 1.8640122121867448, "ewc_loss": 0.06250052154064178, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002978567499667406, "grad_norm": 7.479724884033203, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.869457483291626, "num_tokens": 558813002.0, "step": 14653 }, { "epoch": 1.8641394224653351, "ewc_loss": 0.06287690252065659, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030162057373672724, "grad_norm": 7.665981292724609, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8488409519195557, "num_tokens": 558850827.0, "step": 14654 }, { "epoch": 1.8642666327439257, "ewc_loss": 0.062460415065288544, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002974556991830468, "grad_norm": 7.4506378173828125, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8614035844802856, "num_tokens": 558889003.0, "step": 14655 }, { "epoch": 1.8643938430225162, "ewc_loss": 0.06281977891921997, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003010493819601834, "grad_norm": 7.588159561157227, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8702749609947205, "num_tokens": 558925849.0, "step": 14656 }, { "epoch": 1.8645210533011067, "ewc_loss": 0.06262395530939102, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002990911016240716, "grad_norm": 7.47794771194458, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8631269335746765, "num_tokens": 558967471.0, "step": 14657 }, { "epoch": 1.8646482635796973, "ewc_loss": 0.06296201050281525, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030247162794694304, "grad_norm": 7.583698749542236, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8676278591156006, "num_tokens": 559001938.0, "step": 14658 }, { "epoch": 1.8647754738582878, "ewc_loss": 0.06261366605758667, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029898821958340704, "grad_norm": 7.485651016235352, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8710893392562866, "num_tokens": 559040515.0, "step": 14659 }, { "epoch": 1.864902684136878, "ewc_loss": 0.06298619508743286, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030271350988186896, "grad_norm": 7.572391510009766, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8780230283737183, "num_tokens": 559077702.0, "step": 14660 }, { "epoch": 1.8650298944154686, "ewc_loss": 0.06269405782222748, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002997921546921134, "grad_norm": 7.488182544708252, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.857387125492096, "num_tokens": 559114741.0, "step": 14661 }, { "epoch": 1.8651571046940592, "ewc_loss": 0.06299702078104019, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030282174702733755, "grad_norm": 7.592395305633545, "learning_rate": 1e-06, "loss": 0.5382, "mean_token_accuracy": 0.8426905870437622, "num_tokens": 559155844.0, "step": 14662 }, { "epoch": 1.8652843149726497, "ewc_loss": 0.06276660412549973, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003005176258739084, "grad_norm": 7.490067958831787, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8666772842407227, "num_tokens": 559194964.0, "step": 14663 }, { "epoch": 1.8654115252512402, "ewc_loss": 0.06287306547164917, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003015822439920157, "grad_norm": 7.611462593078613, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8706439137458801, "num_tokens": 559230083.0, "step": 14664 }, { "epoch": 1.8655387355298307, "ewc_loss": 0.06270605325698853, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029991206247359514, "grad_norm": 7.437067985534668, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8758692741394043, "num_tokens": 559268397.0, "step": 14665 }, { "epoch": 1.8656659458084213, "ewc_loss": 0.06307575106620789, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030360909295268357, "grad_norm": 7.576848983764648, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.856096088886261, "num_tokens": 559302024.0, "step": 14666 }, { "epoch": 1.8657931560870118, "ewc_loss": 0.06259819865226746, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002988335327245295, "grad_norm": 7.472250938415527, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8623353838920593, "num_tokens": 559343192.0, "step": 14667 }, { "epoch": 1.8659203663656023, "ewc_loss": 0.06313544511795044, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003042059834115207, "grad_norm": 7.582315444946289, "learning_rate": 1e-06, "loss": 0.3888, "mean_token_accuracy": 0.8877828121185303, "num_tokens": 559380616.0, "step": 14668 }, { "epoch": 1.8660475766441929, "ewc_loss": 0.06281140446662903, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000300965562928468, "grad_norm": 7.509010314941406, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8748914003372192, "num_tokens": 559413621.0, "step": 14669 }, { "epoch": 1.8661747869227834, "ewc_loss": 0.06292349100112915, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003020864969585091, "grad_norm": 7.5605621337890625, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8732427358627319, "num_tokens": 559447274.0, "step": 14670 }, { "epoch": 1.866301997201374, "ewc_loss": 0.06255112588405609, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003008042403962463, "grad_norm": 7.519413471221924, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8743247985839844, "num_tokens": 559484306.0, "step": 14671 }, { "epoch": 1.8664292074799644, "ewc_loss": 0.06285515427589417, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000301403139019385, "grad_norm": 7.523340225219727, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8711324334144592, "num_tokens": 559527126.0, "step": 14672 }, { "epoch": 1.866556417758555, "ewc_loss": 0.06287561357021332, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030160765163600445, "grad_norm": 7.516399383544922, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8691083192825317, "num_tokens": 559568342.0, "step": 14673 }, { "epoch": 1.8666836280371455, "ewc_loss": 0.06281717866659164, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030102336313575506, "grad_norm": 7.5443243980407715, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8608970046043396, "num_tokens": 559609597.0, "step": 14674 }, { "epoch": 1.866810838315736, "ewc_loss": 0.062788225710392, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000300733809126541, "grad_norm": 7.525807857513428, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8622123003005981, "num_tokens": 559646392.0, "step": 14675 }, { "epoch": 1.8669380485943265, "ewc_loss": 0.062968909740448, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030254063312895596, "grad_norm": 7.5587334632873535, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8676518201828003, "num_tokens": 559684836.0, "step": 14676 }, { "epoch": 1.867065258872917, "ewc_loss": 0.06289058178663254, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003017573617398739, "grad_norm": 7.497453212738037, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8688586354255676, "num_tokens": 559727793.0, "step": 14677 }, { "epoch": 1.8671924691515076, "ewc_loss": 0.06303047388792038, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030315632466226816, "grad_norm": 7.570849895477295, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8543121218681335, "num_tokens": 559768154.0, "step": 14678 }, { "epoch": 1.867319679430098, "ewc_loss": 0.06275546550750732, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003004062455147505, "grad_norm": 7.508147239685059, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8649501204490662, "num_tokens": 559798732.0, "step": 14679 }, { "epoch": 1.8674468897086884, "ewc_loss": 0.063010573387146, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003029573417734355, "grad_norm": 7.53348445892334, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8649030923843384, "num_tokens": 559842796.0, "step": 14680 }, { "epoch": 1.867574099987279, "ewc_loss": 0.0625511035323143, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003008039784617722, "grad_norm": 7.516115188598633, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8622720241546631, "num_tokens": 559884198.0, "step": 14681 }, { "epoch": 1.8677013102658695, "ewc_loss": 0.06264261901378632, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030171917751431465, "grad_norm": 7.581714153289795, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8503421545028687, "num_tokens": 559916749.0, "step": 14682 }, { "epoch": 1.86782852054446, "ewc_loss": 0.06262923777103424, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030158538720570505, "grad_norm": 7.601620197296143, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8596614599227905, "num_tokens": 559958430.0, "step": 14683 }, { "epoch": 1.8679557308230506, "ewc_loss": 0.06282760202884674, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003011275839526206, "grad_norm": 7.5477728843688965, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8823603391647339, "num_tokens": 559995720.0, "step": 14684 }, { "epoch": 1.8680829411016409, "ewc_loss": 0.06282864511013031, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003011379740200937, "grad_norm": 7.539384841918945, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8679506778717041, "num_tokens": 560033304.0, "step": 14685 }, { "epoch": 1.8682101513802314, "ewc_loss": 0.06258074194192886, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003011003718711436, "grad_norm": 7.516124725341797, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8625979423522949, "num_tokens": 560069879.0, "step": 14686 }, { "epoch": 1.868337361658822, "ewc_loss": 0.06269331276416779, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030222610803321004, "grad_norm": 7.590632438659668, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8632215261459351, "num_tokens": 560106853.0, "step": 14687 }, { "epoch": 1.8684645719374124, "ewc_loss": 0.06259635835886002, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030125657212920487, "grad_norm": 7.527670383453369, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8654131889343262, "num_tokens": 560144225.0, "step": 14688 }, { "epoch": 1.868591782216003, "ewc_loss": 0.06295451521873474, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030239668558351696, "grad_norm": 7.594127655029297, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8580300211906433, "num_tokens": 560186520.0, "step": 14689 }, { "epoch": 1.8687189924945935, "ewc_loss": 0.06252126395702362, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030050560599192977, "grad_norm": 7.531610012054443, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8679771423339844, "num_tokens": 560225458.0, "step": 14690 }, { "epoch": 1.868846202773184, "ewc_loss": 0.06260327249765396, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003013256937265396, "grad_norm": 7.544440746307373, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8708200454711914, "num_tokens": 560263401.0, "step": 14691 }, { "epoch": 1.8689734130517746, "ewc_loss": 0.06253577768802643, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003006507467944175, "grad_norm": 7.505714416503906, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8614609837532043, "num_tokens": 560305026.0, "step": 14692 }, { "epoch": 1.869100623330365, "ewc_loss": 0.06265594810247421, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030185244395397604, "grad_norm": 7.504889011383057, "learning_rate": 1e-06, "loss": 0.561, "mean_token_accuracy": 0.8322731256484985, "num_tokens": 560354823.0, "step": 14693 }, { "epoch": 1.8692278336089556, "ewc_loss": 0.062624491751194, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030153788975439966, "grad_norm": 7.592310905456543, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8809263706207275, "num_tokens": 560391888.0, "step": 14694 }, { "epoch": 1.8693550438875461, "ewc_loss": 0.062492191791534424, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030021488782949746, "grad_norm": 7.479369163513184, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8676079511642456, "num_tokens": 560425350.0, "step": 14695 }, { "epoch": 1.8694822541661367, "ewc_loss": 0.06279784440994263, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003032714012078941, "grad_norm": 7.583406448364258, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8752454519271851, "num_tokens": 560458521.0, "step": 14696 }, { "epoch": 1.8696094644447272, "ewc_loss": 0.0625176951289177, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030046989559195936, "grad_norm": 7.463731288909912, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8595662117004395, "num_tokens": 560506378.0, "step": 14697 }, { "epoch": 1.8697366747233177, "ewc_loss": 0.06285184621810913, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003038114227820188, "grad_norm": 7.5822672843933105, "learning_rate": 1e-06, "loss": 0.3848, "mean_token_accuracy": 0.886940062046051, "num_tokens": 560544055.0, "step": 14698 }, { "epoch": 1.8698638850019083, "ewc_loss": 0.06269030272960663, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002997545525431633, "grad_norm": 7.510218620300293, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8746771216392517, "num_tokens": 560577240.0, "step": 14699 }, { "epoch": 1.8699910952804988, "ewc_loss": 0.06285280734300613, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003038210270460695, "grad_norm": 7.652385234832764, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8600184917449951, "num_tokens": 560615662.0, "step": 14700 }, { "epoch": 1.8701183055590893, "ewc_loss": 0.06268563121557236, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029970789910294116, "grad_norm": 7.434477806091309, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8591580390930176, "num_tokens": 560658386.0, "step": 14701 }, { "epoch": 1.8702455158376798, "ewc_loss": 0.06313785910606384, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030423016869463027, "grad_norm": 7.626769542694092, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8640505075454712, "num_tokens": 560696366.0, "step": 14702 }, { "epoch": 1.8703727261162701, "ewc_loss": 0.0627267062664032, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003001185832545161, "grad_norm": 7.533121109008789, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8528916239738464, "num_tokens": 560736419.0, "step": 14703 }, { "epoch": 1.8704999363948607, "ewc_loss": 0.06308843195438385, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030373584013432264, "grad_norm": 7.556044578552246, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8635637164115906, "num_tokens": 560777839.0, "step": 14704 }, { "epoch": 1.8706271466734512, "ewc_loss": 0.06282743066549301, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030112589593045413, "grad_norm": 7.530636310577393, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8744577169418335, "num_tokens": 560816167.0, "step": 14705 }, { "epoch": 1.8707543569520417, "ewc_loss": 0.0629551038146019, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030240259366109967, "grad_norm": 7.556163787841797, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8510652184486389, "num_tokens": 560854524.0, "step": 14706 }, { "epoch": 1.8708815672306323, "ewc_loss": 0.06292479485273361, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030209950637072325, "grad_norm": 7.505706310272217, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8618850111961365, "num_tokens": 560894393.0, "step": 14707 }, { "epoch": 1.8710087775092228, "ewc_loss": 0.06291267275810242, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003019783180207014, "grad_norm": 7.528570175170898, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8759365081787109, "num_tokens": 560938640.0, "step": 14708 }, { "epoch": 1.871135987787813, "ewc_loss": 0.06298220157623291, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030267355032265186, "grad_norm": 7.555021286010742, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8644794225692749, "num_tokens": 560976805.0, "step": 14709 }, { "epoch": 1.8712631980664036, "ewc_loss": 0.06291534006595612, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003020049771293998, "grad_norm": 7.465034484863281, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8605213165283203, "num_tokens": 561018940.0, "step": 14710 }, { "epoch": 1.8713904083449941, "ewc_loss": 0.06309422850608826, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030379381496459246, "grad_norm": 7.5364813804626465, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8691529631614685, "num_tokens": 561065228.0, "step": 14711 }, { "epoch": 1.8715176186235847, "ewc_loss": 0.06295453011989594, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030239688931033015, "grad_norm": 7.528473854064941, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8565466403961182, "num_tokens": 561107540.0, "step": 14712 }, { "epoch": 1.8716448289021752, "ewc_loss": 0.06307871639728546, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030363869154825807, "grad_norm": 7.604278564453125, "learning_rate": 1e-06, "loss": 0.5527, "mean_token_accuracy": 0.8439508080482483, "num_tokens": 561144774.0, "step": 14713 }, { "epoch": 1.8717720391807657, "ewc_loss": 0.06293350458145142, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030218661413528025, "grad_norm": 7.574944496154785, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8457982540130615, "num_tokens": 561186774.0, "step": 14714 }, { "epoch": 1.8718992494593563, "ewc_loss": 0.06298771500587463, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003027286729775369, "grad_norm": 7.54640531539917, "learning_rate": 1e-06, "loss": 0.4094, "mean_token_accuracy": 0.8768568634986877, "num_tokens": 561223997.0, "step": 14715 }, { "epoch": 1.8720264597379468, "ewc_loss": 0.06299892067909241, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030284072272479534, "grad_norm": 7.573582172393799, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8725908994674683, "num_tokens": 561261583.0, "step": 14716 }, { "epoch": 1.8721536700165373, "ewc_loss": 0.06300108879804611, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003028624632861465, "grad_norm": 7.5112152099609375, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.869242787361145, "num_tokens": 561302681.0, "step": 14717 }, { "epoch": 1.8722808802951278, "ewc_loss": 0.06310513615608215, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030390292522497475, "grad_norm": 7.59414529800415, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8632295727729797, "num_tokens": 561338402.0, "step": 14718 }, { "epoch": 1.8724080905737184, "ewc_loss": 0.06287794560194016, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003016310220118612, "grad_norm": 7.475830078125, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8691897392272949, "num_tokens": 561372892.0, "step": 14719 }, { "epoch": 1.872535300852309, "ewc_loss": 0.06320434808731079, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003048950165975839, "grad_norm": 7.600967884063721, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8620028495788574, "num_tokens": 561409474.0, "step": 14720 }, { "epoch": 1.8726625111308994, "ewc_loss": 0.06286319345235825, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003014835237991065, "grad_norm": 7.506131649017334, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8377504348754883, "num_tokens": 561450152.0, "step": 14721 }, { "epoch": 1.87278972140949, "ewc_loss": 0.06322990357875824, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030515057733282447, "grad_norm": 7.548678874969482, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.85907381772995, "num_tokens": 561489448.0, "step": 14722 }, { "epoch": 1.8729169316880805, "ewc_loss": 0.06295612454414368, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030241280910559, "grad_norm": 7.514881610870361, "learning_rate": 1e-06, "loss": 0.5362, "mean_token_accuracy": 0.8414725065231323, "num_tokens": 561529684.0, "step": 14723 }, { "epoch": 1.873044141966671, "ewc_loss": 0.06315182149410248, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003043697797693312, "grad_norm": 7.5799078941345215, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8660922050476074, "num_tokens": 561564259.0, "step": 14724 }, { "epoch": 1.8731713522452615, "ewc_loss": 0.06306760758161545, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003035276604350656, "grad_norm": 7.443238735198975, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8713757991790771, "num_tokens": 561609650.0, "step": 14725 }, { "epoch": 1.873298562523852, "ewc_loss": 0.06330244243144989, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030587599030695856, "grad_norm": 7.623123645782471, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8710615038871765, "num_tokens": 561645520.0, "step": 14726 }, { "epoch": 1.8734257728024426, "ewc_loss": 0.06305952370166779, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003034468099940568, "grad_norm": 7.492238998413086, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8702622056007385, "num_tokens": 561682771.0, "step": 14727 }, { "epoch": 1.873552983081033, "ewc_loss": 0.06324473023414612, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030529883224517107, "grad_norm": 7.633976459503174, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.855315089225769, "num_tokens": 561721317.0, "step": 14728 }, { "epoch": 1.8736801933596234, "ewc_loss": 0.06295765191316605, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030242808861657977, "grad_norm": 7.473920822143555, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8664687871932983, "num_tokens": 561763178.0, "step": 14729 }, { "epoch": 1.873807403638214, "ewc_loss": 0.06322818249464035, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000305133406072855, "grad_norm": 7.606091022491455, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8656564354896545, "num_tokens": 561802292.0, "step": 14730 }, { "epoch": 1.8739346139168045, "ewc_loss": 0.06290633976459503, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030191498808562756, "grad_norm": 7.4750752449035645, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8740553259849548, "num_tokens": 561838610.0, "step": 14731 }, { "epoch": 1.874061824195395, "ewc_loss": 0.06321927905082703, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030504437745548785, "grad_norm": 7.59515380859375, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8674747943878174, "num_tokens": 561881992.0, "step": 14732 }, { "epoch": 1.8741890344739855, "ewc_loss": 0.06294600665569305, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030231167329475284, "grad_norm": 7.550492763519287, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8556702733039856, "num_tokens": 561917661.0, "step": 14733 }, { "epoch": 1.8743162447525759, "ewc_loss": 0.06316011399030685, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003044527256861329, "grad_norm": 7.563870429992676, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8661134243011475, "num_tokens": 561958218.0, "step": 14734 }, { "epoch": 1.8744434550311664, "ewc_loss": 0.06299451738595963, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003027967468369752, "grad_norm": 7.532418251037598, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8767193555831909, "num_tokens": 561993882.0, "step": 14735 }, { "epoch": 1.874570665309757, "ewc_loss": 0.0631609559059143, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000304461078485474, "grad_norm": 10.475688934326172, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8650891780853271, "num_tokens": 562035443.0, "step": 14736 }, { "epoch": 1.8746978755883474, "ewc_loss": 0.06530606001615524, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00032591214403510094, "grad_norm": 7.745425701141357, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8639708757400513, "num_tokens": 562072806.0, "step": 14737 }, { "epoch": 1.874825085866938, "ewc_loss": 0.06386063992977142, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031145798857323825, "grad_norm": 7.815158367156982, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8474797010421753, "num_tokens": 562113652.0, "step": 14738 }, { "epoch": 1.8749522961455285, "ewc_loss": 0.06298288702964783, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030268047703430057, "grad_norm": 7.597469806671143, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8564874529838562, "num_tokens": 562150600.0, "step": 14739 }, { "epoch": 1.875079506424119, "ewc_loss": 0.06401719152927399, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031302348361350596, "grad_norm": 7.840211868286133, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8657876253128052, "num_tokens": 562187730.0, "step": 14740 }, { "epoch": 1.8752067167027096, "ewc_loss": 0.06301172822713852, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030296886689029634, "grad_norm": 7.607271671295166, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8655632734298706, "num_tokens": 562228229.0, "step": 14741 }, { "epoch": 1.8753339269813, "ewc_loss": 0.06341540068387985, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003070055681746453, "grad_norm": 7.723876476287842, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8709349632263184, "num_tokens": 562262293.0, "step": 14742 }, { "epoch": 1.8754611372598906, "ewc_loss": 0.0630372017621994, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003032235545106232, "grad_norm": 7.6017656326293945, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8627171516418457, "num_tokens": 562303389.0, "step": 14743 }, { "epoch": 1.8755883475384811, "ewc_loss": 0.06330670416355133, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030591862741857767, "grad_norm": 7.681075096130371, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8574952483177185, "num_tokens": 562344461.0, "step": 14744 }, { "epoch": 1.8757155578170717, "ewc_loss": 0.06288523972034454, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030170398531481624, "grad_norm": 7.559950828552246, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8484013080596924, "num_tokens": 562387068.0, "step": 14745 }, { "epoch": 1.8758427680956622, "ewc_loss": 0.06316825747489929, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030453415820375085, "grad_norm": 7.627873420715332, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8616312742233276, "num_tokens": 562428225.0, "step": 14746 }, { "epoch": 1.8759699783742527, "ewc_loss": 0.06266991049051285, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030199208413250744, "grad_norm": 7.620205879211426, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8646666407585144, "num_tokens": 562463879.0, "step": 14747 }, { "epoch": 1.8760971886528433, "ewc_loss": 0.06308271735906601, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030367873841896653, "grad_norm": 7.626137733459473, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8573466539382935, "num_tokens": 562502327.0, "step": 14748 }, { "epoch": 1.8762243989314338, "ewc_loss": 0.0634724423289299, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030269319540821016, "grad_norm": 13.778800964355469, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8680544495582581, "num_tokens": 562542162.0, "step": 14749 }, { "epoch": 1.8763516092100243, "ewc_loss": 0.07166068255901337, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00038945834967307746, "grad_norm": 8.61208724975586, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8591787219047546, "num_tokens": 562583364.0, "step": 14750 }, { "epoch": 1.8764788194886148, "ewc_loss": 0.0621083602309227, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000293935154331848, "grad_norm": 7.526800632476807, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8668519854545593, "num_tokens": 562615870.0, "step": 14751 }, { "epoch": 1.8766060297672051, "ewc_loss": 0.06442181766033173, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003170697600580752, "grad_norm": 7.982861518859863, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8499726057052612, "num_tokens": 562655086.0, "step": 14752 }, { "epoch": 1.8767332400457957, "ewc_loss": 0.0637567862868309, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003104194183833897, "grad_norm": 7.5748772621154785, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8575537204742432, "num_tokens": 562697532.0, "step": 14753 }, { "epoch": 1.8768604503243862, "ewc_loss": 0.06395292282104492, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031238075462169945, "grad_norm": 7.843721389770508, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.844136118888855, "num_tokens": 562731057.0, "step": 14754 }, { "epoch": 1.8769876606029767, "ewc_loss": 0.06328441947698593, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030569577938877046, "grad_norm": 7.5617828369140625, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8533003330230713, "num_tokens": 562770402.0, "step": 14755 }, { "epoch": 1.8771148708815673, "ewc_loss": 0.06389886885881424, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031184026738628745, "grad_norm": 7.797674179077148, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8601133823394775, "num_tokens": 562806191.0, "step": 14756 }, { "epoch": 1.8772420811601578, "ewc_loss": 0.06316221505403519, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030447373865172267, "grad_norm": 7.478048801422119, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8637194633483887, "num_tokens": 562846864.0, "step": 14757 }, { "epoch": 1.877369291438748, "ewc_loss": 0.06381316483020782, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031098321778699756, "grad_norm": 7.73261833190918, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8587837219238281, "num_tokens": 562882887.0, "step": 14758 }, { "epoch": 1.8774965017173386, "ewc_loss": 0.06310717761516571, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030392338521778584, "grad_norm": 7.518828392028809, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8632243871688843, "num_tokens": 562919096.0, "step": 14759 }, { "epoch": 1.8776237119959291, "ewc_loss": 0.06367072463035583, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003095588181167841, "grad_norm": 7.738161087036133, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8374766111373901, "num_tokens": 562959336.0, "step": 14760 }, { "epoch": 1.8777509222745197, "ewc_loss": 0.0631573349237442, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030442490242421627, "grad_norm": 7.52323055267334, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8638734817504883, "num_tokens": 562997883.0, "step": 14761 }, { "epoch": 1.8778781325531102, "ewc_loss": 0.06398071348667145, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030777588835917413, "grad_norm": 7.680695533752441, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8740887641906738, "num_tokens": 563036763.0, "step": 14762 }, { "epoch": 1.8780053428317007, "ewc_loss": 0.06307416409254074, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003035932022612542, "grad_norm": 7.569581985473633, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8805167078971863, "num_tokens": 563076815.0, "step": 14763 }, { "epoch": 1.8781325531102913, "ewc_loss": 0.06334967166185379, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030634825816378, "grad_norm": 7.656273365020752, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8743257522583008, "num_tokens": 563110511.0, "step": 14764 }, { "epoch": 1.8782597633888818, "ewc_loss": 0.06300583481788635, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030290993163362145, "grad_norm": 7.5669403076171875, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8590681552886963, "num_tokens": 563140757.0, "step": 14765 }, { "epoch": 1.8783869736674723, "ewc_loss": 0.06320863962173462, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030493797385133803, "grad_norm": 7.577915668487549, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8631888031959534, "num_tokens": 563179961.0, "step": 14766 }, { "epoch": 1.8785141839460628, "ewc_loss": 0.06313220411539078, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003041735908482224, "grad_norm": 7.488263130187988, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8702074885368347, "num_tokens": 563221355.0, "step": 14767 }, { "epoch": 1.8786413942246534, "ewc_loss": 0.06318534910678864, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003047050558961928, "grad_norm": 7.622679233551025, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.885556697845459, "num_tokens": 563257427.0, "step": 14768 }, { "epoch": 1.878768604503244, "ewc_loss": 0.0630415752530098, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003032672975677997, "grad_norm": 7.529740333557129, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8616143465042114, "num_tokens": 563294371.0, "step": 14769 }, { "epoch": 1.8788958147818344, "ewc_loss": 0.06328389793634415, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030569054069928825, "grad_norm": 7.570119857788086, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8739062547683716, "num_tokens": 563338592.0, "step": 14770 }, { "epoch": 1.879023025060425, "ewc_loss": 0.06312378495931625, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003040894225705415, "grad_norm": 7.5841965675354, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8689996004104614, "num_tokens": 563373853.0, "step": 14771 }, { "epoch": 1.8791502353390155, "ewc_loss": 0.0630594789981842, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030344631522893906, "grad_norm": 7.554274559020996, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8631926774978638, "num_tokens": 563411137.0, "step": 14772 }, { "epoch": 1.879277445617606, "ewc_loss": 0.06313867121934891, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030423825955949724, "grad_norm": 7.573182582855225, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8691399097442627, "num_tokens": 563450294.0, "step": 14773 }, { "epoch": 1.8794046558961965, "ewc_loss": 0.0630672350525856, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030352393514476717, "grad_norm": 7.578954219818115, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8657090663909912, "num_tokens": 563496858.0, "step": 14774 }, { "epoch": 1.879531866174787, "ewc_loss": 0.06307484954595566, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000303600070765242, "grad_norm": 7.516298770904541, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.859969437122345, "num_tokens": 563539566.0, "step": 14775 }, { "epoch": 1.8796590764533776, "ewc_loss": 0.06309927999973297, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003038443683180958, "grad_norm": 7.593449115753174, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8713638186454773, "num_tokens": 563574363.0, "step": 14776 }, { "epoch": 1.879786286731968, "ewc_loss": 0.06296698749065399, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030252148280851543, "grad_norm": 7.593460559844971, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.866166353225708, "num_tokens": 563605540.0, "step": 14777 }, { "epoch": 1.8799134970105584, "ewc_loss": 0.06306134164333344, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030346494168043137, "grad_norm": 7.611286163330078, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.85004723072052, "num_tokens": 563646640.0, "step": 14778 }, { "epoch": 1.880040707289149, "ewc_loss": 0.06281247735023499, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003034177061636001, "grad_norm": 7.556765079498291, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8743807673454285, "num_tokens": 563683576.0, "step": 14779 }, { "epoch": 1.8801679175677395, "ewc_loss": 0.06304044276475906, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003032560052815825, "grad_norm": 7.6469407081604, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8563595414161682, "num_tokens": 563724088.0, "step": 14780 }, { "epoch": 1.88029512784633, "ewc_loss": 0.06290607899427414, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000301912339637056, "grad_norm": 7.525574207305908, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8632277250289917, "num_tokens": 563760897.0, "step": 14781 }, { "epoch": 1.8804223381249205, "ewc_loss": 0.06282281875610352, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030352111207321286, "grad_norm": 7.555389881134033, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8692553043365479, "num_tokens": 563800158.0, "step": 14782 }, { "epoch": 1.8805495484035109, "ewc_loss": 0.0629500225186348, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003023517783731222, "grad_norm": 7.545169830322266, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8712456822395325, "num_tokens": 563833288.0, "step": 14783 }, { "epoch": 1.8806767586821014, "ewc_loss": 0.06312379986047745, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030408953898586333, "grad_norm": 7.56055212020874, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8759843111038208, "num_tokens": 563874507.0, "step": 14784 }, { "epoch": 1.880803968960692, "ewc_loss": 0.06301809847354889, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003030325169675052, "grad_norm": 7.559978008270264, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8645800948143005, "num_tokens": 563912452.0, "step": 14785 }, { "epoch": 1.8809311792392824, "ewc_loss": 0.0630054697394371, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030290623544715345, "grad_norm": 7.583137512207031, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8472249507904053, "num_tokens": 563950565.0, "step": 14786 }, { "epoch": 1.881058389517873, "ewc_loss": 0.06308240443468094, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030367562430910766, "grad_norm": 7.595674514770508, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8552316427230835, "num_tokens": 563984924.0, "step": 14787 }, { "epoch": 1.8811855997964635, "ewc_loss": 0.06308706849813461, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030372224864549935, "grad_norm": 7.556667327880859, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8718781471252441, "num_tokens": 564022887.0, "step": 14788 }, { "epoch": 1.881312810075054, "ewc_loss": 0.06306687742471695, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000303520355373621, "grad_norm": 7.555619239807129, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8635389804840088, "num_tokens": 564064829.0, "step": 14789 }, { "epoch": 1.8814400203536445, "ewc_loss": 0.06308364868164062, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030368802254088223, "grad_norm": 7.6313605308532715, "learning_rate": 1e-06, "loss": 0.5379, "mean_token_accuracy": 0.8430742621421814, "num_tokens": 564100858.0, "step": 14790 }, { "epoch": 1.881567230632235, "ewc_loss": 0.06293955445289612, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003022471209987998, "grad_norm": 7.497650146484375, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8568979501724243, "num_tokens": 564141477.0, "step": 14791 }, { "epoch": 1.8816944409108256, "ewc_loss": 0.06319794058799744, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030483092996291816, "grad_norm": 7.629188537597656, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.857504665851593, "num_tokens": 564179271.0, "step": 14792 }, { "epoch": 1.8818216511894161, "ewc_loss": 0.06285561621189117, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030140767921693623, "grad_norm": 7.464119911193848, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8703827857971191, "num_tokens": 564224336.0, "step": 14793 }, { "epoch": 1.8819488614680067, "ewc_loss": 0.06337160617113113, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003065676137339324, "grad_norm": 7.677637577056885, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8592628240585327, "num_tokens": 564265157.0, "step": 14794 }, { "epoch": 1.8820760717465972, "ewc_loss": 0.06282274425029755, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003010790387634188, "grad_norm": 7.656428813934326, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.869779646396637, "num_tokens": 564295023.0, "step": 14795 }, { "epoch": 1.8822032820251877, "ewc_loss": 0.06303916126489639, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003032431995961815, "grad_norm": 7.547647953033447, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8669551610946655, "num_tokens": 564333700.0, "step": 14796 }, { "epoch": 1.8823304923037782, "ewc_loss": 0.06293904781341553, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003022419987246394, "grad_norm": 7.740611553192139, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8500306606292725, "num_tokens": 564374342.0, "step": 14797 }, { "epoch": 1.8824577025823688, "ewc_loss": 0.06289278715848923, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003017794224433601, "grad_norm": 7.727572917938232, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8653198480606079, "num_tokens": 564422877.0, "step": 14798 }, { "epoch": 1.8825849128609593, "ewc_loss": 0.06274668127298355, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003003183810506016, "grad_norm": 7.429357528686523, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8645858764648438, "num_tokens": 564464713.0, "step": 14799 }, { "epoch": 1.8827121231395498, "ewc_loss": 0.06302929669618607, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003031445376109332, "grad_norm": 7.710538864135742, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8535051345825195, "num_tokens": 564500667.0, "step": 14800 }, { "epoch": 1.8828393334181401, "ewc_loss": 0.06254757940769196, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029832732980139554, "grad_norm": 7.40178918838501, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.873704195022583, "num_tokens": 564541460.0, "step": 14801 }, { "epoch": 1.8829665436967307, "ewc_loss": 0.06329597532749176, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030581135069951415, "grad_norm": 7.858546257019043, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.855728805065155, "num_tokens": 564580983.0, "step": 14802 }, { "epoch": 1.8830937539753212, "ewc_loss": 0.06252036988735199, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000298055267194286, "grad_norm": 7.406018257141113, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8743342161178589, "num_tokens": 564615967.0, "step": 14803 }, { "epoch": 1.8832209642539117, "ewc_loss": 0.06346593797206879, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030751098529435694, "grad_norm": 7.9201765060424805, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8546106219291687, "num_tokens": 564653255.0, "step": 14804 }, { "epoch": 1.8833481745325023, "ewc_loss": 0.06256385147571564, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029849010752514005, "grad_norm": 7.417938709259033, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8610226511955261, "num_tokens": 564688420.0, "step": 14805 }, { "epoch": 1.8834753848110928, "ewc_loss": 0.06358454376459122, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003086969954892993, "grad_norm": 8.011293411254883, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8569366931915283, "num_tokens": 564733123.0, "step": 14806 }, { "epoch": 1.883602595089683, "ewc_loss": 0.06262568384408951, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00029910841840319335, "grad_norm": 7.395492076873779, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8715692758560181, "num_tokens": 564767712.0, "step": 14807 }, { "epoch": 1.8837298053682736, "ewc_loss": 0.06373114883899689, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031016304274089634, "grad_norm": 7.971797466278076, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8607372641563416, "num_tokens": 564810663.0, "step": 14808 }, { "epoch": 1.8838570156468641, "ewc_loss": 0.06268727779388428, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0002997243427671492, "grad_norm": 7.459907531738281, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8618434071540833, "num_tokens": 564848689.0, "step": 14809 }, { "epoch": 1.8839842259254547, "ewc_loss": 0.06364188343286514, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003092703700531274, "grad_norm": 7.782105445861816, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8517460227012634, "num_tokens": 564887426.0, "step": 14810 }, { "epoch": 1.8841114362040452, "ewc_loss": 0.06294865161180496, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003023380704689771, "grad_norm": 7.643744468688965, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8681591749191284, "num_tokens": 564924852.0, "step": 14811 }, { "epoch": 1.8842386464826357, "ewc_loss": 0.06305739283561707, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030586685170419514, "grad_norm": 7.621857166290283, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8611088395118713, "num_tokens": 564964201.0, "step": 14812 }, { "epoch": 1.8843658567612263, "ewc_loss": 0.0630817636847496, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003036691923625767, "grad_norm": 7.60536003112793, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.875457227230072, "num_tokens": 565004939.0, "step": 14813 }, { "epoch": 1.8844930670398168, "ewc_loss": 0.06305377185344696, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003033893008250743, "grad_norm": 7.590208530426025, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8682290315628052, "num_tokens": 565044067.0, "step": 14814 }, { "epoch": 1.8846202773184073, "ewc_loss": 0.0631614401936531, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003044659679289907, "grad_norm": 7.632996082305908, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8669723272323608, "num_tokens": 565082114.0, "step": 14815 }, { "epoch": 1.8847474875969978, "ewc_loss": 0.06300710886716843, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030292265000753105, "grad_norm": 7.531383037567139, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8650410175323486, "num_tokens": 565118350.0, "step": 14816 }, { "epoch": 1.8848746978755884, "ewc_loss": 0.0632689893245697, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003055414999835193, "grad_norm": 7.656545639038086, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.857766330242157, "num_tokens": 565153313.0, "step": 14817 }, { "epoch": 1.885001908154179, "ewc_loss": 0.06304077804088593, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030325938132591546, "grad_norm": 7.612706661224365, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8519396781921387, "num_tokens": 565191332.0, "step": 14818 }, { "epoch": 1.8851291184327694, "ewc_loss": 0.06317839026451111, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030463546863757074, "grad_norm": 7.584394454956055, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8607709407806396, "num_tokens": 565229937.0, "step": 14819 }, { "epoch": 1.88525632871136, "ewc_loss": 0.06308643519878387, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003037158749066293, "grad_norm": 7.5691423416137695, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8588278293609619, "num_tokens": 565266271.0, "step": 14820 }, { "epoch": 1.8853835389899505, "ewc_loss": 0.06307680904865265, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030361965764313936, "grad_norm": 7.611095428466797, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8618346452713013, "num_tokens": 565300593.0, "step": 14821 }, { "epoch": 1.885510749268541, "ewc_loss": 0.063153937458992, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030439093825407326, "grad_norm": 7.559719562530518, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8626744747161865, "num_tokens": 565341801.0, "step": 14822 }, { "epoch": 1.8856379595471315, "ewc_loss": 0.06307004392147064, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003035520203411579, "grad_norm": 7.5863542556762695, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8818298578262329, "num_tokens": 565374380.0, "step": 14823 }, { "epoch": 1.885765169825722, "ewc_loss": 0.06310024857521057, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000303854001685977, "grad_norm": 7.603975772857666, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8568201065063477, "num_tokens": 565413096.0, "step": 14824 }, { "epoch": 1.8858923801043126, "ewc_loss": 0.06302730739116669, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003031246014870703, "grad_norm": 7.616997718811035, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8732577562332153, "num_tokens": 565453292.0, "step": 14825 }, { "epoch": 1.886019590382903, "ewc_loss": 0.0630495548248291, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030334710027091205, "grad_norm": 7.594038963317871, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.85719895362854, "num_tokens": 565489311.0, "step": 14826 }, { "epoch": 1.8861468006614934, "ewc_loss": 0.06299088150262833, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030276039615273476, "grad_norm": 7.55947732925415, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.847611665725708, "num_tokens": 565526293.0, "step": 14827 }, { "epoch": 1.886274010940084, "ewc_loss": 0.0631425604224205, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030427714227698743, "grad_norm": 7.612732410430908, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8564165234565735, "num_tokens": 565569215.0, "step": 14828 }, { "epoch": 1.8864012212186745, "ewc_loss": 0.06301087141036987, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030296031036414206, "grad_norm": 7.485957145690918, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8748400211334229, "num_tokens": 565610755.0, "step": 14829 }, { "epoch": 1.886528431497265, "ewc_loss": 0.0632704496383667, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003055560227949172, "grad_norm": 7.753060340881348, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8686127662658691, "num_tokens": 565641964.0, "step": 14830 }, { "epoch": 1.8866556417758555, "ewc_loss": 0.0630115419626236, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030296697514131665, "grad_norm": 7.607851028442383, "learning_rate": 1e-06, "loss": 0.5648, "mean_token_accuracy": 0.8291802406311035, "num_tokens": 565682852.0, "step": 14831 }, { "epoch": 1.8867828520544458, "ewc_loss": 0.06313827633857727, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003042343305423856, "grad_norm": 7.6305060386657715, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.87940514087677, "num_tokens": 565721352.0, "step": 14832 }, { "epoch": 1.8869100623330364, "ewc_loss": 0.06303992867469788, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003032508247997612, "grad_norm": 7.698364734649658, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8691929578781128, "num_tokens": 565764045.0, "step": 14833 }, { "epoch": 1.887037272611627, "ewc_loss": 0.06296759843826294, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030252750730142, "grad_norm": 7.586193561553955, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8723322153091431, "num_tokens": 565797617.0, "step": 14834 }, { "epoch": 1.8871644828902174, "ewc_loss": 0.06296989321708679, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003025504993274808, "grad_norm": 7.56642484664917, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8623824715614319, "num_tokens": 565839254.0, "step": 14835 }, { "epoch": 1.887291693168808, "ewc_loss": 0.06302490830421448, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000303100619930774, "grad_norm": 7.652859210968018, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.864565908908844, "num_tokens": 565874742.0, "step": 14836 }, { "epoch": 1.8874189034473985, "ewc_loss": 0.0628684014081955, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030153561965562403, "grad_norm": 7.590147972106934, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8645578622817993, "num_tokens": 565911043.0, "step": 14837 }, { "epoch": 1.887546113725989, "ewc_loss": 0.06303136050701141, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030316514312289655, "grad_norm": 7.626747131347656, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8593069314956665, "num_tokens": 565950228.0, "step": 14838 }, { "epoch": 1.8876733240045795, "ewc_loss": 0.06294304132461548, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003022820164915174, "grad_norm": 7.648810863494873, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8756920695304871, "num_tokens": 565982891.0, "step": 14839 }, { "epoch": 1.88780053428317, "ewc_loss": 0.06285233050584793, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030137485009618104, "grad_norm": 7.556739807128906, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8622408509254456, "num_tokens": 566022601.0, "step": 14840 }, { "epoch": 1.8879277445617606, "ewc_loss": 0.06303934752941132, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030324506224133074, "grad_norm": 7.617753982543945, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8658614158630371, "num_tokens": 566054928.0, "step": 14841 }, { "epoch": 1.8880549548403511, "ewc_loss": 0.0629638135433197, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003024897014256567, "grad_norm": 7.580982685089111, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8568179607391357, "num_tokens": 566094900.0, "step": 14842 }, { "epoch": 1.8881821651189417, "ewc_loss": 0.06304381042718887, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030328964930959046, "grad_norm": 7.660399436950684, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8617370128631592, "num_tokens": 566125230.0, "step": 14843 }, { "epoch": 1.8883093753975322, "ewc_loss": 0.06292591989040375, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030211076955311, "grad_norm": 7.532156467437744, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8788104057312012, "num_tokens": 566165320.0, "step": 14844 }, { "epoch": 1.8884365856761227, "ewc_loss": 0.06314291805028915, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003042807220481336, "grad_norm": 7.610696792602539, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8538936972618103, "num_tokens": 566202769.0, "step": 14845 }, { "epoch": 1.8885637959547132, "ewc_loss": 0.06300510466098785, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030290265567600727, "grad_norm": 7.547566890716553, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8674823045730591, "num_tokens": 566245874.0, "step": 14846 }, { "epoch": 1.8886910062333038, "ewc_loss": 0.06313279271125793, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003041794989258051, "grad_norm": 7.5604939460754395, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8446835279464722, "num_tokens": 566284855.0, "step": 14847 }, { "epoch": 1.8888182165118943, "ewc_loss": 0.06304998695850372, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030335140763781965, "grad_norm": 7.565618515014648, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8556108474731445, "num_tokens": 566323422.0, "step": 14848 }, { "epoch": 1.8889454267904848, "ewc_loss": 0.06316506862640381, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003045022313017398, "grad_norm": 7.625936985015869, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8724222779273987, "num_tokens": 566364357.0, "step": 14849 }, { "epoch": 1.8890726370690751, "ewc_loss": 0.06354105472564697, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003033792891073972, "grad_norm": 7.544126033782959, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8603379726409912, "num_tokens": 566404498.0, "step": 14850 }, { "epoch": 1.8891998473476657, "ewc_loss": 0.06323555111885071, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030520709697157145, "grad_norm": 7.605156898498535, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8595677614212036, "num_tokens": 566442960.0, "step": 14851 }, { "epoch": 1.8893270576262562, "ewc_loss": 0.06307297199964523, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003035812987945974, "grad_norm": 7.674408435821533, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8733971118927002, "num_tokens": 566475087.0, "step": 14852 }, { "epoch": 1.8894542679048467, "ewc_loss": 0.06296379864215851, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030248958501033485, "grad_norm": 7.550158977508545, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8545286059379578, "num_tokens": 566514337.0, "step": 14853 }, { "epoch": 1.8895814781834372, "ewc_loss": 0.06317486613988876, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030460022389888763, "grad_norm": 7.611661911010742, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8524070382118225, "num_tokens": 566553204.0, "step": 14854 }, { "epoch": 1.8897086884620278, "ewc_loss": 0.06299294531345367, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030278105987235904, "grad_norm": 7.624414443969727, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8612139225006104, "num_tokens": 566589697.0, "step": 14855 }, { "epoch": 1.889835898740618, "ewc_loss": 0.06310293078422546, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000303880893625319, "grad_norm": 7.576590061187744, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8507664203643799, "num_tokens": 566629680.0, "step": 14856 }, { "epoch": 1.8899631090192086, "ewc_loss": 0.06309095025062561, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003037611022591591, "grad_norm": 7.568513870239258, "learning_rate": 1e-06, "loss": 0.3846, "mean_token_accuracy": 0.886187732219696, "num_tokens": 566670931.0, "step": 14857 }, { "epoch": 1.8900903192977991, "ewc_loss": 0.06307126581668854, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030356418574228883, "grad_norm": 7.6054606437683105, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8578183054924011, "num_tokens": 566706673.0, "step": 14858 }, { "epoch": 1.8902175295763897, "ewc_loss": 0.06302673369646072, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030311892624013126, "grad_norm": 7.558812618255615, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8594064116477966, "num_tokens": 566749503.0, "step": 14859 }, { "epoch": 1.8903447398549802, "ewc_loss": 0.06304613500833511, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030331293237395585, "grad_norm": 7.539555549621582, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8777939081192017, "num_tokens": 566790700.0, "step": 14860 }, { "epoch": 1.8904719501335707, "ewc_loss": 0.06318138539791107, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030466538737528026, "grad_norm": 7.5255937576293945, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8636579513549805, "num_tokens": 566832995.0, "step": 14861 }, { "epoch": 1.8905991604121613, "ewc_loss": 0.063094861805439, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000303800159599632, "grad_norm": 7.640329837799072, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8514142036437988, "num_tokens": 566868190.0, "step": 14862 }, { "epoch": 1.8907263706907518, "ewc_loss": 0.0631018579006195, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030387012520805, "grad_norm": 7.5409111976623535, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8351759910583496, "num_tokens": 566906637.0, "step": 14863 }, { "epoch": 1.8908535809693423, "ewc_loss": 0.06325382739305496, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003053898399230093, "grad_norm": 7.603172779083252, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8642573952674866, "num_tokens": 566946006.0, "step": 14864 }, { "epoch": 1.8909807912479328, "ewc_loss": 0.06291588395833969, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030201039044186473, "grad_norm": 7.505768299102783, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8515562415122986, "num_tokens": 566987726.0, "step": 14865 }, { "epoch": 1.8911080015265234, "ewc_loss": 0.06333105266094208, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003061620518565178, "grad_norm": 7.553109645843506, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8787996768951416, "num_tokens": 567027432.0, "step": 14866 }, { "epoch": 1.891235211805114, "ewc_loss": 0.06312508136034012, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030410237377509475, "grad_norm": 7.611617088317871, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8690383434295654, "num_tokens": 567060607.0, "step": 14867 }, { "epoch": 1.8913624220837044, "ewc_loss": 0.06306889653205872, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030354055343195796, "grad_norm": 7.538060665130615, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8569642305374146, "num_tokens": 567100691.0, "step": 14868 }, { "epoch": 1.891489632362295, "ewc_loss": 0.06322656571865082, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003051171952392906, "grad_norm": 7.534899711608887, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8815935254096985, "num_tokens": 567139460.0, "step": 14869 }, { "epoch": 1.8916168426408855, "ewc_loss": 0.06311887502670288, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000304040324408561, "grad_norm": 7.550846099853516, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.851786732673645, "num_tokens": 567185384.0, "step": 14870 }, { "epoch": 1.891744052919476, "ewc_loss": 0.06322996318340302, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003051511594094336, "grad_norm": 7.6282429695129395, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.861242949962616, "num_tokens": 567219898.0, "step": 14871 }, { "epoch": 1.8918712631980665, "ewc_loss": 0.06311202049255371, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003039717848878354, "grad_norm": 7.474332332611084, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8708140254020691, "num_tokens": 567260322.0, "step": 14872 }, { "epoch": 1.891998473476657, "ewc_loss": 0.0633014589548111, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030586618231609464, "grad_norm": 7.683349132537842, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8595537543296814, "num_tokens": 567293739.0, "step": 14873 }, { "epoch": 1.8921256837552476, "ewc_loss": 0.0630856454372406, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030370804597623646, "grad_norm": 7.526214599609375, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8659998774528503, "num_tokens": 567327282.0, "step": 14874 }, { "epoch": 1.892252894033838, "ewc_loss": 0.06339508295059204, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003068024234380573, "grad_norm": 7.674656867980957, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8575268983840942, "num_tokens": 567366402.0, "step": 14875 }, { "epoch": 1.8923801043124284, "ewc_loss": 0.06302966177463531, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030314820469357073, "grad_norm": 7.458306312561035, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8559359908103943, "num_tokens": 567409924.0, "step": 14876 }, { "epoch": 1.892507314591019, "ewc_loss": 0.06326967477798462, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003079896850977093, "grad_norm": 7.68695068359375, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8678364753723145, "num_tokens": 567450355.0, "step": 14877 }, { "epoch": 1.8926345248696095, "ewc_loss": 0.06274856626987457, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030277864425443113, "grad_norm": 7.436866760253906, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8769581913948059, "num_tokens": 567487701.0, "step": 14878 }, { "epoch": 1.8927617351482, "ewc_loss": 0.06368762254714966, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003097277949564159, "grad_norm": 7.756579875946045, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8638765811920166, "num_tokens": 567527619.0, "step": 14879 }, { "epoch": 1.8928889454267905, "ewc_loss": 0.06295693665742874, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003024209290742874, "grad_norm": 7.47320032119751, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8637200593948364, "num_tokens": 567567016.0, "step": 14880 }, { "epoch": 1.8930161557053808, "ewc_loss": 0.06369638442993164, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003098154265899211, "grad_norm": 7.7010064125061035, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8515549302101135, "num_tokens": 567605465.0, "step": 14881 }, { "epoch": 1.8931433659839714, "ewc_loss": 0.0628538727760315, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030383167904801667, "grad_norm": 7.5060811042785645, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8650038242340088, "num_tokens": 567646680.0, "step": 14882 }, { "epoch": 1.893270576262562, "ewc_loss": 0.06362191587686539, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030907074688002467, "grad_norm": 7.691189289093018, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8649433851242065, "num_tokens": 567685685.0, "step": 14883 }, { "epoch": 1.8933977865411524, "ewc_loss": 0.06322552263736725, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000305106834275648, "grad_norm": 7.559178829193115, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8523946404457092, "num_tokens": 567717222.0, "step": 14884 }, { "epoch": 1.893524996819743, "ewc_loss": 0.06350667774677277, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003079183807130903, "grad_norm": 7.618844985961914, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8533949255943298, "num_tokens": 567761050.0, "step": 14885 }, { "epoch": 1.8936522070983335, "ewc_loss": 0.06331469118595123, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030599843012169003, "grad_norm": 7.569797515869141, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8605707287788391, "num_tokens": 567801423.0, "step": 14886 }, { "epoch": 1.893779417376924, "ewc_loss": 0.06346821039915085, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003075336862821132, "grad_norm": 7.601627349853516, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8538841605186462, "num_tokens": 567844614.0, "step": 14887 }, { "epoch": 1.8939066276555145, "ewc_loss": 0.06332769244909286, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003061284660361707, "grad_norm": 7.6094136238098145, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8666011095046997, "num_tokens": 567878803.0, "step": 14888 }, { "epoch": 1.894033837934105, "ewc_loss": 0.0633457601070404, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030630920082330704, "grad_norm": 7.632485389709473, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8761025667190552, "num_tokens": 567909419.0, "step": 14889 }, { "epoch": 1.8941610482126956, "ewc_loss": 0.06334146857261658, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030626627267338336, "grad_norm": 7.571213245391846, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8640427589416504, "num_tokens": 567949221.0, "step": 14890 }, { "epoch": 1.8942882584912861, "ewc_loss": 0.0634380653500557, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030723222880624235, "grad_norm": 7.593797206878662, "learning_rate": 1e-06, "loss": 0.4059, "mean_token_accuracy": 0.8798555731773376, "num_tokens": 567985101.0, "step": 14891 }, { "epoch": 1.8944154687698767, "ewc_loss": 0.06336505711078644, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030650210101157427, "grad_norm": 7.606330871582031, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8478758335113525, "num_tokens": 568030542.0, "step": 14892 }, { "epoch": 1.8945426790484672, "ewc_loss": 0.06337592005729675, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003066107165068388, "grad_norm": 7.552354335784912, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8656149506568909, "num_tokens": 568070723.0, "step": 14893 }, { "epoch": 1.8946698893270577, "ewc_loss": 0.06345152854919434, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030736689222976565, "grad_norm": 7.586377143859863, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8568664789199829, "num_tokens": 568113444.0, "step": 14894 }, { "epoch": 1.8947970996056482, "ewc_loss": 0.06331999599933624, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003060515446122736, "grad_norm": 7.546115875244141, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8663647174835205, "num_tokens": 568152646.0, "step": 14895 }, { "epoch": 1.8949243098842388, "ewc_loss": 0.06332112848758698, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003085042699240148, "grad_norm": 7.668376445770264, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8633962869644165, "num_tokens": 568187046.0, "step": 14896 }, { "epoch": 1.8950515201628293, "ewc_loss": 0.06305908411741257, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003058838192373514, "grad_norm": 7.571637153625488, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8587071895599365, "num_tokens": 568225052.0, "step": 14897 }, { "epoch": 1.8951787304414198, "ewc_loss": 0.06323591619729996, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030765210976824164, "grad_norm": 7.61978006362915, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8558335304260254, "num_tokens": 568258299.0, "step": 14898 }, { "epoch": 1.8953059407200101, "ewc_loss": 0.06313058733940125, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000306598813040182, "grad_norm": 7.591409683227539, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8704743385314941, "num_tokens": 568297137.0, "step": 14899 }, { "epoch": 1.8954331509986007, "ewc_loss": 0.0634840577840805, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030769218574278057, "grad_norm": 7.597678184509277, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.8845901489257812, "num_tokens": 568331102.0, "step": 14900 }, { "epoch": 1.8955603612771912, "ewc_loss": 0.06331706792116165, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003060222661588341, "grad_norm": 7.621976375579834, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8605331182479858, "num_tokens": 568373926.0, "step": 14901 }, { "epoch": 1.8956875715557817, "ewc_loss": 0.06311134994029999, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003064064949285239, "grad_norm": 7.658629417419434, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8733004927635193, "num_tokens": 568406470.0, "step": 14902 }, { "epoch": 1.8958147818343722, "ewc_loss": 0.06300188601016998, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003053118707612157, "grad_norm": 7.564144611358643, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.871491014957428, "num_tokens": 568443452.0, "step": 14903 }, { "epoch": 1.8959419921129628, "ewc_loss": 0.06311995536088943, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.000306492525851354, "grad_norm": 7.64849328994751, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8575701713562012, "num_tokens": 568480085.0, "step": 14904 }, { "epoch": 1.896069202391553, "ewc_loss": 0.06320004910230637, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030485205934382975, "grad_norm": 7.5975165367126465, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8608726263046265, "num_tokens": 568519191.0, "step": 14905 }, { "epoch": 1.8961964126701436, "ewc_loss": 0.06336455047130585, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003064970369450748, "grad_norm": 7.665474891662598, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8696485161781311, "num_tokens": 568554224.0, "step": 14906 }, { "epoch": 1.8963236229487341, "ewc_loss": 0.06318724900484085, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003047240315936506, "grad_norm": 7.5563459396362305, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.880661129951477, "num_tokens": 568586102.0, "step": 14907 }, { "epoch": 1.8964508332273247, "ewc_loss": 0.0633038580417633, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030589010566473007, "grad_norm": 7.61912727355957, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.858795166015625, "num_tokens": 568625224.0, "step": 14908 }, { "epoch": 1.8965780435059152, "ewc_loss": 0.06325501203536987, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030540168518200517, "grad_norm": 7.59050989151001, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8433389663696289, "num_tokens": 568665588.0, "step": 14909 }, { "epoch": 1.8967052537845057, "ewc_loss": 0.06326758116483688, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030552735552191734, "grad_norm": 7.578638553619385, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8803318738937378, "num_tokens": 568698408.0, "step": 14910 }, { "epoch": 1.8968324640630962, "ewc_loss": 0.06336452066898346, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030649674590677023, "grad_norm": 7.65048885345459, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8514827489852905, "num_tokens": 568731982.0, "step": 14911 }, { "epoch": 1.8969596743416868, "ewc_loss": 0.06314155459403992, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003042671305593103, "grad_norm": 7.556345462799072, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8633304238319397, "num_tokens": 568769388.0, "step": 14912 }, { "epoch": 1.8970868846202773, "ewc_loss": 0.06330805271863937, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003059320733882487, "grad_norm": 7.620725154876709, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8705973625183105, "num_tokens": 568810329.0, "step": 14913 }, { "epoch": 1.8972140948988678, "ewc_loss": 0.06315085291862488, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030436005908995867, "grad_norm": 7.52076530456543, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8677363991737366, "num_tokens": 568852241.0, "step": 14914 }, { "epoch": 1.8973413051774584, "ewc_loss": 0.06331343948841095, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000305986002786085, "grad_norm": 7.614784240722656, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8588789701461792, "num_tokens": 568889660.0, "step": 14915 }, { "epoch": 1.8974685154560489, "ewc_loss": 0.06311381608247757, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003039897419512272, "grad_norm": 7.472978115081787, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8607693910598755, "num_tokens": 568934489.0, "step": 14916 }, { "epoch": 1.8975957257346394, "ewc_loss": 0.0634307861328125, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030715938191860914, "grad_norm": 7.582570552825928, "learning_rate": 1e-06, "loss": 0.3872, "mean_token_accuracy": 0.8845883011817932, "num_tokens": 568971904.0, "step": 14917 }, { "epoch": 1.89772293601323, "ewc_loss": 0.06319726258516312, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030482420697808266, "grad_norm": 7.556630611419678, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8460167646408081, "num_tokens": 569011863.0, "step": 14918 }, { "epoch": 1.8978501462918205, "ewc_loss": 0.06339322775602341, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030678382609039545, "grad_norm": 7.568235397338867, "learning_rate": 1e-06, "loss": 0.4003, "mean_token_accuracy": 0.8849092721939087, "num_tokens": 569050295.0, "step": 14919 }, { "epoch": 1.897977356570411, "ewc_loss": 0.06323753297328949, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003052269166801125, "grad_norm": 7.583634853363037, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8584337830543518, "num_tokens": 569087013.0, "step": 14920 }, { "epoch": 1.8981045668490015, "ewc_loss": 0.06329938769340515, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030584546038880944, "grad_norm": 7.646535396575928, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.857781708240509, "num_tokens": 569125226.0, "step": 14921 }, { "epoch": 1.898231777127592, "ewc_loss": 0.06329639256000519, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030581545433960855, "grad_norm": 7.577906608581543, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.860877513885498, "num_tokens": 569161910.0, "step": 14922 }, { "epoch": 1.8983589874061826, "ewc_loss": 0.06327296793460846, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003055811976082623, "grad_norm": 7.584229469299316, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8663391470909119, "num_tokens": 569196519.0, "step": 14923 }, { "epoch": 1.898486197684773, "ewc_loss": 0.06325425207614899, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030539408908225596, "grad_norm": 7.6807098388671875, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8604609370231628, "num_tokens": 569234404.0, "step": 14924 }, { "epoch": 1.8986134079633634, "ewc_loss": 0.06317974627017975, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030464906012639403, "grad_norm": 7.546849250793457, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8561452627182007, "num_tokens": 569272728.0, "step": 14925 }, { "epoch": 1.898740618241954, "ewc_loss": 0.06328558921813965, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003057074500247836, "grad_norm": 7.597535133361816, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8550528287887573, "num_tokens": 569315434.0, "step": 14926 }, { "epoch": 1.8988678285205445, "ewc_loss": 0.06317906826734543, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003046422207262367, "grad_norm": 7.6500349044799805, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8477741479873657, "num_tokens": 569357473.0, "step": 14927 }, { "epoch": 1.898995038799135, "ewc_loss": 0.06326790153980255, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003055306151509285, "grad_norm": 7.515015125274658, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8514199256896973, "num_tokens": 569398747.0, "step": 14928 }, { "epoch": 1.8991222490777255, "ewc_loss": 0.06339262425899506, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003067778015974909, "grad_norm": 7.612639427185059, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8716357350349426, "num_tokens": 569434855.0, "step": 14929 }, { "epoch": 1.8992494593563158, "ewc_loss": 0.06321485340595245, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003050000814255327, "grad_norm": 7.665002822875977, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8637048006057739, "num_tokens": 569475236.0, "step": 14930 }, { "epoch": 1.8993766696349064, "ewc_loss": 0.06333678960800171, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000306219415506348, "grad_norm": 7.566551685333252, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8630152940750122, "num_tokens": 569513599.0, "step": 14931 }, { "epoch": 1.899503879913497, "ewc_loss": 0.06333622336387634, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003062138275709003, "grad_norm": 7.6902666091918945, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8717179298400879, "num_tokens": 569559059.0, "step": 14932 }, { "epoch": 1.8996310901920874, "ewc_loss": 0.0636078417301178, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030404713470488787, "grad_norm": 7.571608066558838, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8818789720535278, "num_tokens": 569595240.0, "step": 14933 }, { "epoch": 1.899758300470678, "ewc_loss": 0.06328819692134857, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003057335561607033, "grad_norm": 7.582794666290283, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8534108400344849, "num_tokens": 569629336.0, "step": 14934 }, { "epoch": 1.8998855107492685, "ewc_loss": 0.06321436166763306, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003049951628781855, "grad_norm": 7.653798580169678, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8761389255523682, "num_tokens": 569663613.0, "step": 14935 }, { "epoch": 1.900012721027859, "ewc_loss": 0.06315229833126068, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003043745527975261, "grad_norm": 7.5426716804504395, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8479626178741455, "num_tokens": 569702884.0, "step": 14936 }, { "epoch": 1.9001399313064495, "ewc_loss": 0.06342305988073349, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030708214035257697, "grad_norm": 7.598537921905518, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8787174224853516, "num_tokens": 569744618.0, "step": 14937 }, { "epoch": 1.90026714158504, "ewc_loss": 0.06313091516494751, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030416075605899096, "grad_norm": 7.586989879608154, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8724778890609741, "num_tokens": 569777083.0, "step": 14938 }, { "epoch": 1.9003943518636306, "ewc_loss": 0.06327241659164429, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000305575696984306, "grad_norm": 7.586078643798828, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8700992465019226, "num_tokens": 569809652.0, "step": 14939 }, { "epoch": 1.9005215621422211, "ewc_loss": 0.06330262869596481, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030587782384827733, "grad_norm": 7.608476638793945, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8680248260498047, "num_tokens": 569846465.0, "step": 14940 }, { "epoch": 1.9006487724208116, "ewc_loss": 0.06326665729284286, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000305518158711493, "grad_norm": 7.658648490905762, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8563737869262695, "num_tokens": 569883655.0, "step": 14941 }, { "epoch": 1.9007759826994022, "ewc_loss": 0.06315498799085617, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003044014156330377, "grad_norm": 7.603413105010986, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8629614114761353, "num_tokens": 569923358.0, "step": 14942 }, { "epoch": 1.9009031929779927, "ewc_loss": 0.06324009597301483, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030525249894708395, "grad_norm": 7.6523237228393555, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8604995012283325, "num_tokens": 569961393.0, "step": 14943 }, { "epoch": 1.9010304032565832, "ewc_loss": 0.06314603984355927, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003043119504582137, "grad_norm": 7.6468939781188965, "learning_rate": 1e-06, "loss": 0.5474, "mean_token_accuracy": 0.836945116519928, "num_tokens": 570002092.0, "step": 14944 }, { "epoch": 1.9011576135351738, "ewc_loss": 0.06313720345497131, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003042236203327775, "grad_norm": 7.601476669311523, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.848299503326416, "num_tokens": 570037699.0, "step": 14945 }, { "epoch": 1.9012848238137643, "ewc_loss": 0.06322726607322693, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030512420926243067, "grad_norm": 7.589349269866943, "learning_rate": 1e-06, "loss": 0.5314, "mean_token_accuracy": 0.8395187258720398, "num_tokens": 570077693.0, "step": 14946 }, { "epoch": 1.9014120340923548, "ewc_loss": 0.06318452954292297, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030469687771983445, "grad_norm": 7.615673542022705, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8825507164001465, "num_tokens": 570115354.0, "step": 14947 }, { "epoch": 1.9015392443709451, "ewc_loss": 0.06315696984529495, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003044212353415787, "grad_norm": 7.535262107849121, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8717738389968872, "num_tokens": 570154387.0, "step": 14948 }, { "epoch": 1.9016664546495357, "ewc_loss": 0.06316237896680832, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030447536846622825, "grad_norm": 7.632579803466797, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8584839701652527, "num_tokens": 570189743.0, "step": 14949 }, { "epoch": 1.9017936649281262, "ewc_loss": 0.06314512342214584, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030430281185545027, "grad_norm": 7.619993686676025, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8753255605697632, "num_tokens": 570224009.0, "step": 14950 }, { "epoch": 1.9019208752067167, "ewc_loss": 0.06325950473546982, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003054465923923999, "grad_norm": 7.57588529586792, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8709943890571594, "num_tokens": 570266703.0, "step": 14951 }, { "epoch": 1.9020480854853072, "ewc_loss": 0.06325600296258926, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030541158048436046, "grad_norm": 7.67210054397583, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8633416891098022, "num_tokens": 570308241.0, "step": 14952 }, { "epoch": 1.9021752957638978, "ewc_loss": 0.06301496922969818, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003030012594535947, "grad_norm": 7.561120510101318, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8630280494689941, "num_tokens": 570346931.0, "step": 14953 }, { "epoch": 1.902302506042488, "ewc_loss": 0.06326571851968765, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003055087581742555, "grad_norm": 7.609460830688477, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8775604367256165, "num_tokens": 570380373.0, "step": 14954 }, { "epoch": 1.9024297163210786, "ewc_loss": 0.06299871951341629, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030283877276815474, "grad_norm": 7.504578590393066, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8758745193481445, "num_tokens": 570420106.0, "step": 14955 }, { "epoch": 1.9025569265996691, "ewc_loss": 0.0631038099527359, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003063310286961496, "grad_norm": 7.556646823883057, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8853910565376282, "num_tokens": 570464534.0, "step": 14956 }, { "epoch": 1.9026841368782597, "ewc_loss": 0.06317466497421265, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003045982448384166, "grad_norm": 7.572268009185791, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8537224531173706, "num_tokens": 570505734.0, "step": 14957 }, { "epoch": 1.9028113471568502, "ewc_loss": 0.06299690157175064, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030526198679581285, "grad_norm": 7.576879024505615, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.873277485370636, "num_tokens": 570545913.0, "step": 14958 }, { "epoch": 1.9029385574354407, "ewc_loss": 0.06296645104885101, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003049574443139136, "grad_norm": 7.598568916320801, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8716986179351807, "num_tokens": 570584397.0, "step": 14959 }, { "epoch": 1.9030657677140312, "ewc_loss": 0.0630185604095459, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003054786066059023, "grad_norm": 7.604827880859375, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8713864088058472, "num_tokens": 570622285.0, "step": 14960 }, { "epoch": 1.9031929779926218, "ewc_loss": 0.06294557452201843, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003047487116418779, "grad_norm": 7.623030662536621, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8483829498291016, "num_tokens": 570656331.0, "step": 14961 }, { "epoch": 1.9033201882712123, "ewc_loss": 0.06287216395139694, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003040146257262677, "grad_norm": 7.5051045417785645, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8693697452545166, "num_tokens": 570699944.0, "step": 14962 }, { "epoch": 1.9034473985498028, "ewc_loss": 0.0631822943687439, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030711592989973724, "grad_norm": 7.64857292175293, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8586363792419434, "num_tokens": 570737649.0, "step": 14963 }, { "epoch": 1.9035746088283934, "ewc_loss": 0.06285151839256287, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003038081631530076, "grad_norm": 7.559874057769775, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.863921046257019, "num_tokens": 570773193.0, "step": 14964 }, { "epoch": 1.9037018191069839, "ewc_loss": 0.06309856474399567, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030627858359366655, "grad_norm": 7.61551570892334, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8577872514724731, "num_tokens": 570809117.0, "step": 14965 }, { "epoch": 1.9038290293855744, "ewc_loss": 0.06281840801239014, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030347704887390137, "grad_norm": 7.553142070770264, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8757983446121216, "num_tokens": 570844268.0, "step": 14966 }, { "epoch": 1.903956239664165, "ewc_loss": 0.06313817203044891, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030667465762235224, "grad_norm": 7.6740593910217285, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.855607271194458, "num_tokens": 570890166.0, "step": 14967 }, { "epoch": 1.9040834499427555, "ewc_loss": 0.06286446750164032, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003039376169908792, "grad_norm": 7.581714630126953, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8574584126472473, "num_tokens": 570921419.0, "step": 14968 }, { "epoch": 1.904210660221346, "ewc_loss": 0.06304329633712769, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030572593095712364, "grad_norm": 7.612893581390381, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.85541832447052, "num_tokens": 570966444.0, "step": 14969 }, { "epoch": 1.9043378704999365, "ewc_loss": 0.06289664655923843, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003042594180442393, "grad_norm": 7.5307512283325195, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8737059831619263, "num_tokens": 571011656.0, "step": 14970 }, { "epoch": 1.904465080778527, "ewc_loss": 0.06301465630531311, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003054395492654294, "grad_norm": 7.632017135620117, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8704572916030884, "num_tokens": 571047828.0, "step": 14971 }, { "epoch": 1.9045922910571176, "ewc_loss": 0.0628509595990181, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003038025461137295, "grad_norm": 7.540623188018799, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8615998029708862, "num_tokens": 571084541.0, "step": 14972 }, { "epoch": 1.9047195013357079, "ewc_loss": 0.06330030411481857, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003058546280954033, "grad_norm": 7.65224027633667, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8583437204360962, "num_tokens": 571120978.0, "step": 14973 }, { "epoch": 1.9048467116142984, "ewc_loss": 0.06316760182380676, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003045275807380676, "grad_norm": 7.584079742431641, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8769353628158569, "num_tokens": 571160181.0, "step": 14974 }, { "epoch": 1.904973921892889, "ewc_loss": 0.06322327256202698, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030508424970321357, "grad_norm": 7.627762794494629, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8689147233963013, "num_tokens": 571194648.0, "step": 14975 }, { "epoch": 1.9051011321714795, "ewc_loss": 0.06314805150032043, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030433203210122883, "grad_norm": 7.604430198669434, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8581774234771729, "num_tokens": 571238449.0, "step": 14976 }, { "epoch": 1.90522834245007, "ewc_loss": 0.06332701444625854, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003061217430513352, "grad_norm": 7.646383762359619, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8639086484909058, "num_tokens": 571277266.0, "step": 14977 }, { "epoch": 1.9053555527286605, "ewc_loss": 0.06291154026985168, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003044084005523473, "grad_norm": 7.5918755531311035, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8756328821182251, "num_tokens": 571316891.0, "step": 14978 }, { "epoch": 1.9054827630072508, "ewc_loss": 0.06305594742298126, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003058524744119495, "grad_norm": 7.638282299041748, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8651056289672852, "num_tokens": 571353097.0, "step": 14979 }, { "epoch": 1.9056099732858414, "ewc_loss": 0.06311889737844467, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003040405281353742, "grad_norm": 7.542947292327881, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.877796471118927, "num_tokens": 571392217.0, "step": 14980 }, { "epoch": 1.905737183564432, "ewc_loss": 0.06306244432926178, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030591743416152894, "grad_norm": 7.613106727600098, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8730945587158203, "num_tokens": 571430440.0, "step": 14981 }, { "epoch": 1.9058643938430224, "ewc_loss": 0.06296998262405396, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030499283457174897, "grad_norm": 7.5813374519348145, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8613550662994385, "num_tokens": 571472002.0, "step": 14982 }, { "epoch": 1.905991604121613, "ewc_loss": 0.06303471326828003, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030564010376110673, "grad_norm": 7.649285316467285, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8454322814941406, "num_tokens": 571509368.0, "step": 14983 }, { "epoch": 1.9061188144002035, "ewc_loss": 0.06287588179111481, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030405179131776094, "grad_norm": 7.560613632202148, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.8494356870651245, "num_tokens": 571543898.0, "step": 14984 }, { "epoch": 1.906246024678794, "ewc_loss": 0.06314738094806671, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030676680034957826, "grad_norm": 7.594716548919678, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8579418659210205, "num_tokens": 571584395.0, "step": 14985 }, { "epoch": 1.9063732349573845, "ewc_loss": 0.06294690817594528, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003047620411962271, "grad_norm": 7.61321496963501, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8646814227104187, "num_tokens": 571624792.0, "step": 14986 }, { "epoch": 1.906500445235975, "ewc_loss": 0.06294985115528107, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030479152337647974, "grad_norm": 7.491971492767334, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8669382333755493, "num_tokens": 571667197.0, "step": 14987 }, { "epoch": 1.9066276555145656, "ewc_loss": 0.06334952265024185, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030634680297225714, "grad_norm": 7.60773229598999, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8579981327056885, "num_tokens": 571703648.0, "step": 14988 }, { "epoch": 1.9067548657931561, "ewc_loss": 0.06315763294696808, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030442787101492286, "grad_norm": 7.60037899017334, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8544078469276428, "num_tokens": 571738980.0, "step": 14989 }, { "epoch": 1.9068820760717466, "ewc_loss": 0.06324488669633865, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003053004329558462, "grad_norm": 7.562403678894043, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8659235239028931, "num_tokens": 571774540.0, "step": 14990 }, { "epoch": 1.9070092863503372, "ewc_loss": 0.06331335753202438, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003059851296711713, "grad_norm": 7.610044002532959, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8710812330245972, "num_tokens": 571805286.0, "step": 14991 }, { "epoch": 1.9071364966289277, "ewc_loss": 0.06319773942232132, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003048289509024471, "grad_norm": 7.498744010925293, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8575760126113892, "num_tokens": 571847936.0, "step": 14992 }, { "epoch": 1.9072637069075182, "ewc_loss": 0.06348899751901627, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030774151673540473, "grad_norm": 7.665655136108398, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8593888282775879, "num_tokens": 571885454.0, "step": 14993 }, { "epoch": 1.9073909171861088, "ewc_loss": 0.0631561353802681, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030441294074989855, "grad_norm": 7.5406107902526855, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8628660440444946, "num_tokens": 571924870.0, "step": 14994 }, { "epoch": 1.9075181274646993, "ewc_loss": 0.06350795924663544, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003079311572946608, "grad_norm": 7.676062107086182, "learning_rate": 1e-06, "loss": 0.536, "mean_token_accuracy": 0.8425703048706055, "num_tokens": 571957870.0, "step": 14995 }, { "epoch": 1.9076453377432898, "ewc_loss": 0.06320826709270477, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003049342194572091, "grad_norm": 7.52758264541626, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8548519611358643, "num_tokens": 571993645.0, "step": 14996 }, { "epoch": 1.9077725480218801, "ewc_loss": 0.06347353011369705, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003075868880841881, "grad_norm": 7.673854351043701, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8716446161270142, "num_tokens": 572028594.0, "step": 14997 }, { "epoch": 1.9078997583004706, "ewc_loss": 0.06317075341939926, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003045590710826218, "grad_norm": 7.523777008056641, "learning_rate": 1e-06, "loss": 0.5446, "mean_token_accuracy": 0.8364559412002563, "num_tokens": 572066971.0, "step": 14998 }, { "epoch": 1.9080269685790612, "ewc_loss": 0.06359563022851944, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030880788108333945, "grad_norm": 7.680958271026611, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8690854907035828, "num_tokens": 572107609.0, "step": 14999 }, { "epoch": 1.9081541788576517, "ewc_loss": 0.06310605257749557, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003039120929315686, "grad_norm": 7.516786098480225, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8716263771057129, "num_tokens": 572146940.0, "step": 15000 }, { "epoch": 1.9082813891362422, "ewc_loss": 0.06346466392278671, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030749820871278644, "grad_norm": 7.643279552459717, "learning_rate": 1e-06, "loss": 0.3806, "mean_token_accuracy": 0.8890048265457153, "num_tokens": 572184010.0, "step": 15001 }, { "epoch": 1.9084085994148328, "ewc_loss": 0.06306421011686325, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030349366716109216, "grad_norm": 7.578291893005371, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.847916841506958, "num_tokens": 572223184.0, "step": 15002 }, { "epoch": 1.908535809693423, "ewc_loss": 0.06333020329475403, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030615361174568534, "grad_norm": 7.6542277336120605, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8636652827262878, "num_tokens": 572261770.0, "step": 15003 }, { "epoch": 1.9086630199720136, "ewc_loss": 0.06325238943099976, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003053754917345941, "grad_norm": 7.56493616104126, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8666415810585022, "num_tokens": 572303229.0, "step": 15004 }, { "epoch": 1.9087902302506041, "ewc_loss": 0.0632554218173027, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003054057597182691, "grad_norm": 7.686712741851807, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8516895771026611, "num_tokens": 572337839.0, "step": 15005 }, { "epoch": 1.9089174405291947, "ewc_loss": 0.0631830245256424, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003046817728318274, "grad_norm": 7.586677551269531, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8621563911437988, "num_tokens": 572373604.0, "step": 15006 }, { "epoch": 1.9090446508077852, "ewc_loss": 0.0633578822016716, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003064303600694984, "grad_norm": 7.689180850982666, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8711068630218506, "num_tokens": 572410425.0, "step": 15007 }, { "epoch": 1.9091718610863757, "ewc_loss": 0.06302240490913391, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030307561974041164, "grad_norm": 7.586565017700195, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.873161256313324, "num_tokens": 572443998.0, "step": 15008 }, { "epoch": 1.9092990713649662, "ewc_loss": 0.0632467120885849, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000305318710161373, "grad_norm": 7.639402866363525, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8503222465515137, "num_tokens": 572485337.0, "step": 15009 }, { "epoch": 1.9094262816435568, "ewc_loss": 0.06297960877418518, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030264764791354537, "grad_norm": 7.570047378540039, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.8533864617347717, "num_tokens": 572528273.0, "step": 15010 }, { "epoch": 1.9095534919221473, "ewc_loss": 0.06328156590461731, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003056672285310924, "grad_norm": 7.641158580780029, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8668873906135559, "num_tokens": 572569467.0, "step": 15011 }, { "epoch": 1.9096807022007378, "ewc_loss": 0.06298930943012238, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030274465098045766, "grad_norm": 7.558201789855957, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.871293842792511, "num_tokens": 572609626.0, "step": 15012 }, { "epoch": 1.9098079124793284, "ewc_loss": 0.06324402987957001, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003052918764296919, "grad_norm": 7.665846347808838, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.848085880279541, "num_tokens": 572652907.0, "step": 15013 }, { "epoch": 1.9099351227579189, "ewc_loss": 0.06303921341896057, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003032436943612993, "grad_norm": 7.5735015869140625, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8699706792831421, "num_tokens": 572687460.0, "step": 15014 }, { "epoch": 1.9100623330365094, "ewc_loss": 0.06332696974277496, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003061212773900479, "grad_norm": 7.7135009765625, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8676948547363281, "num_tokens": 572723937.0, "step": 15015 }, { "epoch": 1.9101895433151, "ewc_loss": 0.06301023066043854, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003029538784176111, "grad_norm": 7.495864391326904, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8592238426208496, "num_tokens": 572764929.0, "step": 15016 }, { "epoch": 1.9103167535936905, "ewc_loss": 0.06350650638341904, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030791660537943244, "grad_norm": 7.731494426727295, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8618072271347046, "num_tokens": 572802545.0, "step": 15017 }, { "epoch": 1.910443963872281, "ewc_loss": 0.06299245357513428, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003027760540135205, "grad_norm": 7.500966548919678, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8583600521087646, "num_tokens": 572850008.0, "step": 15018 }, { "epoch": 1.9105711741508715, "ewc_loss": 0.06337647885084152, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003090577374678105, "grad_norm": 7.693181037902832, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8718901872634888, "num_tokens": 572888968.0, "step": 15019 }, { "epoch": 1.910698384429462, "ewc_loss": 0.06300809979438782, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003029325744137168, "grad_norm": 7.5175323486328125, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8615943789482117, "num_tokens": 572926314.0, "step": 15020 }, { "epoch": 1.9108255947080524, "ewc_loss": 0.0635613426566124, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003084650088567287, "grad_norm": 7.705878257751465, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8683842420578003, "num_tokens": 572961789.0, "step": 15021 }, { "epoch": 1.9109528049866429, "ewc_loss": 0.06309198588132858, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030377143411897123, "grad_norm": 7.595615863800049, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8650559186935425, "num_tokens": 572993680.0, "step": 15022 }, { "epoch": 1.9110800152652334, "ewc_loss": 0.06340928375720978, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003069444210268557, "grad_norm": 7.669312477111816, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8657170534133911, "num_tokens": 573024003.0, "step": 15023 }, { "epoch": 1.911207225543824, "ewc_loss": 0.06319699436426163, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030482152942568064, "grad_norm": 7.537665843963623, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8711843490600586, "num_tokens": 573055138.0, "step": 15024 }, { "epoch": 1.9113344358224145, "ewc_loss": 0.06353899091482162, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030824149143882096, "grad_norm": 7.717410564422607, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8676842451095581, "num_tokens": 573092373.0, "step": 15025 }, { "epoch": 1.911461646101005, "ewc_loss": 0.06314487010240555, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030430027982220054, "grad_norm": 7.5276618003845215, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8606759309768677, "num_tokens": 573131486.0, "step": 15026 }, { "epoch": 1.9115888563795955, "ewc_loss": 0.06361221522092819, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003089737147092819, "grad_norm": 7.677403926849365, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8738618493080139, "num_tokens": 573170114.0, "step": 15027 }, { "epoch": 1.9117160666581858, "ewc_loss": 0.06317342072725296, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003045857883989811, "grad_norm": 7.598947048187256, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8666861057281494, "num_tokens": 573200906.0, "step": 15028 }, { "epoch": 1.9118432769367764, "ewc_loss": 0.06346066296100616, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030745816184207797, "grad_norm": 7.582391738891602, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8527097702026367, "num_tokens": 573243731.0, "step": 15029 }, { "epoch": 1.9119704872153669, "ewc_loss": 0.06374666094779968, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030543538741767406, "grad_norm": 7.578888416290283, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8704921007156372, "num_tokens": 573283335.0, "step": 15030 }, { "epoch": 1.9120976974939574, "ewc_loss": 0.06329545378684998, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030580611201003194, "grad_norm": 7.554281711578369, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8509845733642578, "num_tokens": 573326628.0, "step": 15031 }, { "epoch": 1.912224907772548, "ewc_loss": 0.06337032467126846, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003065548080485314, "grad_norm": 7.580644130706787, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.869519829750061, "num_tokens": 573369025.0, "step": 15032 }, { "epoch": 1.9123521180511385, "ewc_loss": 0.06327769160270691, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030562846222892404, "grad_norm": 7.577338218688965, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8559874296188354, "num_tokens": 573411923.0, "step": 15033 }, { "epoch": 1.912479328329729, "ewc_loss": 0.06345122307538986, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003073637781199068, "grad_norm": 7.648587703704834, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8583282232284546, "num_tokens": 573450455.0, "step": 15034 }, { "epoch": 1.9126065386083195, "ewc_loss": 0.06327679753303528, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003056195564568043, "grad_norm": 7.551562786102295, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8554311394691467, "num_tokens": 573492886.0, "step": 15035 }, { "epoch": 1.91273374888691, "ewc_loss": 0.0635349303483963, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003082008915953338, "grad_norm": 7.655399799346924, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8692212104797363, "num_tokens": 573532073.0, "step": 15036 }, { "epoch": 1.9128609591655006, "ewc_loss": 0.06323752552270889, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003052268293686211, "grad_norm": 7.508705139160156, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8801999688148499, "num_tokens": 573573632.0, "step": 15037 }, { "epoch": 1.9129881694440911, "ewc_loss": 0.0636184811592102, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003090364043600857, "grad_norm": 7.6860785484313965, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8735825419425964, "num_tokens": 573610194.0, "step": 15038 }, { "epoch": 1.9131153797226816, "ewc_loss": 0.06327135115861893, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030556507408618927, "grad_norm": 7.735671520233154, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8554261922836304, "num_tokens": 573650237.0, "step": 15039 }, { "epoch": 1.9132425900012722, "ewc_loss": 0.06338731944561005, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030672477441839874, "grad_norm": 7.556992053985596, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8669525980949402, "num_tokens": 573686859.0, "step": 15040 }, { "epoch": 1.9133698002798627, "ewc_loss": 0.06347448378801346, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000307596375932917, "grad_norm": 7.615683078765869, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8630998134613037, "num_tokens": 573726770.0, "step": 15041 }, { "epoch": 1.9134970105584532, "ewc_loss": 0.06334097683429718, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003062613250222057, "grad_norm": 7.604752540588379, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8582196235656738, "num_tokens": 573768668.0, "step": 15042 }, { "epoch": 1.9136242208370438, "ewc_loss": 0.0634315237402916, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030716677429154515, "grad_norm": 7.583864688873291, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8543415665626526, "num_tokens": 573807880.0, "step": 15043 }, { "epoch": 1.9137514311156343, "ewc_loss": 0.06332236528396606, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030607517692260444, "grad_norm": 7.629334449768066, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8616436123847961, "num_tokens": 573847018.0, "step": 15044 }, { "epoch": 1.9138786413942248, "ewc_loss": 0.06327962130308151, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030564775806851685, "grad_norm": 7.580715179443359, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8797037601470947, "num_tokens": 573881193.0, "step": 15045 }, { "epoch": 1.9140058516728151, "ewc_loss": 0.06348669528961182, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003077185247093439, "grad_norm": 7.61400842666626, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8723585605621338, "num_tokens": 573918457.0, "step": 15046 }, { "epoch": 1.9141330619514056, "ewc_loss": 0.0633264034986496, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030611560214310884, "grad_norm": 7.5554423332214355, "learning_rate": 1e-06, "loss": 0.3976, "mean_token_accuracy": 0.8843889236450195, "num_tokens": 573958818.0, "step": 15047 }, { "epoch": 1.9142602722299962, "ewc_loss": 0.0634794533252716, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030764611437916756, "grad_norm": 7.679487705230713, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8592563271522522, "num_tokens": 573996735.0, "step": 15048 }, { "epoch": 1.9143874825085867, "ewc_loss": 0.06328253448009491, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003056768618989736, "grad_norm": 7.58245849609375, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8680984377861023, "num_tokens": 574035005.0, "step": 15049 }, { "epoch": 1.9145146927871772, "ewc_loss": 0.06341201812028885, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003069717204198241, "grad_norm": 7.563485622406006, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8631365895271301, "num_tokens": 574077817.0, "step": 15050 }, { "epoch": 1.9146419030657678, "ewc_loss": 0.06333091855049133, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003061607130803168, "grad_norm": 7.647480487823486, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8706969022750854, "num_tokens": 574113083.0, "step": 15051 }, { "epoch": 1.914769113344358, "ewc_loss": 0.06327886134386063, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003056401910725981, "grad_norm": 7.575765132904053, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8687887191772461, "num_tokens": 574147324.0, "step": 15052 }, { "epoch": 1.9148963236229486, "ewc_loss": 0.06342122703790665, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003070638340432197, "grad_norm": 7.622326374053955, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8797659873962402, "num_tokens": 574186108.0, "step": 15053 }, { "epoch": 1.9150235339015391, "ewc_loss": 0.0632687360048294, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003055388806387782, "grad_norm": 7.628985404968262, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8755059242248535, "num_tokens": 574218076.0, "step": 15054 }, { "epoch": 1.9151507441801296, "ewc_loss": 0.0633627325296402, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030647884705103934, "grad_norm": 7.655874729156494, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8748443722724915, "num_tokens": 574256416.0, "step": 15055 }, { "epoch": 1.9152779544587202, "ewc_loss": 0.06320691108703613, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003049207152798772, "grad_norm": 7.564513206481934, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8771266341209412, "num_tokens": 574295262.0, "step": 15056 }, { "epoch": 1.9154051647373107, "ewc_loss": 0.06338122487068176, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003066638018935919, "grad_norm": 7.664148330688477, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8531653881072998, "num_tokens": 574329836.0, "step": 15057 }, { "epoch": 1.9155323750159012, "ewc_loss": 0.06314890086650848, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003043405886273831, "grad_norm": 7.585925579071045, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8662151098251343, "num_tokens": 574363189.0, "step": 15058 }, { "epoch": 1.9156595852944918, "ewc_loss": 0.0633024275302887, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003058758156839758, "grad_norm": 7.674826145172119, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8731212615966797, "num_tokens": 574396374.0, "step": 15059 }, { "epoch": 1.9157867955730823, "ewc_loss": 0.06304198503494263, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003032714012078941, "grad_norm": 7.629174709320068, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8668181896209717, "num_tokens": 574435488.0, "step": 15060 }, { "epoch": 1.9159140058516728, "ewc_loss": 0.06323336064815521, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030518515268340707, "grad_norm": 7.587388515472412, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8642114400863647, "num_tokens": 574475261.0, "step": 15061 }, { "epoch": 1.9160412161302633, "ewc_loss": 0.06317019462585449, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003045534831471741, "grad_norm": 7.659515380859375, "learning_rate": 1e-06, "loss": 0.3932, "mean_token_accuracy": 0.8841508030891418, "num_tokens": 574513531.0, "step": 15062 }, { "epoch": 1.9161684264088539, "ewc_loss": 0.06309226900339127, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030377425719052553, "grad_norm": 7.646572113037109, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8508552312850952, "num_tokens": 574551736.0, "step": 15063 }, { "epoch": 1.9162956366874444, "ewc_loss": 0.0630994588136673, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003038461145479232, "grad_norm": 7.60299015045166, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8581488728523254, "num_tokens": 574589978.0, "step": 15064 }, { "epoch": 1.916422846966035, "ewc_loss": 0.0631873682141304, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003047252248506993, "grad_norm": 7.65243673324585, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8434821367263794, "num_tokens": 574628845.0, "step": 15065 }, { "epoch": 1.9165500572446255, "ewc_loss": 0.06300950795412064, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003029466315638274, "grad_norm": 7.586738109588623, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8619651794433594, "num_tokens": 574660700.0, "step": 15066 }, { "epoch": 1.916677267523216, "ewc_loss": 0.06329115480184555, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030576312565244734, "grad_norm": 7.657099723815918, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8573485612869263, "num_tokens": 574692195.0, "step": 15067 }, { "epoch": 1.9168044778018065, "ewc_loss": 0.06300978362560272, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030294936732389033, "grad_norm": 7.606375217437744, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8538224101066589, "num_tokens": 574732462.0, "step": 15068 }, { "epoch": 1.916931688080397, "ewc_loss": 0.06320367753505707, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030488838092423975, "grad_norm": 7.584817409515381, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8715656995773315, "num_tokens": 574769353.0, "step": 15069 }, { "epoch": 1.9170588983589874, "ewc_loss": 0.063126340508461, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003041149757336825, "grad_norm": 7.635536193847656, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8644751310348511, "num_tokens": 574804958.0, "step": 15070 }, { "epoch": 1.9171861086375779, "ewc_loss": 0.06309649348258972, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003038165159523487, "grad_norm": 7.591808795928955, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8635134696960449, "num_tokens": 574843966.0, "step": 15071 }, { "epoch": 1.9173133189161684, "ewc_loss": 0.06322520971298218, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030510369106195867, "grad_norm": 7.6308369636535645, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8698382377624512, "num_tokens": 574882916.0, "step": 15072 }, { "epoch": 1.917440529194759, "ewc_loss": 0.06311161816120148, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003039677394554019, "grad_norm": 7.62304162979126, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8719940185546875, "num_tokens": 574922154.0, "step": 15073 }, { "epoch": 1.9175677394733495, "ewc_loss": 0.06315325200557709, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003043841279577464, "grad_norm": 7.640674591064453, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8609643578529358, "num_tokens": 574956894.0, "step": 15074 }, { "epoch": 1.91769494975194, "ewc_loss": 0.06316543370485306, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030450589838437736, "grad_norm": 7.583900451660156, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8789148926734924, "num_tokens": 574998343.0, "step": 15075 }, { "epoch": 1.9178221600305305, "ewc_loss": 0.06328026950359344, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030565427732653916, "grad_norm": 7.6681904792785645, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8728992342948914, "num_tokens": 575036689.0, "step": 15076 }, { "epoch": 1.9179493703091208, "ewc_loss": 0.06310959160327911, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030394751229323447, "grad_norm": 7.576417446136475, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8596442937850952, "num_tokens": 575075510.0, "step": 15077 }, { "epoch": 1.9180765805877114, "ewc_loss": 0.06328675150871277, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003057190333493054, "grad_norm": 7.646448612213135, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8608596324920654, "num_tokens": 575115797.0, "step": 15078 }, { "epoch": 1.9182037908663019, "ewc_loss": 0.06317371129989624, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030458869878202677, "grad_norm": 7.644720077514648, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8506238460540771, "num_tokens": 575155240.0, "step": 15079 }, { "epoch": 1.9183310011448924, "ewc_loss": 0.0632505789399147, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030535736004821956, "grad_norm": 7.660614490509033, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8527397513389587, "num_tokens": 575191048.0, "step": 15080 }, { "epoch": 1.918458211423483, "ewc_loss": 0.06314946711063385, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003043462347704917, "grad_norm": 7.687215328216553, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8528841733932495, "num_tokens": 575227778.0, "step": 15081 }, { "epoch": 1.9185854217020735, "ewc_loss": 0.0631818026304245, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003046695492230356, "grad_norm": 7.613484859466553, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8654381632804871, "num_tokens": 575269774.0, "step": 15082 }, { "epoch": 1.918712631980664, "ewc_loss": 0.06316731870174408, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003045247576665133, "grad_norm": 7.612610340118408, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8544837832450867, "num_tokens": 575310710.0, "step": 15083 }, { "epoch": 1.9188398422592545, "ewc_loss": 0.06308721005916595, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003037237038370222, "grad_norm": 7.597860813140869, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8716531991958618, "num_tokens": 575344073.0, "step": 15084 }, { "epoch": 1.918967052537845, "ewc_loss": 0.06318160891532898, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030466768657788634, "grad_norm": 7.59367561340332, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8541179895401001, "num_tokens": 575386737.0, "step": 15085 }, { "epoch": 1.9190942628164356, "ewc_loss": 0.06310297548770905, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030388133018277586, "grad_norm": 7.571760177612305, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8683032989501953, "num_tokens": 575424287.0, "step": 15086 }, { "epoch": 1.919221473095026, "ewc_loss": 0.06323204934597015, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030517205595970154, "grad_norm": 7.6168975830078125, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8656225204467773, "num_tokens": 575463859.0, "step": 15087 }, { "epoch": 1.9193486833736166, "ewc_loss": 0.06323175132274628, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030516908736899495, "grad_norm": 7.606406211853027, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8585951328277588, "num_tokens": 575500838.0, "step": 15088 }, { "epoch": 1.9194758936522072, "ewc_loss": 0.06328950077295303, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003057465946767479, "grad_norm": 7.611927509307861, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8571092486381531, "num_tokens": 575535164.0, "step": 15089 }, { "epoch": 1.9196031039307977, "ewc_loss": 0.06326603889465332, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003055119886994362, "grad_norm": 7.612849235534668, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.870441198348999, "num_tokens": 575569493.0, "step": 15090 }, { "epoch": 1.9197303142093882, "ewc_loss": 0.06326572597026825, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030550878727808595, "grad_norm": 7.634873390197754, "learning_rate": 1e-06, "loss": 0.5195, "mean_token_accuracy": 0.8476947546005249, "num_tokens": 575603396.0, "step": 15091 }, { "epoch": 1.9198575244879788, "ewc_loss": 0.06315852701663971, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000304436864098534, "grad_norm": 7.605732440948486, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8705794811248779, "num_tokens": 575638255.0, "step": 15092 }, { "epoch": 1.9199847347665693, "ewc_loss": 0.06379275023937225, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.000305896217469126, "grad_norm": 7.618348598480225, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8539330959320068, "num_tokens": 575685281.0, "step": 15093 }, { "epoch": 1.9201119450451598, "ewc_loss": 0.06317058205604553, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003045573830604553, "grad_norm": 7.579436302185059, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8705109357833862, "num_tokens": 575725030.0, "step": 15094 }, { "epoch": 1.9202391553237501, "ewc_loss": 0.0633099302649498, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030595087446272373, "grad_norm": 7.636403560638428, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8585100769996643, "num_tokens": 575764237.0, "step": 15095 }, { "epoch": 1.9203663656023406, "ewc_loss": 0.06305612623691559, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003034128458239138, "grad_norm": 7.562572002410889, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.862147331237793, "num_tokens": 575796732.0, "step": 15096 }, { "epoch": 1.9204935758809312, "ewc_loss": 0.06337806582450867, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003066322533413768, "grad_norm": 7.622000217437744, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.869530200958252, "num_tokens": 575833133.0, "step": 15097 }, { "epoch": 1.9206207861595217, "ewc_loss": 0.0631522461771965, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003043740289285779, "grad_norm": 7.563612937927246, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8597692251205444, "num_tokens": 575870920.0, "step": 15098 }, { "epoch": 1.9207479964381122, "ewc_loss": 0.06340394169092178, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003068909572903067, "grad_norm": 7.633598327636719, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8674644231796265, "num_tokens": 575909132.0, "step": 15099 }, { "epoch": 1.9208752067167028, "ewc_loss": 0.06325359642505646, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030538751161657274, "grad_norm": 7.581475257873535, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8645122647285461, "num_tokens": 575947979.0, "step": 15100 }, { "epoch": 1.921002416995293, "ewc_loss": 0.06340823322534561, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003069339145440608, "grad_norm": 7.662921905517578, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.853063702583313, "num_tokens": 575988888.0, "step": 15101 }, { "epoch": 1.9211296272738836, "ewc_loss": 0.06323157995939255, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030516734113916755, "grad_norm": 7.587985515594482, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8662756681442261, "num_tokens": 576028614.0, "step": 15102 }, { "epoch": 1.9212568375524741, "ewc_loss": 0.0634889155626297, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003077407309319824, "grad_norm": 7.69320011138916, "learning_rate": 1e-06, "loss": 0.3748, "mean_token_accuracy": 0.8922513723373413, "num_tokens": 576066648.0, "step": 15103 }, { "epoch": 1.9213840478310646, "ewc_loss": 0.06314176321029663, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003042691678274423, "grad_norm": 7.528440952301025, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8607739210128784, "num_tokens": 576104045.0, "step": 15104 }, { "epoch": 1.9215112581096552, "ewc_loss": 0.06347033381462097, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030755490297451615, "grad_norm": 7.665961742401123, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8582783937454224, "num_tokens": 576145419.0, "step": 15105 }, { "epoch": 1.9216384683882457, "ewc_loss": 0.0633959099650383, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000304369255900383, "grad_norm": 7.555147171020508, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8603452444076538, "num_tokens": 576191410.0, "step": 15106 }, { "epoch": 1.9217656786668362, "ewc_loss": 0.06347598880529404, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003076114517170936, "grad_norm": 7.7046990394592285, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8593332767486572, "num_tokens": 576226383.0, "step": 15107 }, { "epoch": 1.9218928889454268, "ewc_loss": 0.06316609680652618, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030451256316155195, "grad_norm": 7.585890769958496, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8568413257598877, "num_tokens": 576259958.0, "step": 15108 }, { "epoch": 1.9220200992240173, "ewc_loss": 0.06395024061203003, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003074712003581226, "grad_norm": 7.672483921051025, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8672914505004883, "num_tokens": 576299467.0, "step": 15109 }, { "epoch": 1.9221473095026078, "ewc_loss": 0.06318413466215134, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003046928904950619, "grad_norm": 7.594823837280273, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.858768105506897, "num_tokens": 576334137.0, "step": 15110 }, { "epoch": 1.9222745197811983, "ewc_loss": 0.0633033961057663, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003058855072595179, "grad_norm": 7.607875823974609, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8601688146591187, "num_tokens": 576379582.0, "step": 15111 }, { "epoch": 1.9224017300597889, "ewc_loss": 0.06332249939441681, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003060766030102968, "grad_norm": 7.63020658493042, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8784183263778687, "num_tokens": 576414590.0, "step": 15112 }, { "epoch": 1.9225289403383794, "ewc_loss": 0.06328341364860535, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000305685680359602, "grad_norm": 7.701801776885986, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.854002833366394, "num_tokens": 576445023.0, "step": 15113 }, { "epoch": 1.92265615061697, "ewc_loss": 0.06329616159200668, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003058131842408329, "grad_norm": 7.657418251037598, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8598901033401489, "num_tokens": 576477861.0, "step": 15114 }, { "epoch": 1.9227833608955605, "ewc_loss": 0.06329132616519928, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030576481367461383, "grad_norm": 7.671064853668213, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.861095130443573, "num_tokens": 576513121.0, "step": 15115 }, { "epoch": 1.922910571174151, "ewc_loss": 0.06333557516336441, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000306207308312878, "grad_norm": 7.651719093322754, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.869947612285614, "num_tokens": 576551039.0, "step": 15116 }, { "epoch": 1.9230377814527415, "ewc_loss": 0.06313709914684296, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003042225434910506, "grad_norm": 7.584332466125488, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8567321300506592, "num_tokens": 576590651.0, "step": 15117 }, { "epoch": 1.923164991731332, "ewc_loss": 0.0634126365184784, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003069779195357114, "grad_norm": 7.66378116607666, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8561387062072754, "num_tokens": 576629119.0, "step": 15118 }, { "epoch": 1.9232922020099223, "ewc_loss": 0.0634470134973526, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030488031916320324, "grad_norm": 7.589928150177002, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.872225284576416, "num_tokens": 576664240.0, "step": 15119 }, { "epoch": 1.9234194122885129, "ewc_loss": 0.0636902004480362, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003073121188208461, "grad_norm": 7.671080112457275, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8577548861503601, "num_tokens": 576701946.0, "step": 15120 }, { "epoch": 1.9235466225671034, "ewc_loss": 0.06348029524087906, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000305213121464476, "grad_norm": 7.595741271972656, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8746669292449951, "num_tokens": 576733541.0, "step": 15121 }, { "epoch": 1.923673832845694, "ewc_loss": 0.06364952772855759, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030690545099787414, "grad_norm": 7.651116847991943, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8494529724121094, "num_tokens": 576770930.0, "step": 15122 }, { "epoch": 1.9238010431242845, "ewc_loss": 0.06351927667856216, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000305602909065783, "grad_norm": 7.625659465789795, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8650631308555603, "num_tokens": 576807455.0, "step": 15123 }, { "epoch": 1.923928253402875, "ewc_loss": 0.06363434344530106, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003067535872105509, "grad_norm": 7.612898826599121, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.87138831615448, "num_tokens": 576849577.0, "step": 15124 }, { "epoch": 1.9240554636814655, "ewc_loss": 0.06350807845592499, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003054909175261855, "grad_norm": 7.646565914154053, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8565425872802734, "num_tokens": 576883506.0, "step": 15125 }, { "epoch": 1.9241826739600558, "ewc_loss": 0.06354443728923798, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003058545116800815, "grad_norm": 7.625920295715332, "learning_rate": 1e-06, "loss": 0.5148, "mean_token_accuracy": 0.8473529815673828, "num_tokens": 576923892.0, "step": 15126 }, { "epoch": 1.9243098842386464, "ewc_loss": 0.06351511925458908, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003055613487958908, "grad_norm": 7.585545063018799, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8809106349945068, "num_tokens": 576970832.0, "step": 15127 }, { "epoch": 1.9244370945172369, "ewc_loss": 0.0635594055056572, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030600419268012047, "grad_norm": 7.637557029724121, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.845116376876831, "num_tokens": 577006990.0, "step": 15128 }, { "epoch": 1.9245643047958274, "ewc_loss": 0.06347789615392685, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030518913990817964, "grad_norm": 7.667498588562012, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8693327307701111, "num_tokens": 577044932.0, "step": 15129 }, { "epoch": 1.924691515074418, "ewc_loss": 0.06340748071670532, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030448497273027897, "grad_norm": 7.58701753616333, "learning_rate": 1e-06, "loss": 0.5183, "mean_token_accuracy": 0.8473896384239197, "num_tokens": 577076217.0, "step": 15130 }, { "epoch": 1.9248187253530085, "ewc_loss": 0.06356100738048553, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003060202579945326, "grad_norm": 7.725351333618164, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8800151348114014, "num_tokens": 577112696.0, "step": 15131 }, { "epoch": 1.924945935631599, "ewc_loss": 0.06361298263072968, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003040985611733049, "grad_norm": 7.554150581359863, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8638440370559692, "num_tokens": 577156397.0, "step": 15132 }, { "epoch": 1.9250731459101895, "ewc_loss": 0.06392111629247665, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003071799292229116, "grad_norm": 7.689905643463135, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8683512806892395, "num_tokens": 577194192.0, "step": 15133 }, { "epoch": 1.92520035618878, "ewc_loss": 0.0635545551776886, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030351427267305553, "grad_norm": 7.559156894683838, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8586819171905518, "num_tokens": 577232639.0, "step": 15134 }, { "epoch": 1.9253275664673706, "ewc_loss": 0.06403598189353943, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030832859920337796, "grad_norm": 7.796421527862549, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8726098537445068, "num_tokens": 577267349.0, "step": 15135 }, { "epoch": 1.925454776745961, "ewc_loss": 0.06352894008159637, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003032581298612058, "grad_norm": 7.516541481018066, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8597248792648315, "num_tokens": 577308329.0, "step": 15136 }, { "epoch": 1.9255819870245516, "ewc_loss": 0.0640190839767456, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003081595932599157, "grad_norm": 7.731076717376709, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8654934763908386, "num_tokens": 577347825.0, "step": 15137 }, { "epoch": 1.9257091973031422, "ewc_loss": 0.06348206102848053, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030278932536020875, "grad_norm": 7.570836067199707, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8642593026161194, "num_tokens": 577379981.0, "step": 15138 }, { "epoch": 1.9258364075817327, "ewc_loss": 0.06398358941078186, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030780467204749584, "grad_norm": 7.675865650177002, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8682956099510193, "num_tokens": 577414134.0, "step": 15139 }, { "epoch": 1.9259636178603232, "ewc_loss": 0.06353659927845001, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030333478935062885, "grad_norm": 7.54600715637207, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8624645471572876, "num_tokens": 577451170.0, "step": 15140 }, { "epoch": 1.9260908281389137, "ewc_loss": 0.06396013498306274, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003075700660701841, "grad_norm": 7.6686811447143555, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8491852283477783, "num_tokens": 577487447.0, "step": 15141 }, { "epoch": 1.9262180384175043, "ewc_loss": 0.06365065276622772, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003044752520509064, "grad_norm": 7.564418315887451, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8683632612228394, "num_tokens": 577525732.0, "step": 15142 }, { "epoch": 1.9263452486960948, "ewc_loss": 0.06397444009780884, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030771311139687896, "grad_norm": 7.7453932762146, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8628129959106445, "num_tokens": 577563303.0, "step": 15143 }, { "epoch": 1.926472458974685, "ewc_loss": 0.06368793547153473, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003048481303267181, "grad_norm": 7.535910606384277, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8769075274467468, "num_tokens": 577607592.0, "step": 15144 }, { "epoch": 1.9265996692532756, "ewc_loss": 0.06394660472869873, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003074347914662212, "grad_norm": 7.691199779510498, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8550003170967102, "num_tokens": 577651646.0, "step": 15145 }, { "epoch": 1.9267268795318662, "ewc_loss": 0.06367021799087524, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030467091710306704, "grad_norm": 7.59983491897583, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8719456791877747, "num_tokens": 577689530.0, "step": 15146 }, { "epoch": 1.9268540898104567, "ewc_loss": 0.063897505402565, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030694378074258566, "grad_norm": 7.684610366821289, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8572278022766113, "num_tokens": 577732524.0, "step": 15147 }, { "epoch": 1.9269813000890472, "ewc_loss": 0.06347411870956421, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030515133403241634, "grad_norm": 7.602367877960205, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8683643341064453, "num_tokens": 577767164.0, "step": 15148 }, { "epoch": 1.9271085103676378, "ewc_loss": 0.06360551714897156, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003064652846660465, "grad_norm": 7.650759220123291, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8709660768508911, "num_tokens": 577802347.0, "step": 15149 }, { "epoch": 1.927235720646228, "ewc_loss": 0.06356215476989746, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003060316957999021, "grad_norm": 7.594137191772461, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8560178279876709, "num_tokens": 577842899.0, "step": 15150 }, { "epoch": 1.9273629309248186, "ewc_loss": 0.06357160210609436, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003061261377297342, "grad_norm": 7.634092330932617, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8641915917396545, "num_tokens": 577885137.0, "step": 15151 }, { "epoch": 1.9274901412034091, "ewc_loss": 0.0635749101638794, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003061592869926244, "grad_norm": 7.582606315612793, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.8768343925476074, "num_tokens": 577925727.0, "step": 15152 }, { "epoch": 1.9276173514819996, "ewc_loss": 0.06365358829498291, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030694599263370037, "grad_norm": 7.5890350341796875, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8682816028594971, "num_tokens": 577965399.0, "step": 15153 }, { "epoch": 1.9277445617605902, "ewc_loss": 0.06370849907398224, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030749518191441894, "grad_norm": 7.646117210388184, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8629399538040161, "num_tokens": 578007608.0, "step": 15154 }, { "epoch": 1.9278717720391807, "ewc_loss": 0.06365589052438736, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030696907197125256, "grad_norm": 7.612738609313965, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8739928007125854, "num_tokens": 578043425.0, "step": 15155 }, { "epoch": 1.9279989823177712, "ewc_loss": 0.06368871033191681, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003072972467634827, "grad_norm": 7.67962121963501, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8603546023368835, "num_tokens": 578075838.0, "step": 15156 }, { "epoch": 1.9281261925963618, "ewc_loss": 0.06361262500286102, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003065364435315132, "grad_norm": 7.570868968963623, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8592279553413391, "num_tokens": 578110113.0, "step": 15157 }, { "epoch": 1.9282534028749523, "ewc_loss": 0.06377208232879639, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030813098419457674, "grad_norm": 7.686765670776367, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8641613721847534, "num_tokens": 578145743.0, "step": 15158 }, { "epoch": 1.9283806131535428, "ewc_loss": 0.06352042406797409, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000305614375974983, "grad_norm": 7.619760990142822, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8709204792976379, "num_tokens": 578181588.0, "step": 15159 }, { "epoch": 1.9285078234321333, "ewc_loss": 0.06318150460720062, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030710804276168346, "grad_norm": 7.609105587005615, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8616588711738586, "num_tokens": 578222048.0, "step": 15160 }, { "epoch": 1.9286350337107239, "ewc_loss": 0.06311362981796265, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003064292250201106, "grad_norm": 7.64451265335083, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8662394285202026, "num_tokens": 578262664.0, "step": 15161 }, { "epoch": 1.9287622439893144, "ewc_loss": 0.06362943351268768, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003067044890485704, "grad_norm": 7.634622097015381, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8538304567337036, "num_tokens": 578303431.0, "step": 15162 }, { "epoch": 1.928889454267905, "ewc_loss": 0.06318146735429764, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030710763530805707, "grad_norm": 7.658562660217285, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8541104793548584, "num_tokens": 578338978.0, "step": 15163 }, { "epoch": 1.9290166645464955, "ewc_loss": 0.06316133588552475, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030690632411278784, "grad_norm": 7.626187801361084, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8600379228591919, "num_tokens": 578376340.0, "step": 15164 }, { "epoch": 1.929143874825086, "ewc_loss": 0.06377911567687988, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003082012990489602, "grad_norm": 7.681370258331299, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.864849328994751, "num_tokens": 578414714.0, "step": 15165 }, { "epoch": 1.9292710851036765, "ewc_loss": 0.06358765065670013, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000306286703562364, "grad_norm": 7.650323390960693, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8495935797691345, "num_tokens": 578453668.0, "step": 15166 }, { "epoch": 1.929398295382267, "ewc_loss": 0.06377217918634415, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003081319446209818, "grad_norm": 7.671977519989014, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8567416071891785, "num_tokens": 578494148.0, "step": 15167 }, { "epoch": 1.9295255056608573, "ewc_loss": 0.06358899176120758, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003063000622205436, "grad_norm": 7.672277450561523, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8435446619987488, "num_tokens": 578530743.0, "step": 15168 }, { "epoch": 1.9296527159394479, "ewc_loss": 0.06335355341434479, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003063871117774397, "grad_norm": 7.613702774047852, "learning_rate": 1e-06, "loss": 0.3936, "mean_token_accuracy": 0.8850249648094177, "num_tokens": 578567613.0, "step": 15169 }, { "epoch": 1.9297799262180384, "ewc_loss": 0.06372614949941635, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003076716384384781, "grad_norm": 7.707025527954102, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8619383573532104, "num_tokens": 578602981.0, "step": 15170 }, { "epoch": 1.929907136496629, "ewc_loss": 0.06324658542871475, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030531740048900247, "grad_norm": 7.635490894317627, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8573791980743408, "num_tokens": 578641027.0, "step": 15171 }, { "epoch": 1.9300343467752195, "ewc_loss": 0.06344826519489288, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003073342377319932, "grad_norm": 7.649163246154785, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8774228096008301, "num_tokens": 578683006.0, "step": 15172 }, { "epoch": 1.93016155705381, "ewc_loss": 0.06371559202671051, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.000305124674923718, "grad_norm": 7.66005802154541, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8671810626983643, "num_tokens": 578716722.0, "step": 15173 }, { "epoch": 1.9302887673324005, "ewc_loss": 0.0636313185095787, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030672334833070636, "grad_norm": 7.662481307983398, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.844359278678894, "num_tokens": 578750740.0, "step": 15174 }, { "epoch": 1.9304159776109908, "ewc_loss": 0.06353073567152023, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003057174908462912, "grad_norm": 7.621417999267578, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.865552544593811, "num_tokens": 578791358.0, "step": 15175 }, { "epoch": 1.9305431878895813, "ewc_loss": 0.06363704055547714, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003067805664613843, "grad_norm": 7.619111061096191, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8477233648300171, "num_tokens": 578834514.0, "step": 15176 }, { "epoch": 1.9306703981681719, "ewc_loss": 0.0636620968580246, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030703109223395586, "grad_norm": 7.627118110656738, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8563973307609558, "num_tokens": 578875869.0, "step": 15177 }, { "epoch": 1.9307976084467624, "ewc_loss": 0.0636477842926979, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003068879886996001, "grad_norm": 7.670618534088135, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.867945671081543, "num_tokens": 578921079.0, "step": 15178 }, { "epoch": 1.930924818725353, "ewc_loss": 0.06370659172534943, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030503468587994576, "grad_norm": 7.624011993408203, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8617503643035889, "num_tokens": 578959837.0, "step": 15179 }, { "epoch": 1.9310520290039435, "ewc_loss": 0.06389330327510834, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030690181301906705, "grad_norm": 7.676316261291504, "learning_rate": 1e-06, "loss": 0.5668, "mean_token_accuracy": 0.8355339765548706, "num_tokens": 579001629.0, "step": 15180 }, { "epoch": 1.931179239282534, "ewc_loss": 0.06377648562192917, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030573358526453376, "grad_norm": 7.62827205657959, "learning_rate": 1e-06, "loss": 0.3926, "mean_token_accuracy": 0.886804461479187, "num_tokens": 579036762.0, "step": 15181 }, { "epoch": 1.9313064495611245, "ewc_loss": 0.06386788189411163, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003066475910600275, "grad_norm": 7.661558628082275, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8422356843948364, "num_tokens": 579078455.0, "step": 15182 }, { "epoch": 1.931433659839715, "ewc_loss": 0.06397552788257599, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.00030528262141160667, "grad_norm": 7.712516784667969, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8658508062362671, "num_tokens": 579116942.0, "step": 15183 }, { "epoch": 1.9315608701183056, "ewc_loss": 0.06379005312919617, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030586932552978396, "grad_norm": 7.659382343292236, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8774349093437195, "num_tokens": 579155180.0, "step": 15184 }, { "epoch": 1.931688080396896, "ewc_loss": 0.06353460252285004, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030575619894079864, "grad_norm": 7.704202175140381, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8566969633102417, "num_tokens": 579194336.0, "step": 15185 }, { "epoch": 1.9318152906754866, "ewc_loss": 0.06334447860717773, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003038549330085516, "grad_norm": 7.63832950592041, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.859014630317688, "num_tokens": 579226609.0, "step": 15186 }, { "epoch": 1.9319425009540772, "ewc_loss": 0.06350002437829971, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030541038722731173, "grad_norm": 7.737375736236572, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8674334287643433, "num_tokens": 579259288.0, "step": 15187 }, { "epoch": 1.9320697112326677, "ewc_loss": 0.06334459036588669, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003038560680579394, "grad_norm": 7.596811771392822, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8721683025360107, "num_tokens": 579294797.0, "step": 15188 }, { "epoch": 1.9321969215112582, "ewc_loss": 0.06347459554672241, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003051561361644417, "grad_norm": 7.676153659820557, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8739098906517029, "num_tokens": 579334352.0, "step": 15189 }, { "epoch": 1.9323241317898487, "ewc_loss": 0.06368428468704224, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030481157591566443, "grad_norm": 7.596058368682861, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8532597422599792, "num_tokens": 579379618.0, "step": 15190 }, { "epoch": 1.9324513420684393, "ewc_loss": 0.06384287774562836, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030639750184491277, "grad_norm": 7.73343563079834, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8620328307151794, "num_tokens": 579420230.0, "step": 15191 }, { "epoch": 1.9325785523470298, "ewc_loss": 0.06331989169120789, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030360909295268357, "grad_norm": 7.584975242614746, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8636720180511475, "num_tokens": 579458236.0, "step": 15192 }, { "epoch": 1.93270576262562, "ewc_loss": 0.06401532143354416, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003081219911109656, "grad_norm": 7.809115886688232, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8623067140579224, "num_tokens": 579495561.0, "step": 15193 }, { "epoch": 1.9328329729042106, "ewc_loss": 0.06332476437091827, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003036577545572072, "grad_norm": 7.618082046508789, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8616516590118408, "num_tokens": 579530225.0, "step": 15194 }, { "epoch": 1.9329601831828012, "ewc_loss": 0.06366542726755142, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003070644452236593, "grad_norm": 7.765047550201416, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8542964458465576, "num_tokens": 579570143.0, "step": 15195 }, { "epoch": 1.9330873934613917, "ewc_loss": 0.06333228200674057, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003037329588551074, "grad_norm": 7.528041362762451, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8771277070045471, "num_tokens": 579607105.0, "step": 15196 }, { "epoch": 1.9332146037399822, "ewc_loss": 0.06375017762184143, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030791189055889845, "grad_norm": 7.826548099517822, "learning_rate": 1e-06, "loss": 0.4093, "mean_token_accuracy": 0.879511296749115, "num_tokens": 579640018.0, "step": 15197 }, { "epoch": 1.9333418140185727, "ewc_loss": 0.06328398734331131, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030325003899633884, "grad_norm": 7.528421878814697, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8584291934967041, "num_tokens": 579681250.0, "step": 15198 }, { "epoch": 1.933469024297163, "ewc_loss": 0.06386353075504303, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030904545565135777, "grad_norm": 7.879549026489258, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8540587425231934, "num_tokens": 579718535.0, "step": 15199 }, { "epoch": 1.9335962345757536, "ewc_loss": 0.06311774253845215, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003015876282006502, "grad_norm": 7.498345851898193, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8510814309120178, "num_tokens": 579756959.0, "step": 15200 }, { "epoch": 1.933723444854344, "ewc_loss": 0.06399594247341156, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003103695926256478, "grad_norm": 7.9460859298706055, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8544027805328369, "num_tokens": 579798266.0, "step": 15201 }, { "epoch": 1.9338506551329346, "ewc_loss": 0.06309804320335388, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030139056616462767, "grad_norm": 7.492689609527588, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8820620775222778, "num_tokens": 579831869.0, "step": 15202 }, { "epoch": 1.9339778654115252, "ewc_loss": 0.06434659659862518, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003114347346127033, "grad_norm": 7.840577125549316, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8655201196670532, "num_tokens": 579874168.0, "step": 15203 }, { "epoch": 1.9341050756901157, "ewc_loss": 0.06351108849048615, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003030796069651842, "grad_norm": 7.523810386657715, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8685557842254639, "num_tokens": 579916313.0, "step": 15204 }, { "epoch": 1.9342322859687062, "ewc_loss": 0.06421881914138794, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031015693093650043, "grad_norm": 7.698322296142578, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8749412298202515, "num_tokens": 579959022.0, "step": 15205 }, { "epoch": 1.9343594962472968, "ewc_loss": 0.06348617374897003, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030527188209816813, "grad_norm": 7.686243057250977, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8490526676177979, "num_tokens": 580003675.0, "step": 15206 }, { "epoch": 1.9344867065258873, "ewc_loss": 0.06362712383270264, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030668138060718775, "grad_norm": 7.600412845611572, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8733497262001038, "num_tokens": 580044384.0, "step": 15207 }, { "epoch": 1.9346139168044778, "ewc_loss": 0.06378057599067688, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030821593827567995, "grad_norm": 7.752140045166016, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8527063131332397, "num_tokens": 580078955.0, "step": 15208 }, { "epoch": 1.9347411270830683, "ewc_loss": 0.06338971853256226, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003043073520530015, "grad_norm": 7.6516194343566895, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.860338568687439, "num_tokens": 580116793.0, "step": 15209 }, { "epoch": 1.9348683373616589, "ewc_loss": 0.06374192237854004, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030782935209572315, "grad_norm": 7.6570539474487305, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8628056049346924, "num_tokens": 580153622.0, "step": 15210 }, { "epoch": 1.9349955476402494, "ewc_loss": 0.06340570002794266, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000304467132082209, "grad_norm": 7.63325834274292, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8610998392105103, "num_tokens": 580195287.0, "step": 15211 }, { "epoch": 1.93512275791884, "ewc_loss": 0.06359849870204926, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030639514443464577, "grad_norm": 7.660737037658691, "learning_rate": 1e-06, "loss": 0.5125, "mean_token_accuracy": 0.8502670526504517, "num_tokens": 580229601.0, "step": 15212 }, { "epoch": 1.9352499681974304, "ewc_loss": 0.06352028250694275, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003056130080949515, "grad_norm": 7.60783052444458, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8673625588417053, "num_tokens": 580268079.0, "step": 15213 }, { "epoch": 1.935377178476021, "ewc_loss": 0.06368334591388702, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003072436375077814, "grad_norm": 7.722898483276367, "learning_rate": 1e-06, "loss": 0.5339, "mean_token_accuracy": 0.8459073305130005, "num_tokens": 580305379.0, "step": 15214 }, { "epoch": 1.9355043887546115, "ewc_loss": 0.06345593929290771, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030496958061121404, "grad_norm": 7.691248893737793, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8618944883346558, "num_tokens": 580338438.0, "step": 15215 }, { "epoch": 1.935631599033202, "ewc_loss": 0.06369461119174957, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003073562402278185, "grad_norm": 7.723362445831299, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8631649613380432, "num_tokens": 580370017.0, "step": 15216 }, { "epoch": 1.9357588093117923, "ewc_loss": 0.06337659806013107, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003041761228814721, "grad_norm": 7.593655586242676, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8750742673873901, "num_tokens": 580409572.0, "step": 15217 }, { "epoch": 1.9358860195903829, "ewc_loss": 0.063723623752594, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003076464345213026, "grad_norm": 7.679958820343018, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8683208227157593, "num_tokens": 580448481.0, "step": 15218 }, { "epoch": 1.9360132298689734, "ewc_loss": 0.06349337100982666, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003053438267670572, "grad_norm": 7.616042613983154, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8756493330001831, "num_tokens": 580481919.0, "step": 15219 }, { "epoch": 1.936140440147564, "ewc_loss": 0.06358636915683746, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003062738396693021, "grad_norm": 7.6483154296875, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8583850860595703, "num_tokens": 580520758.0, "step": 15220 }, { "epoch": 1.9362676504261545, "ewc_loss": 0.0635761171579361, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003061713359784335, "grad_norm": 7.643385887145996, "learning_rate": 1e-06, "loss": 0.5109, "mean_token_accuracy": 0.8553539514541626, "num_tokens": 580559839.0, "step": 15221 }, { "epoch": 1.936394860704745, "ewc_loss": 0.06358179450035095, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030622805934399366, "grad_norm": 7.6643781661987305, "learning_rate": 1e-06, "loss": 0.3852, "mean_token_accuracy": 0.8903153538703918, "num_tokens": 580591767.0, "step": 15222 }, { "epoch": 1.9365220709833355, "ewc_loss": 0.06344948709011078, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030490499921143055, "grad_norm": 7.614319324493408, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.87204909324646, "num_tokens": 580633180.0, "step": 15223 }, { "epoch": 1.9366492812619258, "ewc_loss": 0.06354598701000214, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030586999491788447, "grad_norm": 7.629778861999512, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8657537698745728, "num_tokens": 580677045.0, "step": 15224 }, { "epoch": 1.9367764915405163, "ewc_loss": 0.06346981227397919, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003051083185710013, "grad_norm": 7.5660529136657715, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8769432902336121, "num_tokens": 580717423.0, "step": 15225 }, { "epoch": 1.9369037018191069, "ewc_loss": 0.06363388150930405, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003067489597015083, "grad_norm": 7.697990417480469, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8676738739013672, "num_tokens": 580756677.0, "step": 15226 }, { "epoch": 1.9370309120976974, "ewc_loss": 0.06342381983995438, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003046483325306326, "grad_norm": 7.617538928985596, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.868105411529541, "num_tokens": 580796993.0, "step": 15227 }, { "epoch": 1.937158122376288, "ewc_loss": 0.06367005407810211, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000307110691210255, "grad_norm": 7.665655136108398, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8660726547241211, "num_tokens": 580834616.0, "step": 15228 }, { "epoch": 1.9372853326548785, "ewc_loss": 0.06357359886169434, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003061461611650884, "grad_norm": 7.631377696990967, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8657568693161011, "num_tokens": 580873718.0, "step": 15229 }, { "epoch": 1.937412542933469, "ewc_loss": 0.06359902769327164, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030640041222795844, "grad_norm": 7.671360015869141, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8601313829421997, "num_tokens": 580911678.0, "step": 15230 }, { "epoch": 1.9375397532120595, "ewc_loss": 0.06348717212677002, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003052819229196757, "grad_norm": 7.558804988861084, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8695608377456665, "num_tokens": 580953282.0, "step": 15231 }, { "epoch": 1.93766696349065, "ewc_loss": 0.06381672620773315, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030857743695378304, "grad_norm": 7.733234882354736, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8686855435371399, "num_tokens": 580997954.0, "step": 15232 }, { "epoch": 1.9377941737692406, "ewc_loss": 0.06335878372192383, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000303997949231416, "grad_norm": 7.574581623077393, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8647201061248779, "num_tokens": 581035623.0, "step": 15233 }, { "epoch": 1.937921384047831, "ewc_loss": 0.06375258415937424, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003079360176343471, "grad_norm": 7.721678733825684, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8742631077766418, "num_tokens": 581068727.0, "step": 15234 }, { "epoch": 1.9380485943264216, "ewc_loss": 0.06339292228221893, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030433936626650393, "grad_norm": 7.598581790924072, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8632773160934448, "num_tokens": 581111180.0, "step": 15235 }, { "epoch": 1.9381758046050122, "ewc_loss": 0.06330232322216034, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030831623007543385, "grad_norm": 7.6802191734313965, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8447502255439758, "num_tokens": 581152373.0, "step": 15236 }, { "epoch": 1.9383030148836027, "ewc_loss": 0.06345269083976746, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030493707163259387, "grad_norm": 7.589849472045898, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8619598150253296, "num_tokens": 581195854.0, "step": 15237 }, { "epoch": 1.9384302251621932, "ewc_loss": 0.06335566937923431, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003088496741838753, "grad_norm": 7.710628032684326, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8359411358833313, "num_tokens": 581236414.0, "step": 15238 }, { "epoch": 1.9385574354407837, "ewc_loss": 0.06347858160734177, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000305195979308337, "grad_norm": 7.607770919799805, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8485965132713318, "num_tokens": 581275349.0, "step": 15239 }, { "epoch": 1.9386846457193743, "ewc_loss": 0.06322338432073593, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003075268177781254, "grad_norm": 7.631479740142822, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8707460165023804, "num_tokens": 581313231.0, "step": 15240 }, { "epoch": 1.9388118559979648, "ewc_loss": 0.06321962922811508, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030748924473300576, "grad_norm": 7.628946304321289, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8639348745346069, "num_tokens": 581355295.0, "step": 15241 }, { "epoch": 1.938939066276555, "ewc_loss": 0.06318112462759018, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003071042010560632, "grad_norm": 7.603569030761719, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8677258491516113, "num_tokens": 581392633.0, "step": 15242 }, { "epoch": 1.9390662765551456, "ewc_loss": 0.06375196576118469, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030792984762229025, "grad_norm": 7.6759033203125, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8774306774139404, "num_tokens": 581425552.0, "step": 15243 }, { "epoch": 1.9391934868337362, "ewc_loss": 0.06365428119897842, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030695297755301, "grad_norm": 7.553659439086914, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8504866361618042, "num_tokens": 581467707.0, "step": 15244 }, { "epoch": 1.9393206971123267, "ewc_loss": 0.06392236053943634, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030963373137637973, "grad_norm": 7.670542240142822, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.870339035987854, "num_tokens": 581504888.0, "step": 15245 }, { "epoch": 1.9394479073909172, "ewc_loss": 0.06314957141876221, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030678865732625127, "grad_norm": 7.6312432289123535, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8634326457977295, "num_tokens": 581537800.0, "step": 15246 }, { "epoch": 1.9395751176695077, "ewc_loss": 0.06342598795890808, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030955282272771, "grad_norm": 7.651054382324219, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8842437267303467, "num_tokens": 581570594.0, "step": 15247 }, { "epoch": 1.939702327948098, "ewc_loss": 0.0632091611623764, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030738458735868335, "grad_norm": 7.6164679527282715, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8562297821044922, "num_tokens": 581610764.0, "step": 15248 }, { "epoch": 1.9398295382266886, "ewc_loss": 0.06375717371702194, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030798191437497735, "grad_norm": 7.6060895919799805, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8639586567878723, "num_tokens": 581652341.0, "step": 15249 }, { "epoch": 1.939956748505279, "ewc_loss": 0.06376048922538757, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030801500543020666, "grad_norm": 7.676165580749512, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8564494848251343, "num_tokens": 581689978.0, "step": 15250 }, { "epoch": 1.9400839587838696, "ewc_loss": 0.06324010342359543, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030769401928409934, "grad_norm": 7.641420841217041, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8736231327056885, "num_tokens": 581728811.0, "step": 15251 }, { "epoch": 1.9402111690624602, "ewc_loss": 0.06332629919052124, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003085559292230755, "grad_norm": 7.640780925750732, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8738065361976624, "num_tokens": 581768016.0, "step": 15252 }, { "epoch": 1.9403383793410507, "ewc_loss": 0.06318224966526031, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003071154351346195, "grad_norm": 7.693263530731201, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.866901159286499, "num_tokens": 581805405.0, "step": 15253 }, { "epoch": 1.9404655896196412, "ewc_loss": 0.06383301317691803, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000308740243781358, "grad_norm": 7.677779197692871, "learning_rate": 1e-06, "loss": 0.5236, "mean_token_accuracy": 0.8437197208404541, "num_tokens": 581845302.0, "step": 15254 }, { "epoch": 1.9405927998982317, "ewc_loss": 0.06383591145277023, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030876926030032337, "grad_norm": 7.70429801940918, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8724815845489502, "num_tokens": 581875391.0, "step": 15255 }, { "epoch": 1.9407200101768223, "ewc_loss": 0.06398099660873413, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030777871143072844, "grad_norm": 7.646382808685303, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8630486130714417, "num_tokens": 581916580.0, "step": 15256 }, { "epoch": 1.9408472204554128, "ewc_loss": 0.063791923224926, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030832941411063075, "grad_norm": 7.608985424041748, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8659865856170654, "num_tokens": 581958539.0, "step": 15257 }, { "epoch": 1.9409744307340033, "ewc_loss": 0.06380947679281235, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003085049393121153, "grad_norm": 7.628779411315918, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8626595735549927, "num_tokens": 581996981.0, "step": 15258 }, { "epoch": 1.9411016410125939, "ewc_loss": 0.06381198018789291, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003085299686063081, "grad_norm": 7.6692938804626465, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.873092770576477, "num_tokens": 582036068.0, "step": 15259 }, { "epoch": 1.9412288512911844, "ewc_loss": 0.06378835439682007, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030829370371066034, "grad_norm": 7.628320693969727, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8654440641403198, "num_tokens": 582073322.0, "step": 15260 }, { "epoch": 1.941356061569775, "ewc_loss": 0.06385481357574463, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003089582605753094, "grad_norm": 7.69933557510376, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8670477867126465, "num_tokens": 582110736.0, "step": 15261 }, { "epoch": 1.9414832718483654, "ewc_loss": 0.0637582391500473, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003079925663769245, "grad_norm": 7.671427249908447, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8472633957862854, "num_tokens": 582153431.0, "step": 15262 }, { "epoch": 1.941610482126956, "ewc_loss": 0.06391216069459915, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030953175155445933, "grad_norm": 7.732025623321533, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.845482349395752, "num_tokens": 582191793.0, "step": 15263 }, { "epoch": 1.9417376924055465, "ewc_loss": 0.0635538324713707, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003059484588447958, "grad_norm": 7.59556770324707, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8545079231262207, "num_tokens": 582227923.0, "step": 15264 }, { "epoch": 1.941864902684137, "ewc_loss": 0.06390426307916641, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030945276375859976, "grad_norm": 7.667806625366211, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.847745418548584, "num_tokens": 582268029.0, "step": 15265 }, { "epoch": 1.9419921129627273, "ewc_loss": 0.0635816752910614, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030622686608694494, "grad_norm": 7.650944709777832, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8579176664352417, "num_tokens": 582302173.0, "step": 15266 }, { "epoch": 1.9421193232413179, "ewc_loss": 0.06369969993829727, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003074071719311178, "grad_norm": 7.638472080230713, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8796229362487793, "num_tokens": 582341642.0, "step": 15267 }, { "epoch": 1.9422465335199084, "ewc_loss": 0.06377755105495453, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003081856411881745, "grad_norm": 7.739140510559082, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8666613101959229, "num_tokens": 582378960.0, "step": 15268 }, { "epoch": 1.942373743798499, "ewc_loss": 0.06356868892908096, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003060970630031079, "grad_norm": 7.641237258911133, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8710826635360718, "num_tokens": 582422149.0, "step": 15269 }, { "epoch": 1.9425009540770894, "ewc_loss": 0.06371728330850601, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003075829881709069, "grad_norm": 7.647227764129639, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8493625521659851, "num_tokens": 582462657.0, "step": 15270 }, { "epoch": 1.94262816435568, "ewc_loss": 0.06360086053609848, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030641877674497664, "grad_norm": 7.611738681793213, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8762660026550293, "num_tokens": 582499264.0, "step": 15271 }, { "epoch": 1.9427553746342705, "ewc_loss": 0.06369980424642563, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030740819056518376, "grad_norm": 7.590644836425781, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8670117259025574, "num_tokens": 582534744.0, "step": 15272 }, { "epoch": 1.9428825849128608, "ewc_loss": 0.06316626071929932, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030695556779392064, "grad_norm": 7.6136040687561035, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8593758344650269, "num_tokens": 582577010.0, "step": 15273 }, { "epoch": 1.9430097951914513, "ewc_loss": 0.06330125033855438, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003083054907619953, "grad_norm": 7.640475749969482, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8579993844032288, "num_tokens": 582617103.0, "step": 15274 }, { "epoch": 1.9431370054700419, "ewc_loss": 0.063739113509655, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003078012668993324, "grad_norm": 7.6009697914123535, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8569228649139404, "num_tokens": 582661754.0, "step": 15275 }, { "epoch": 1.9432642157486324, "ewc_loss": 0.06329670548439026, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030826000147499144, "grad_norm": 7.837770462036133, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.865598201751709, "num_tokens": 582701701.0, "step": 15276 }, { "epoch": 1.943391426027223, "ewc_loss": 0.06348253041505814, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003052354441024363, "grad_norm": 7.568875789642334, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8558386564254761, "num_tokens": 582741977.0, "step": 15277 }, { "epoch": 1.9435186363058135, "ewc_loss": 0.06397590786218643, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031016924185678363, "grad_norm": 7.641327381134033, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8738343119621277, "num_tokens": 582783420.0, "step": 15278 }, { "epoch": 1.943645846584404, "ewc_loss": 0.06355884671211243, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003059986047446728, "grad_norm": 7.619096279144287, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8717451691627502, "num_tokens": 582819464.0, "step": 15279 }, { "epoch": 1.9437730568629945, "ewc_loss": 0.06381665915250778, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030857676756568253, "grad_norm": 7.6232523918151855, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8687264323234558, "num_tokens": 582860757.0, "step": 15280 }, { "epoch": 1.943900267141585, "ewc_loss": 0.06378498673439026, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003082600305788219, "grad_norm": 7.590704441070557, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.858826756477356, "num_tokens": 582910300.0, "step": 15281 }, { "epoch": 1.9440274774201756, "ewc_loss": 0.06386971473693848, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003091073303949088, "grad_norm": 7.687983512878418, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8569371104240417, "num_tokens": 582956333.0, "step": 15282 }, { "epoch": 1.944154687698766, "ewc_loss": 0.06370081752538681, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003074183186981827, "grad_norm": 7.656729698181152, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8729225397109985, "num_tokens": 582994068.0, "step": 15283 }, { "epoch": 1.9442818979773566, "ewc_loss": 0.06388001143932343, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003092102997470647, "grad_norm": 7.728361129760742, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8623560667037964, "num_tokens": 583027829.0, "step": 15284 }, { "epoch": 1.9444091082559471, "ewc_loss": 0.06323385238647461, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003076314751524478, "grad_norm": 7.559933185577393, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8620297312736511, "num_tokens": 583068881.0, "step": 15285 }, { "epoch": 1.9445363185345377, "ewc_loss": 0.06345415115356445, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003098344604950398, "grad_norm": 7.715142726898193, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8671262860298157, "num_tokens": 583100097.0, "step": 15286 }, { "epoch": 1.9446635288131282, "ewc_loss": 0.06321671605110168, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00030746011179871857, "grad_norm": 7.623235702514648, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.863625168800354, "num_tokens": 583135340.0, "step": 15287 }, { "epoch": 1.9447907390917187, "ewc_loss": 0.06388436257839203, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003092537517659366, "grad_norm": 7.644711971282959, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.863050639629364, "num_tokens": 583172032.0, "step": 15288 }, { "epoch": 1.9449179493703093, "ewc_loss": 0.06373294442892075, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030773962498642504, "grad_norm": 7.610983848571777, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8680498003959656, "num_tokens": 583205244.0, "step": 15289 }, { "epoch": 1.9450451596488998, "ewc_loss": 0.0638383999466896, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030879417317919433, "grad_norm": 7.688838005065918, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8595601320266724, "num_tokens": 583247537.0, "step": 15290 }, { "epoch": 1.94517236992749, "ewc_loss": 0.06377436220645905, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003081538015976548, "grad_norm": 7.642892837524414, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8580227494239807, "num_tokens": 583285349.0, "step": 15291 }, { "epoch": 1.9452995802060806, "ewc_loss": 0.06378692388534546, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030827935552224517, "grad_norm": 7.6645379066467285, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8660751581192017, "num_tokens": 583323810.0, "step": 15292 }, { "epoch": 1.9454267904846712, "ewc_loss": 0.06371136754751205, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030752382008358836, "grad_norm": 7.563895225524902, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8761439323425293, "num_tokens": 583354244.0, "step": 15293 }, { "epoch": 1.9455540007632617, "ewc_loss": 0.06398200988769531, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031023030169308186, "grad_norm": 7.657370567321777, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8781998157501221, "num_tokens": 583391764.0, "step": 15294 }, { "epoch": 1.9456812110418522, "ewc_loss": 0.06373288482427597, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030773901380598545, "grad_norm": 7.561843395233154, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8517694473266602, "num_tokens": 583426305.0, "step": 15295 }, { "epoch": 1.9458084213204427, "ewc_loss": 0.06399647146463394, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003103748895227909, "grad_norm": 7.613457202911377, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8600577116012573, "num_tokens": 583465252.0, "step": 15296 }, { "epoch": 1.945935631599033, "ewc_loss": 0.06376393139362335, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003080494934692979, "grad_norm": 7.605984687805176, "learning_rate": 1e-06, "loss": 0.4937, "mean_token_accuracy": 0.8552851676940918, "num_tokens": 583510001.0, "step": 15297 }, { "epoch": 1.9460628418776236, "ewc_loss": 0.06392629444599152, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030967313796281815, "grad_norm": 7.653884410858154, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8655520081520081, "num_tokens": 583547380.0, "step": 15298 }, { "epoch": 1.946190052156214, "ewc_loss": 0.06384797394275665, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000308889924781397, "grad_norm": 7.57987117767334, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8653833866119385, "num_tokens": 583588852.0, "step": 15299 }, { "epoch": 1.9463172624348046, "ewc_loss": 0.06390312314033508, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003094413841608912, "grad_norm": 7.586426258087158, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.870454728603363, "num_tokens": 583634006.0, "step": 15300 }, { "epoch": 1.9464444727133952, "ewc_loss": 0.06389494240283966, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030935954418964684, "grad_norm": 7.5845208168029785, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8492478728294373, "num_tokens": 583679430.0, "step": 15301 }, { "epoch": 1.9465716829919857, "ewc_loss": 0.06390321254730225, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003094423154834658, "grad_norm": 7.629487991333008, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8697262406349182, "num_tokens": 583708856.0, "step": 15302 }, { "epoch": 1.9466988932705762, "ewc_loss": 0.06381747871637344, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003085849166382104, "grad_norm": 7.601869106292725, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8622381687164307, "num_tokens": 583750737.0, "step": 15303 }, { "epoch": 1.9468261035491667, "ewc_loss": 0.06398636847734451, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031027384102344513, "grad_norm": 7.590219974517822, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8673975467681885, "num_tokens": 583786342.0, "step": 15304 }, { "epoch": 1.9469533138277573, "ewc_loss": 0.06405754387378693, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031098563340492547, "grad_norm": 7.613816261291504, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8696084022521973, "num_tokens": 583821681.0, "step": 15305 }, { "epoch": 1.9470805241063478, "ewc_loss": 0.06386473774909973, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003090575337409973, "grad_norm": 7.655114650726318, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.845143735408783, "num_tokens": 583855640.0, "step": 15306 }, { "epoch": 1.9472077343849383, "ewc_loss": 0.06404353678226471, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003108454984612763, "grad_norm": 7.6465535163879395, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.873536229133606, "num_tokens": 583888047.0, "step": 15307 }, { "epoch": 1.9473349446635289, "ewc_loss": 0.06396286934614182, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000310038827592507, "grad_norm": 7.596404075622559, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8634907007217407, "num_tokens": 583923220.0, "step": 15308 }, { "epoch": 1.9474621549421194, "ewc_loss": 0.06427577883005142, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003107265511061996, "grad_norm": 7.662563800811768, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8591819405555725, "num_tokens": 583965718.0, "step": 15309 }, { "epoch": 1.94758936522071, "ewc_loss": 0.06424832344055176, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031045195646584034, "grad_norm": 7.527908802032471, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8687682151794434, "num_tokens": 584014027.0, "step": 15310 }, { "epoch": 1.9477165754993004, "ewc_loss": 0.06417903304100037, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031220045639202, "grad_norm": 7.755763053894043, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8662247657775879, "num_tokens": 584045019.0, "step": 15311 }, { "epoch": 1.947843785777891, "ewc_loss": 0.06408679485321045, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003088366938754916, "grad_norm": 7.568426609039307, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8477494120597839, "num_tokens": 584086839.0, "step": 15312 }, { "epoch": 1.9479709960564815, "ewc_loss": 0.06448725610971451, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003128412936348468, "grad_norm": 7.682529926300049, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8786978721618652, "num_tokens": 584123785.0, "step": 15313 }, { "epoch": 1.948098206335072, "ewc_loss": 0.06410013884305954, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030897013493813574, "grad_norm": 7.619062423706055, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8551750183105469, "num_tokens": 584163768.0, "step": 15314 }, { "epoch": 1.9482254166136623, "ewc_loss": 0.06434980779886246, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003114668361376971, "grad_norm": 7.695015907287598, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8598092794418335, "num_tokens": 584201668.0, "step": 15315 }, { "epoch": 1.9483526268922529, "ewc_loss": 0.06418463587760925, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003098151064477861, "grad_norm": 7.606271266937256, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8719597458839417, "num_tokens": 584236626.0, "step": 15316 }, { "epoch": 1.9484798371708434, "ewc_loss": 0.06424272060394287, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003103959606960416, "grad_norm": 7.626573085784912, "learning_rate": 1e-06, "loss": 0.5187, "mean_token_accuracy": 0.8477631211280823, "num_tokens": 584279811.0, "step": 15317 }, { "epoch": 1.948607047449434, "ewc_loss": 0.06417115032672882, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003096802392974496, "grad_norm": 7.589330673217773, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8649582862854004, "num_tokens": 584317314.0, "step": 15318 }, { "epoch": 1.9487342577280244, "ewc_loss": 0.06433604657649994, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003113292041234672, "grad_norm": 7.660524845123291, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8687289953231812, "num_tokens": 584355768.0, "step": 15319 }, { "epoch": 1.948861468006615, "ewc_loss": 0.06422276794910431, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003101964248344302, "grad_norm": 7.634530067443848, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8596593141555786, "num_tokens": 584386205.0, "step": 15320 }, { "epoch": 1.9489886782852053, "ewc_loss": 0.06424535065889359, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031042227055877447, "grad_norm": 7.608865261077881, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8755026459693909, "num_tokens": 584419721.0, "step": 15321 }, { "epoch": 1.9491158885637958, "ewc_loss": 0.06425215303897858, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003104902571067214, "grad_norm": 7.659815311431885, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8620367050170898, "num_tokens": 584451029.0, "step": 15322 }, { "epoch": 1.9492430988423863, "ewc_loss": 0.06418067961931229, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003097755543421954, "grad_norm": 7.598532676696777, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.877876877784729, "num_tokens": 584483991.0, "step": 15323 }, { "epoch": 1.9493703091209769, "ewc_loss": 0.06429103016853333, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003108790551777929, "grad_norm": 7.659012794494629, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8622827529907227, "num_tokens": 584517652.0, "step": 15324 }, { "epoch": 1.9494975193995674, "ewc_loss": 0.06416729092597961, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003096417058259249, "grad_norm": 7.559708595275879, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8726822137832642, "num_tokens": 584561058.0, "step": 15325 }, { "epoch": 1.949624729678158, "ewc_loss": 0.0643826350569725, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031179512734524906, "grad_norm": 7.662467956542969, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8667742013931274, "num_tokens": 584595097.0, "step": 15326 }, { "epoch": 1.9497519399567484, "ewc_loss": 0.06413140892982483, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003092828264925629, "grad_norm": 7.585540294647217, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8714765310287476, "num_tokens": 584639073.0, "step": 15327 }, { "epoch": 1.949879150235339, "ewc_loss": 0.06444154679775238, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003123842179775238, "grad_norm": 7.671723365783691, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8547204732894897, "num_tokens": 584680517.0, "step": 15328 }, { "epoch": 1.9500063605139295, "ewc_loss": 0.0641489326953888, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030945808975957334, "grad_norm": 7.563429355621338, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8634681701660156, "num_tokens": 584717796.0, "step": 15329 }, { "epoch": 1.95013357079252, "ewc_loss": 0.06435421854257584, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031151092844083905, "grad_norm": 7.665395736694336, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.876977801322937, "num_tokens": 584755595.0, "step": 15330 }, { "epoch": 1.9502607810711106, "ewc_loss": 0.06414943933486938, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030946312472224236, "grad_norm": 7.582192420959473, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8531271815299988, "num_tokens": 584798681.0, "step": 15331 }, { "epoch": 1.950387991349701, "ewc_loss": 0.06428089737892151, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003107776865363121, "grad_norm": 7.624422550201416, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8658239245414734, "num_tokens": 584835424.0, "step": 15332 }, { "epoch": 1.9505152016282916, "ewc_loss": 0.06414136290550232, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003093823615927249, "grad_norm": 7.586158275604248, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8773924112319946, "num_tokens": 584872063.0, "step": 15333 }, { "epoch": 1.9506424119068821, "ewc_loss": 0.06438274681568146, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003117961750831455, "grad_norm": 7.663562774658203, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8799144625663757, "num_tokens": 584905850.0, "step": 15334 }, { "epoch": 1.9507696221854727, "ewc_loss": 0.06409694254398346, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030893817893229425, "grad_norm": 7.551441669464111, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8727431297302246, "num_tokens": 584946480.0, "step": 15335 }, { "epoch": 1.9508968324640632, "ewc_loss": 0.06437987089157104, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031176742049865425, "grad_norm": 7.698165416717529, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8703705072402954, "num_tokens": 584977382.0, "step": 15336 }, { "epoch": 1.9510240427426537, "ewc_loss": 0.06405776739120483, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030854641227051616, "grad_norm": 7.542569160461426, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8667882680892944, "num_tokens": 585010885.0, "step": 15337 }, { "epoch": 1.9511512530212443, "ewc_loss": 0.06435425579547882, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031151127768680453, "grad_norm": 7.6929497718811035, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8609488606452942, "num_tokens": 585048208.0, "step": 15338 }, { "epoch": 1.9512784632998348, "ewc_loss": 0.06399773061275482, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003079460875596851, "grad_norm": 7.532405376434326, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.85954749584198, "num_tokens": 585087477.0, "step": 15339 }, { "epoch": 1.951405673578425, "ewc_loss": 0.06431940197944641, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003111627302132547, "grad_norm": 7.656892776489258, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8622741103172302, "num_tokens": 585120860.0, "step": 15340 }, { "epoch": 1.9515328838570156, "ewc_loss": 0.06399234384298325, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030789218726567924, "grad_norm": 7.521069526672363, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8663831353187561, "num_tokens": 585161887.0, "step": 15341 }, { "epoch": 1.9516600941356061, "ewc_loss": 0.06433729082345963, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003113416605629027, "grad_norm": 7.621145248413086, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8725838661193848, "num_tokens": 585200496.0, "step": 15342 }, { "epoch": 1.9517873044141967, "ewc_loss": 0.0640517920255661, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003084866621065885, "grad_norm": 7.543165683746338, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.865559458732605, "num_tokens": 585243976.0, "step": 15343 }, { "epoch": 1.9519145146927872, "ewc_loss": 0.06434912979602814, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003114600549452007, "grad_norm": 7.654560565948486, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.859556257724762, "num_tokens": 585282366.0, "step": 15344 }, { "epoch": 1.9520417249713777, "ewc_loss": 0.06410791724920273, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030904790037311614, "grad_norm": 7.600831985473633, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8638283610343933, "num_tokens": 585325224.0, "step": 15345 }, { "epoch": 1.952168935249968, "ewc_loss": 0.0643426850438118, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003113956190645695, "grad_norm": 7.6451311111450195, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8743304014205933, "num_tokens": 585361954.0, "step": 15346 }, { "epoch": 1.9522961455285586, "ewc_loss": 0.06416833400726318, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030965206678956747, "grad_norm": 7.546432018280029, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8619832396507263, "num_tokens": 585406033.0, "step": 15347 }, { "epoch": 1.952423355807149, "ewc_loss": 0.06423963606357574, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031036511063575745, "grad_norm": 7.657932281494141, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8645339012145996, "num_tokens": 585440457.0, "step": 15348 }, { "epoch": 1.9525505660857396, "ewc_loss": 0.06416769325733185, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003096457221545279, "grad_norm": 7.627089500427246, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8874971866607666, "num_tokens": 585473451.0, "step": 15349 }, { "epoch": 1.9526777763643302, "ewc_loss": 0.06428372114896774, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003108059463556856, "grad_norm": 7.661405086517334, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.868553638458252, "num_tokens": 585509652.0, "step": 15350 }, { "epoch": 1.9528049866429207, "ewc_loss": 0.06419794261455536, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030994819826446474, "grad_norm": 7.5762248039245605, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8690075874328613, "num_tokens": 585545591.0, "step": 15351 }, { "epoch": 1.9529321969215112, "ewc_loss": 0.06427948921918869, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031076365849003196, "grad_norm": 7.94510555267334, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.852980375289917, "num_tokens": 585583757.0, "step": 15352 }, { "epoch": 1.9530594072001017, "ewc_loss": 0.0636686384677887, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003070965176448226, "grad_norm": 7.509487152099609, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8534754514694214, "num_tokens": 585623897.0, "step": 15353 }, { "epoch": 1.9531866174786923, "ewc_loss": 0.06424686312675476, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003128787502646446, "grad_norm": 7.6900763511657715, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8669518232345581, "num_tokens": 585661043.0, "step": 15354 }, { "epoch": 1.9533138277572828, "ewc_loss": 0.0637158751487732, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003075688728131354, "grad_norm": 7.589896202087402, "learning_rate": 1e-06, "loss": 0.5391, "mean_token_accuracy": 0.8379248380661011, "num_tokens": 585699201.0, "step": 15355 }, { "epoch": 1.9534410380358733, "ewc_loss": 0.06417959928512573, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003122061025351286, "grad_norm": 7.674926280975342, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8573580980300903, "num_tokens": 585735325.0, "step": 15356 }, { "epoch": 1.9535682483144639, "ewc_loss": 0.06364119797945023, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030926355975680053, "grad_norm": 7.6018195152282715, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8723527193069458, "num_tokens": 585769909.0, "step": 15357 }, { "epoch": 1.9536954585930544, "ewc_loss": 0.06428277492523193, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031079648761078715, "grad_norm": 7.7320356369018555, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8539688587188721, "num_tokens": 585804590.0, "step": 15358 }, { "epoch": 1.953822668871645, "ewc_loss": 0.06378243863582611, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030823450651951134, "grad_norm": 7.5808186531066895, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8612573146820068, "num_tokens": 585843349.0, "step": 15359 }, { "epoch": 1.9539498791502354, "ewc_loss": 0.06436236202716827, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031159239006228745, "grad_norm": 7.678191184997559, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.876911997795105, "num_tokens": 585881846.0, "step": 15360 }, { "epoch": 1.954077089428826, "ewc_loss": 0.06362690776586533, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003066792560275644, "grad_norm": 7.60570764541626, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.861744225025177, "num_tokens": 585917848.0, "step": 15361 }, { "epoch": 1.9542042997074165, "ewc_loss": 0.06404414772987366, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003108516684733331, "grad_norm": 7.6044230461120605, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.869312047958374, "num_tokens": 585961249.0, "step": 15362 }, { "epoch": 1.954331509986007, "ewc_loss": 0.0638512521982193, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003089226665906608, "grad_norm": 7.5809197425842285, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.8731520771980286, "num_tokens": 586005795.0, "step": 15363 }, { "epoch": 1.9544587202645973, "ewc_loss": 0.06417285650968552, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003096973232459277, "grad_norm": 7.645637512207031, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8739712834358215, "num_tokens": 586041523.0, "step": 15364 }, { "epoch": 1.9545859305431879, "ewc_loss": 0.06386293470859528, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030903948936611414, "grad_norm": 7.66001558303833, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8564064502716064, "num_tokens": 586077800.0, "step": 15365 }, { "epoch": 1.9547131408217784, "ewc_loss": 0.06386946886777878, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003091048274654895, "grad_norm": 7.621752738952637, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8603904247283936, "num_tokens": 586116010.0, "step": 15366 }, { "epoch": 1.954840351100369, "ewc_loss": 0.06380414962768555, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030845165019854903, "grad_norm": 7.591526508331299, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8760956525802612, "num_tokens": 586155359.0, "step": 15367 }, { "epoch": 1.9549675613789594, "ewc_loss": 0.06361623108386993, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030901390709914267, "grad_norm": 7.648307800292969, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8682431578636169, "num_tokens": 586188190.0, "step": 15368 }, { "epoch": 1.95509477165755, "ewc_loss": 0.06359385699033737, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030879012774676085, "grad_norm": 7.621281623840332, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8633726835250854, "num_tokens": 586225971.0, "step": 15369 }, { "epoch": 1.9552219819361403, "ewc_loss": 0.06365995854139328, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030945116304792464, "grad_norm": 7.646515846252441, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8602336645126343, "num_tokens": 586263295.0, "step": 15370 }, { "epoch": 1.9553491922147308, "ewc_loss": 0.06348428875207901, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030769442673772573, "grad_norm": 7.636662006378174, "learning_rate": 1e-06, "loss": 0.3896, "mean_token_accuracy": 0.8856035470962524, "num_tokens": 586293547.0, "step": 15371 }, { "epoch": 1.9554764024933213, "ewc_loss": 0.06359421461820602, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000308793707517907, "grad_norm": 7.600890636444092, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8599809408187866, "num_tokens": 586333180.0, "step": 15372 }, { "epoch": 1.9556036127719119, "ewc_loss": 0.0636693686246872, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003095452848356217, "grad_norm": 7.630457401275635, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8659511208534241, "num_tokens": 586368780.0, "step": 15373 }, { "epoch": 1.9557308230505024, "ewc_loss": 0.0640093982219696, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030806270660832524, "grad_norm": 7.621901512145996, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8483113050460815, "num_tokens": 586404177.0, "step": 15374 }, { "epoch": 1.955858033329093, "ewc_loss": 0.06395217031240463, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030993184191174805, "grad_norm": 7.628547668457031, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.849785566329956, "num_tokens": 586446917.0, "step": 15375 }, { "epoch": 1.9559852436076834, "ewc_loss": 0.06388597190380096, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030926987528800964, "grad_norm": 7.652817249298096, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8650183081626892, "num_tokens": 586485832.0, "step": 15376 }, { "epoch": 1.956112453886274, "ewc_loss": 0.06396105140447617, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000310020666802302, "grad_norm": 7.6211771965026855, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8750102519989014, "num_tokens": 586524089.0, "step": 15377 }, { "epoch": 1.9562396641648645, "ewc_loss": 0.06420078873634338, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030997663270682096, "grad_norm": 7.604880332946777, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8681358695030212, "num_tokens": 586564097.0, "step": 15378 }, { "epoch": 1.956366874443455, "ewc_loss": 0.06420650333166122, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031003376352600753, "grad_norm": 7.637964248657227, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8768876194953918, "num_tokens": 586600653.0, "step": 15379 }, { "epoch": 1.9564940847220456, "ewc_loss": 0.06426387280225754, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031060745823197067, "grad_norm": 7.597870826721191, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.870414137840271, "num_tokens": 586639527.0, "step": 15380 }, { "epoch": 1.956621295000636, "ewc_loss": 0.06416867673397064, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003096555592492223, "grad_norm": 7.699128150939941, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8684307932853699, "num_tokens": 586670885.0, "step": 15381 }, { "epoch": 1.9567485052792266, "ewc_loss": 0.06411227583885193, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003090915270149708, "grad_norm": 7.619959354400635, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8774120807647705, "num_tokens": 586709875.0, "step": 15382 }, { "epoch": 1.9568757155578171, "ewc_loss": 0.06426765024662018, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031064520590007305, "grad_norm": 7.62206506729126, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8589431643486023, "num_tokens": 586753164.0, "step": 15383 }, { "epoch": 1.9570029258364077, "ewc_loss": 0.06417867541313171, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030975553090684116, "grad_norm": 7.611030578613281, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8727513551712036, "num_tokens": 586793737.0, "step": 15384 }, { "epoch": 1.9571301361149982, "ewc_loss": 0.06410203874111176, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030898916884325445, "grad_norm": 7.633768558502197, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8604135513305664, "num_tokens": 586833428.0, "step": 15385 }, { "epoch": 1.9572573463935887, "ewc_loss": 0.06424486637115479, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031041738111525774, "grad_norm": 7.755068302154541, "learning_rate": 1e-06, "loss": 0.5405, "mean_token_accuracy": 0.8417643308639526, "num_tokens": 586871202.0, "step": 15386 }, { "epoch": 1.9573845566721793, "ewc_loss": 0.06400448828935623, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003080136375501752, "grad_norm": 7.561967849731445, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8762280941009521, "num_tokens": 586905537.0, "step": 15387 }, { "epoch": 1.9575117669507698, "ewc_loss": 0.0642876923084259, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003108457021880895, "grad_norm": 7.724291801452637, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8537757396697998, "num_tokens": 586940378.0, "step": 15388 }, { "epoch": 1.95763897722936, "ewc_loss": 0.06401398777961731, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030810863245278597, "grad_norm": 7.617910861968994, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8758876919746399, "num_tokens": 586978806.0, "step": 15389 }, { "epoch": 1.9577661875079506, "ewc_loss": 0.06432180106639862, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003111867990810424, "grad_norm": 7.747457504272461, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8591548800468445, "num_tokens": 587007367.0, "step": 15390 }, { "epoch": 1.9578933977865411, "ewc_loss": 0.06392926722764969, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030726141994819045, "grad_norm": 7.538962364196777, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8878225088119507, "num_tokens": 587040422.0, "step": 15391 }, { "epoch": 1.9580206080651317, "ewc_loss": 0.06439530849456787, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031192187452688813, "grad_norm": 7.751838684082031, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8831915855407715, "num_tokens": 587070238.0, "step": 15392 }, { "epoch": 1.9581478183437222, "ewc_loss": 0.06386712938547134, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003066400531679392, "grad_norm": 7.531696796417236, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8720182180404663, "num_tokens": 587105973.0, "step": 15393 }, { "epoch": 1.9582750286223127, "ewc_loss": 0.06445930153131485, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003125617513433099, "grad_norm": 7.690424919128418, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8774405121803284, "num_tokens": 587137975.0, "step": 15394 }, { "epoch": 1.958402238900903, "ewc_loss": 0.06395138055086136, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030748257995583117, "grad_norm": 7.665926933288574, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8613709807395935, "num_tokens": 587169176.0, "step": 15395 }, { "epoch": 1.9585294491794936, "ewc_loss": 0.06427807360887527, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031074948492459953, "grad_norm": 7.634376049041748, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8623003959655762, "num_tokens": 587205570.0, "step": 15396 }, { "epoch": 1.958656659458084, "ewc_loss": 0.06412994116544724, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030926818726584315, "grad_norm": 7.588222503662109, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8569063544273376, "num_tokens": 587246729.0, "step": 15397 }, { "epoch": 1.9587838697366746, "ewc_loss": 0.06416989862918854, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030966769554652274, "grad_norm": 7.64627742767334, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.868496835231781, "num_tokens": 587284537.0, "step": 15398 }, { "epoch": 1.9589110800152651, "ewc_loss": 0.06404651701450348, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030843395506963134, "grad_norm": 7.577494144439697, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.870589017868042, "num_tokens": 587324394.0, "step": 15399 }, { "epoch": 1.9590382902938557, "ewc_loss": 0.06424971669912338, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003104658972006291, "grad_norm": 7.596258163452148, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8736572265625, "num_tokens": 587359935.0, "step": 15400 }, { "epoch": 1.9591655005724462, "ewc_loss": 0.0640992671251297, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030896137468516827, "grad_norm": 7.651183605194092, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8758783340454102, "num_tokens": 587402081.0, "step": 15401 }, { "epoch": 1.9592927108510367, "ewc_loss": 0.06412176042795181, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003091863763984293, "grad_norm": 7.596042633056641, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.862775981426239, "num_tokens": 587442939.0, "step": 15402 }, { "epoch": 1.9594199211296273, "ewc_loss": 0.06419404596090317, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003099091991316527, "grad_norm": 7.607135772705078, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8572876453399658, "num_tokens": 587479155.0, "step": 15403 }, { "epoch": 1.9595471314082178, "ewc_loss": 0.0640462189912796, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030843098647892475, "grad_norm": 7.578067779541016, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.853394091129303, "num_tokens": 587519285.0, "step": 15404 }, { "epoch": 1.9596743416868083, "ewc_loss": 0.06505804508924484, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031122498330660164, "grad_norm": 7.702093124389648, "learning_rate": 1e-06, "loss": 0.5164, "mean_token_accuracy": 0.8464264273643494, "num_tokens": 587563978.0, "step": 15405 }, { "epoch": 1.9598015519653988, "ewc_loss": 0.06407134234905243, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003086822107434273, "grad_norm": 7.645074367523193, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8591388463973999, "num_tokens": 587599052.0, "step": 15406 }, { "epoch": 1.9599287622439894, "ewc_loss": 0.06420256197452545, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003099943569395691, "grad_norm": 7.612874984741211, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8687059283256531, "num_tokens": 587639980.0, "step": 15407 }, { "epoch": 1.96005597252258, "ewc_loss": 0.06414877623319626, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003094565181527287, "grad_norm": 7.633786678314209, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8600164651870728, "num_tokens": 587673836.0, "step": 15408 }, { "epoch": 1.9601831828011704, "ewc_loss": 0.06410191208124161, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003089878591708839, "grad_norm": 7.581521034240723, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8654628992080688, "num_tokens": 587712896.0, "step": 15409 }, { "epoch": 1.960310393079761, "ewc_loss": 0.06429575383663177, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003109262906946242, "grad_norm": 7.683650970458984, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8585122227668762, "num_tokens": 587747352.0, "step": 15410 }, { "epoch": 1.9604376033583515, "ewc_loss": 0.06402231007814407, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030819186940789223, "grad_norm": 7.559973239898682, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8643445372581482, "num_tokens": 587790234.0, "step": 15411 }, { "epoch": 1.960564813636942, "ewc_loss": 0.06429135799407959, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031088231480680406, "grad_norm": 7.657287120819092, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.878849983215332, "num_tokens": 587830185.0, "step": 15412 }, { "epoch": 1.9606920239155323, "ewc_loss": 0.06401917338371277, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030816043727099895, "grad_norm": 7.612740516662598, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8659197688102722, "num_tokens": 587866265.0, "step": 15413 }, { "epoch": 1.9608192341941229, "ewc_loss": 0.06430015712976456, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003109703247901052, "grad_norm": 7.685932636260986, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8767606616020203, "num_tokens": 587904956.0, "step": 15414 }, { "epoch": 1.9609464444727134, "ewc_loss": 0.06404975056648254, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003084662603214383, "grad_norm": 7.6080217361450195, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8720354437828064, "num_tokens": 587942369.0, "step": 15415 }, { "epoch": 1.961073654751304, "ewc_loss": 0.0642457902431488, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.000310426636133343, "grad_norm": 7.653637886047363, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8694040775299072, "num_tokens": 587979340.0, "step": 15416 }, { "epoch": 1.9612008650298944, "ewc_loss": 0.06417690962553024, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030973783577792346, "grad_norm": 7.636689186096191, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8604934811592102, "num_tokens": 588021812.0, "step": 15417 }, { "epoch": 1.961328075308485, "ewc_loss": 0.06416358798742294, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030960465664975345, "grad_norm": 7.591500759124756, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8614950180053711, "num_tokens": 588064978.0, "step": 15418 }, { "epoch": 1.9614552855870753, "ewc_loss": 0.06416122615337372, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003120223991572857, "grad_norm": 7.679903984069824, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8609848022460938, "num_tokens": 588098142.0, "step": 15419 }, { "epoch": 1.9615824958656658, "ewc_loss": 0.06355127692222595, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030836433870717883, "grad_norm": 7.556535720825195, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8718035817146301, "num_tokens": 588141808.0, "step": 15420 }, { "epoch": 1.9617097061442563, "ewc_loss": 0.06417100876569748, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031212021713145077, "grad_norm": 7.682313442230225, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8712095618247986, "num_tokens": 588178137.0, "step": 15421 }, { "epoch": 1.9618369164228469, "ewc_loss": 0.0639050304889679, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003094604180660099, "grad_norm": 7.618636131286621, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8585488200187683, "num_tokens": 588215646.0, "step": 15422 }, { "epoch": 1.9619641267014374, "ewc_loss": 0.06416530907154083, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003120632900390774, "grad_norm": 7.711910247802734, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8559184074401855, "num_tokens": 588251345.0, "step": 15423 }, { "epoch": 1.962091336980028, "ewc_loss": 0.06388704478740692, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003092805854976177, "grad_norm": 7.569878578186035, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8708515167236328, "num_tokens": 588291222.0, "step": 15424 }, { "epoch": 1.9622185472586184, "ewc_loss": 0.06397251039743423, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003125766816083342, "grad_norm": 7.750299453735352, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8558012247085571, "num_tokens": 588329480.0, "step": 15425 }, { "epoch": 1.962345757537209, "ewc_loss": 0.06358403712511063, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003086919605266303, "grad_norm": 7.618340492248535, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8569235801696777, "num_tokens": 588367143.0, "step": 15426 }, { "epoch": 1.9624729678157995, "ewc_loss": 0.06391490995883942, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003120006585959345, "grad_norm": 7.6395697593688965, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8751561045646667, "num_tokens": 588408363.0, "step": 15427 }, { "epoch": 1.96260017809439, "ewc_loss": 0.0639670267701149, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031008044607006013, "grad_norm": 7.641041278839111, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8706876635551453, "num_tokens": 588445397.0, "step": 15428 }, { "epoch": 1.9627273883729806, "ewc_loss": 0.06402607262134552, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003106708754785359, "grad_norm": 7.673251628875732, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.870259165763855, "num_tokens": 588480436.0, "step": 15429 }, { "epoch": 1.962854598651571, "ewc_loss": 0.06383311748504639, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031118272454477847, "grad_norm": 7.636173725128174, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8646587133407593, "num_tokens": 588526918.0, "step": 15430 }, { "epoch": 1.9629818089301616, "ewc_loss": 0.06385721266269684, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003114236460532993, "grad_norm": 7.661085605621338, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8654078245162964, "num_tokens": 588561300.0, "step": 15431 }, { "epoch": 1.9631090192087521, "ewc_loss": 0.06373827159404755, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031023425981402397, "grad_norm": 7.603717803955078, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8567209243774414, "num_tokens": 588603393.0, "step": 15432 }, { "epoch": 1.9632362294873427, "ewc_loss": 0.06388772279024124, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003117287706118077, "grad_norm": 7.653111457824707, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8724567890167236, "num_tokens": 588643806.0, "step": 15433 }, { "epoch": 1.9633634397659332, "ewc_loss": 0.06374973803758621, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003103489289060235, "grad_norm": 7.604475975036621, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8639662265777588, "num_tokens": 588685090.0, "step": 15434 }, { "epoch": 1.9634906500445237, "ewc_loss": 0.06434033811092377, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031137210316956043, "grad_norm": 7.701910495758057, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8529713153839111, "num_tokens": 588720983.0, "step": 15435 }, { "epoch": 1.9636178603231143, "ewc_loss": 0.06393814831972122, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030979164876043797, "grad_norm": 7.594525337219238, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8593102097511292, "num_tokens": 588760143.0, "step": 15436 }, { "epoch": 1.9637450706017048, "ewc_loss": 0.06396681070327759, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031251966720446944, "grad_norm": 7.696281433105469, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8659572005271912, "num_tokens": 588796636.0, "step": 15437 }, { "epoch": 1.963872280880295, "ewc_loss": 0.06372174620628357, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003100690373685211, "grad_norm": 7.641829490661621, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8620524406433105, "num_tokens": 588831681.0, "step": 15438 }, { "epoch": 1.9639994911588856, "ewc_loss": 0.06395739316940308, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003124255163129419, "grad_norm": 7.638395309448242, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8762410283088684, "num_tokens": 588875408.0, "step": 15439 }, { "epoch": 1.9641267014374761, "ewc_loss": 0.06409169733524323, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003113271377515048, "grad_norm": 7.6414289474487305, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8696417808532715, "num_tokens": 588913583.0, "step": 15440 }, { "epoch": 1.9642539117160667, "ewc_loss": 0.0638638287782669, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003114898281637579, "grad_norm": 7.611129283905029, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8742486834526062, "num_tokens": 588949833.0, "step": 15441 }, { "epoch": 1.9643811219946572, "ewc_loss": 0.06406386196613312, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003134901635348797, "grad_norm": 7.701431751251221, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8686184883117676, "num_tokens": 588985678.0, "step": 15442 }, { "epoch": 1.9645083322732477, "ewc_loss": 0.06367963552474976, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00030964796314947307, "grad_norm": 7.633749961853027, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8631432056427002, "num_tokens": 589018276.0, "step": 15443 }, { "epoch": 1.964635542551838, "ewc_loss": 0.06402911245822906, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031314269290305674, "grad_norm": 7.664819717407227, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8628114461898804, "num_tokens": 589061900.0, "step": 15444 }, { "epoch": 1.9647627528304286, "ewc_loss": 0.0641428679227829, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003118388121947646, "grad_norm": 7.616095066070557, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8666682243347168, "num_tokens": 589102740.0, "step": 15445 }, { "epoch": 1.964889963109019, "ewc_loss": 0.06378066539764404, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003130996192339808, "grad_norm": 7.677811622619629, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8741752505302429, "num_tokens": 589135276.0, "step": 15446 }, { "epoch": 1.9650171733876096, "ewc_loss": 0.06357394903898239, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00031103246146813035, "grad_norm": 7.628687858581543, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.865237295627594, "num_tokens": 589173390.0, "step": 15447 }, { "epoch": 1.9651443836662001, "ewc_loss": 0.06411571800708771, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003140087064821273, "grad_norm": 7.716554164886475, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8580625057220459, "num_tokens": 589214592.0, "step": 15448 }, { "epoch": 1.9652715939447907, "ewc_loss": 0.06396757811307907, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000310085917590186, "grad_norm": 7.611719608306885, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8656435012817383, "num_tokens": 589250682.0, "step": 15449 }, { "epoch": 1.9653988042233812, "ewc_loss": 0.06379546225070953, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00031324755400419235, "grad_norm": 7.736067295074463, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8599101305007935, "num_tokens": 589288326.0, "step": 15450 }, { "epoch": 1.9655260145019717, "ewc_loss": 0.06404159963130951, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003108261153101921, "grad_norm": 7.646568298339844, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8620913028717041, "num_tokens": 589321632.0, "step": 15451 }, { "epoch": 1.9656532247805623, "ewc_loss": 0.06419790536165237, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003123891947325319, "grad_norm": 7.695702075958252, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8547087907791138, "num_tokens": 589361134.0, "step": 15452 }, { "epoch": 1.9657804350591528, "ewc_loss": 0.06428194046020508, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031078813481144607, "grad_norm": 7.600796699523926, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8683990836143494, "num_tokens": 589402338.0, "step": 15453 }, { "epoch": 1.9659076453377433, "ewc_loss": 0.06440357863903046, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003120045003015548, "grad_norm": 7.69870662689209, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8580897450447083, "num_tokens": 589441080.0, "step": 15454 }, { "epoch": 1.9660348556163338, "ewc_loss": 0.0642596036195755, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031056482112035155, "grad_norm": 7.590590953826904, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8627520799636841, "num_tokens": 589484019.0, "step": 15455 }, { "epoch": 1.9661620658949244, "ewc_loss": 0.06457988172769547, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031376758124679327, "grad_norm": 7.736313343048096, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8593522906303406, "num_tokens": 589525890.0, "step": 15456 }, { "epoch": 1.966289276173515, "ewc_loss": 0.06427191197872162, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031068784301169217, "grad_norm": 7.617228984832764, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8700052499771118, "num_tokens": 589560644.0, "step": 15457 }, { "epoch": 1.9664164864521054, "ewc_loss": 0.06451714038848877, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031314013176597655, "grad_norm": 7.679417133331299, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8842323422431946, "num_tokens": 589600863.0, "step": 15458 }, { "epoch": 1.966543696730696, "ewc_loss": 0.06426739692687988, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031064273207448423, "grad_norm": 7.646011829376221, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.863227128982544, "num_tokens": 589637948.0, "step": 15459 }, { "epoch": 1.9666709070092865, "ewc_loss": 0.06411918252706528, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031160199432633817, "grad_norm": 7.638349533081055, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8789492845535278, "num_tokens": 589678061.0, "step": 15460 }, { "epoch": 1.966798117287877, "ewc_loss": 0.06408777832984924, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031128796399571, "grad_norm": 7.6219801902771, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8540141582489014, "num_tokens": 589718050.0, "step": 15461 }, { "epoch": 1.9669253275664673, "ewc_loss": 0.06422484666109085, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003102172049693763, "grad_norm": 7.706840991973877, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8674067258834839, "num_tokens": 589759206.0, "step": 15462 }, { "epoch": 1.9670525378450578, "ewc_loss": 0.06405395269393921, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003109496901743114, "grad_norm": 7.612083911895752, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8532586097717285, "num_tokens": 589800218.0, "step": 15463 }, { "epoch": 1.9671797481236484, "ewc_loss": 0.06434312462806702, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031140001374296844, "grad_norm": 7.604009628295898, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8651589155197144, "num_tokens": 589843105.0, "step": 15464 }, { "epoch": 1.967306958402239, "ewc_loss": 0.06414861977100372, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003118963504675776, "grad_norm": 7.694295406341553, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.855908989906311, "num_tokens": 589881560.0, "step": 15465 }, { "epoch": 1.9674341686808294, "ewc_loss": 0.06408286094665527, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003112387203145772, "grad_norm": 7.636780738830566, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8537241220474243, "num_tokens": 589922755.0, "step": 15466 }, { "epoch": 1.96756137895942, "ewc_loss": 0.06455445289611816, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003135133010800928, "grad_norm": 7.705359935760498, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8457125425338745, "num_tokens": 589964982.0, "step": 15467 }, { "epoch": 1.9676885892380103, "ewc_loss": 0.06431704014539719, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031113915611058474, "grad_norm": 7.6487250328063965, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8513219356536865, "num_tokens": 590003371.0, "step": 15468 }, { "epoch": 1.9678157995166008, "ewc_loss": 0.06456474959850311, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003136162704322487, "grad_norm": 7.643486022949219, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8617560863494873, "num_tokens": 590048361.0, "step": 15469 }, { "epoch": 1.9679430097951913, "ewc_loss": 0.06419483572244644, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003123585192952305, "grad_norm": 7.67257833480835, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8624154329299927, "num_tokens": 590083141.0, "step": 15470 }, { "epoch": 1.9680702200737819, "ewc_loss": 0.06423061341047287, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031271629268303514, "grad_norm": 7.640326499938965, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8596057891845703, "num_tokens": 590128161.0, "step": 15471 }, { "epoch": 1.9681974303523724, "ewc_loss": 0.0645863264799118, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.000313831988023594, "grad_norm": 7.674520969390869, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8808578252792358, "num_tokens": 590166101.0, "step": 15472 }, { "epoch": 1.968324640630963, "ewc_loss": 0.06423772871494293, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031278745154850185, "grad_norm": 7.6390061378479, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8625762462615967, "num_tokens": 590207643.0, "step": 15473 }, { "epoch": 1.9684518509095534, "ewc_loss": 0.0641627311706543, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031447882065549493, "grad_norm": 7.714423656463623, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8581079244613647, "num_tokens": 590247567.0, "step": 15474 }, { "epoch": 1.968579061188144, "ewc_loss": 0.06364919990301132, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.0003117849410045892, "grad_norm": 7.660024166107178, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8601284623146057, "num_tokens": 590287940.0, "step": 15475 }, { "epoch": 1.9687062714667345, "ewc_loss": 0.06410308182239532, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031388233765028417, "grad_norm": 7.729537010192871, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8577958941459656, "num_tokens": 590322558.0, "step": 15476 }, { "epoch": 1.968833481745325, "ewc_loss": 0.06414244323968887, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003118345921393484, "grad_norm": 7.6158905029296875, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8632484674453735, "num_tokens": 590362913.0, "step": 15477 }, { "epoch": 1.9689606920239155, "ewc_loss": 0.06388363242149353, "ewc_loss_diag": 3.24249267578125e-05, "ewc_loss_parallel": 0.00031412934185937047, "grad_norm": 7.691848278045654, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8562288284301758, "num_tokens": 590405877.0, "step": 15478 }, { "epoch": 1.969087902302506, "ewc_loss": 0.06395158916711807, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003123674541711807, "grad_norm": 7.676401138305664, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8544825911521912, "num_tokens": 590445373.0, "step": 15479 }, { "epoch": 1.9692151125810966, "ewc_loss": 0.06404828280210495, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031333439983427525, "grad_norm": 7.687863349914551, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8534414768218994, "num_tokens": 590483319.0, "step": 15480 }, { "epoch": 1.9693423228596871, "ewc_loss": 0.06404565274715424, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.00031330808997154236, "grad_norm": 7.66935396194458, "learning_rate": 1e-06, "loss": 0.511, "mean_token_accuracy": 0.8500922918319702, "num_tokens": 590522433.0, "step": 15481 }, { "epoch": 1.9694695331382777, "ewc_loss": 0.06404820084571838, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.000313333555823192, "grad_norm": 7.6632280349731445, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8682791590690613, "num_tokens": 590564342.0, "step": 15482 }, { "epoch": 1.9695967434168682, "ewc_loss": 0.06437966227531433, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003142067580483854, "grad_norm": 7.759506702423096, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8661180734634399, "num_tokens": 590594527.0, "step": 15483 }, { "epoch": 1.9697239536954587, "ewc_loss": 0.0639670342206955, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003125219081994146, "grad_norm": 7.643307209014893, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8747265338897705, "num_tokens": 590630573.0, "step": 15484 }, { "epoch": 1.9698511639740492, "ewc_loss": 0.06420807540416718, "ewc_loss_diag": 3.266334533691406e-05, "ewc_loss_parallel": 0.0003149323456455022, "grad_norm": 7.7264485359191895, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.864902138710022, "num_tokens": 590669333.0, "step": 15485 }, { "epoch": 1.9699783742526398, "ewc_loss": 0.06419701874256134, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003123803762719035, "grad_norm": 7.626277923583984, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8796861171722412, "num_tokens": 590709884.0, "step": 15486 }, { "epoch": 1.97010558453123, "ewc_loss": 0.06447944045066833, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031520455377176404, "grad_norm": 7.699477672576904, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8699125647544861, "num_tokens": 590746109.0, "step": 15487 }, { "epoch": 1.9702327948098206, "ewc_loss": 0.0642174482345581, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031258465605787933, "grad_norm": 7.632336616516113, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8486137390136719, "num_tokens": 590787881.0, "step": 15488 }, { "epoch": 1.9703600050884111, "ewc_loss": 0.06440765410661697, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031448667868971825, "grad_norm": 7.689831733703613, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8496425151824951, "num_tokens": 590826861.0, "step": 15489 }, { "epoch": 1.9704872153670017, "ewc_loss": 0.06420034170150757, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031241358374245465, "grad_norm": 7.6784586906433105, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8719592690467834, "num_tokens": 590863104.0, "step": 15490 }, { "epoch": 1.9706144256455922, "ewc_loss": 0.06430654227733612, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031347558251582086, "grad_norm": 7.691155433654785, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8716237545013428, "num_tokens": 590900582.0, "step": 15491 }, { "epoch": 1.9707416359241827, "ewc_loss": 0.06422144174575806, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031262452830560505, "grad_norm": 7.625490665435791, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8720769882202148, "num_tokens": 590937735.0, "step": 15492 }, { "epoch": 1.970868846202773, "ewc_loss": 0.0643356591463089, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031376670813187957, "grad_norm": 7.654163837432861, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8792119026184082, "num_tokens": 590973089.0, "step": 15493 }, { "epoch": 1.9709960564813636, "ewc_loss": 0.06425587832927704, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003129689139313996, "grad_norm": 7.665473461151123, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8499667644500732, "num_tokens": 591012117.0, "step": 15494 }, { "epoch": 1.971123266759954, "ewc_loss": 0.06422220170497894, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031263212440535426, "grad_norm": 7.698626518249512, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.844890832901001, "num_tokens": 591044978.0, "step": 15495 }, { "epoch": 1.9712504770385446, "ewc_loss": 0.06426668167114258, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000313076947350055, "grad_norm": 7.6243720054626465, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8603809475898743, "num_tokens": 591085562.0, "step": 15496 }, { "epoch": 1.9713776873171351, "ewc_loss": 0.06427538394927979, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003131640260107815, "grad_norm": 7.704505443572998, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.855366587638855, "num_tokens": 591120140.0, "step": 15497 }, { "epoch": 1.9715048975957257, "ewc_loss": 0.06408689171075821, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003112790873274207, "grad_norm": 7.644394874572754, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.853199303150177, "num_tokens": 591153952.0, "step": 15498 }, { "epoch": 1.9716321078743162, "ewc_loss": 0.06456395983695984, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031360829598270357, "grad_norm": 7.627697467803955, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8816869258880615, "num_tokens": 591192983.0, "step": 15499 }, { "epoch": 1.9717593181529067, "ewc_loss": 0.06442295759916306, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003121983027085662, "grad_norm": 7.6873297691345215, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8652446269989014, "num_tokens": 591226260.0, "step": 15500 }, { "epoch": 1.9718865284314973, "ewc_loss": 0.06447316706180573, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031270040199160576, "grad_norm": 7.690124988555908, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8500209450721741, "num_tokens": 591259361.0, "step": 15501 }, { "epoch": 1.9720137387100878, "ewc_loss": 0.06434626877307892, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031143147498369217, "grad_norm": 7.614593029022217, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8633160591125488, "num_tokens": 591299753.0, "step": 15502 }, { "epoch": 1.9721409489886783, "ewc_loss": 0.06435602158308029, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031152894371189177, "grad_norm": 7.679371356964111, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.865507960319519, "num_tokens": 591335008.0, "step": 15503 }, { "epoch": 1.9722681592672688, "ewc_loss": 0.06435270607471466, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003114957653451711, "grad_norm": 7.624919891357422, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8710861206054688, "num_tokens": 591373001.0, "step": 15504 }, { "epoch": 1.9723953695458594, "ewc_loss": 0.06441902369260788, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003121590125374496, "grad_norm": 7.64646053314209, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8675805926322937, "num_tokens": 591408637.0, "step": 15505 }, { "epoch": 1.97252257982445, "ewc_loss": 0.064298614859581, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003109548706561327, "grad_norm": 7.678108215332031, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8607559204101562, "num_tokens": 591444908.0, "step": 15506 }, { "epoch": 1.9726497901030404, "ewc_loss": 0.06409937143325806, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031140382634475827, "grad_norm": 7.634239196777344, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8667232990264893, "num_tokens": 591484641.0, "step": 15507 }, { "epoch": 1.972777000381631, "ewc_loss": 0.06432440131902695, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031121278880164027, "grad_norm": 7.651278495788574, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8612903356552124, "num_tokens": 591521859.0, "step": 15508 }, { "epoch": 1.9729042106602215, "ewc_loss": 0.06429130584001541, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003108818200416863, "grad_norm": 7.679448127746582, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8635427355766296, "num_tokens": 591559124.0, "step": 15509 }, { "epoch": 1.973031420938812, "ewc_loss": 0.06433908641338348, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031135964673012495, "grad_norm": 7.66467809677124, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8630934357643127, "num_tokens": 591597170.0, "step": 15510 }, { "epoch": 1.9731586312174023, "ewc_loss": 0.0642981305718422, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003109500394202769, "grad_norm": 7.658784866333008, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8688892126083374, "num_tokens": 591634784.0, "step": 15511 }, { "epoch": 1.9732858414959928, "ewc_loss": 0.06434240192174911, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031139279599301517, "grad_norm": 7.667425155639648, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.868927001953125, "num_tokens": 591671904.0, "step": 15512 }, { "epoch": 1.9734130517745834, "ewc_loss": 0.06404188275337219, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003108290256932378, "grad_norm": 7.653621673583984, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8667237162590027, "num_tokens": 591712990.0, "step": 15513 }, { "epoch": 1.973540262053174, "ewc_loss": 0.06430230289697647, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003109918034169823, "grad_norm": 7.645092487335205, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8616416454315186, "num_tokens": 591748579.0, "step": 15514 }, { "epoch": 1.9736674723317644, "ewc_loss": 0.06426861137151718, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031065483926795423, "grad_norm": 7.6531195640563965, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8580446243286133, "num_tokens": 591785865.0, "step": 15515 }, { "epoch": 1.973794682610355, "ewc_loss": 0.06421603262424469, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031012907857075334, "grad_norm": 7.616332054138184, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.873604416847229, "num_tokens": 591820087.0, "step": 15516 }, { "epoch": 1.9739218928889453, "ewc_loss": 0.0642254576086998, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003102233458776027, "grad_norm": 7.634440898895264, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8640239834785461, "num_tokens": 591859713.0, "step": 15517 }, { "epoch": 1.9740491031675358, "ewc_loss": 0.06428798288106918, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003108485834673047, "grad_norm": 7.646392822265625, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8720356822013855, "num_tokens": 591893142.0, "step": 15518 }, { "epoch": 1.9741763134461263, "ewc_loss": 0.06429965794086456, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003109653480350971, "grad_norm": 7.657381057739258, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8595288991928101, "num_tokens": 591931529.0, "step": 15519 }, { "epoch": 1.9743035237247168, "ewc_loss": 0.06423206627368927, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.000310289382468909, "grad_norm": 7.645196914672852, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8681725263595581, "num_tokens": 591976248.0, "step": 15520 }, { "epoch": 1.9744307340033074, "ewc_loss": 0.06435853242874146, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031155411852523685, "grad_norm": 7.690853595733643, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8616212010383606, "num_tokens": 592018008.0, "step": 15521 }, { "epoch": 1.974557944281898, "ewc_loss": 0.06412538886070251, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030922263977117836, "grad_norm": 7.619140625, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8499051332473755, "num_tokens": 592062934.0, "step": 15522 }, { "epoch": 1.9746851545604884, "ewc_loss": 0.06439373642206192, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003119061002507806, "grad_norm": 7.630147457122803, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8628467321395874, "num_tokens": 592103129.0, "step": 15523 }, { "epoch": 1.974812364839079, "ewc_loss": 0.06430605798959732, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003110293473582715, "grad_norm": 7.669300556182861, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8593603372573853, "num_tokens": 592142116.0, "step": 15524 }, { "epoch": 1.9749395751176695, "ewc_loss": 0.06435306370258331, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031149937422014773, "grad_norm": 7.6789422035217285, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8654441237449646, "num_tokens": 592182507.0, "step": 15525 }, { "epoch": 1.97506678539626, "ewc_loss": 0.06432892382144928, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003112579579465091, "grad_norm": 7.669145584106445, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8430905342102051, "num_tokens": 592225593.0, "step": 15526 }, { "epoch": 1.9751939956748505, "ewc_loss": 0.06435588747262955, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003115276340395212, "grad_norm": 7.598282337188721, "learning_rate": 1e-06, "loss": 0.389, "mean_token_accuracy": 0.8887226581573486, "num_tokens": 592270676.0, "step": 15527 }, { "epoch": 1.975321205953441, "ewc_loss": 0.06435810774564743, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031154981115832925, "grad_norm": 7.640384674072266, "learning_rate": 1e-06, "loss": 0.3997, "mean_token_accuracy": 0.8832338452339172, "num_tokens": 592305537.0, "step": 15528 }, { "epoch": 1.9754484162320316, "ewc_loss": 0.06433750689029694, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031134378514252603, "grad_norm": 7.73955774307251, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8705402612686157, "num_tokens": 592338289.0, "step": 15529 }, { "epoch": 1.9755756265106221, "ewc_loss": 0.06421421468257904, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003101108595728874, "grad_norm": 7.671350955963135, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8673821687698364, "num_tokens": 592379189.0, "step": 15530 }, { "epoch": 1.9757028367892127, "ewc_loss": 0.06435315310955048, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031150030554272234, "grad_norm": 7.675090312957764, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8782931566238403, "num_tokens": 592421875.0, "step": 15531 }, { "epoch": 1.9758300470678032, "ewc_loss": 0.06394866853952408, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003098968300037086, "grad_norm": 7.66493558883667, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8452750444412231, "num_tokens": 592457438.0, "step": 15532 }, { "epoch": 1.9759572573463937, "ewc_loss": 0.06403085589408875, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031071872217580676, "grad_norm": 7.677714824676514, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8748815059661865, "num_tokens": 592493230.0, "step": 15533 }, { "epoch": 1.9760844676249842, "ewc_loss": 0.06402391940355301, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003106493386439979, "grad_norm": 7.720619201660156, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8487383127212524, "num_tokens": 592533656.0, "step": 15534 }, { "epoch": 1.9762116779035748, "ewc_loss": 0.0642012283205986, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003099810564890504, "grad_norm": 7.682901859283447, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8771979808807373, "num_tokens": 592571622.0, "step": 15535 }, { "epoch": 1.976338888182165, "ewc_loss": 0.06427954882383347, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031076421146281064, "grad_norm": 7.665434837341309, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8589255809783936, "num_tokens": 592613260.0, "step": 15536 }, { "epoch": 1.9764660984607556, "ewc_loss": 0.0642978772521019, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003109475364908576, "grad_norm": 7.68444299697876, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8623294830322266, "num_tokens": 592648872.0, "step": 15537 }, { "epoch": 1.9765933087393461, "ewc_loss": 0.06424808502197266, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031044959905557334, "grad_norm": 7.698808193206787, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8660272359848022, "num_tokens": 592682336.0, "step": 15538 }, { "epoch": 1.9767205190179367, "ewc_loss": 0.06411326676607132, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031154282623901963, "grad_norm": 7.685527801513672, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8781114220619202, "num_tokens": 592716412.0, "step": 15539 }, { "epoch": 1.9768477292965272, "ewc_loss": 0.06399262696504593, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031033644336275756, "grad_norm": 7.633790969848633, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.866324245929718, "num_tokens": 592753936.0, "step": 15540 }, { "epoch": 1.9769749395751177, "ewc_loss": 0.06435994803905487, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003115682047791779, "grad_norm": 7.6923980712890625, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8640092611312866, "num_tokens": 592792731.0, "step": 15541 }, { "epoch": 1.977102149853708, "ewc_loss": 0.06436387449502945, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003116074949502945, "grad_norm": 7.64093017578125, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8674944639205933, "num_tokens": 592832035.0, "step": 15542 }, { "epoch": 1.9772293601322986, "ewc_loss": 0.06412394344806671, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031164957908913493, "grad_norm": 7.669252872467041, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8822661638259888, "num_tokens": 592869693.0, "step": 15543 }, { "epoch": 1.977356570410889, "ewc_loss": 0.06412331014871597, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031164323445409536, "grad_norm": 7.602777481079102, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8679858446121216, "num_tokens": 592909648.0, "step": 15544 }, { "epoch": 1.9774837806894796, "ewc_loss": 0.06424789130687714, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003128890530206263, "grad_norm": 7.739877223968506, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8671694993972778, "num_tokens": 592942539.0, "step": 15545 }, { "epoch": 1.9776109909680701, "ewc_loss": 0.0641510933637619, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031192111782729626, "grad_norm": 7.665271282196045, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8635468482971191, "num_tokens": 592983680.0, "step": 15546 }, { "epoch": 1.9777382012466607, "ewc_loss": 0.06414720416069031, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031188217690214515, "grad_norm": 7.653658390045166, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8543631434440613, "num_tokens": 593023161.0, "step": 15547 }, { "epoch": 1.9778654115252512, "ewc_loss": 0.06405682116746902, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003109783574473113, "grad_norm": 7.598494529724121, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8464592695236206, "num_tokens": 593066612.0, "step": 15548 }, { "epoch": 1.9779926218038417, "ewc_loss": 0.0641321912407875, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003117320593446493, "grad_norm": 7.689101219177246, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8512377738952637, "num_tokens": 593105224.0, "step": 15549 }, { "epoch": 1.9781198320824323, "ewc_loss": 0.06434889882802963, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003114577557425946, "grad_norm": 7.640481472015381, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8640490770339966, "num_tokens": 593143683.0, "step": 15550 }, { "epoch": 1.9782470423610228, "ewc_loss": 0.0643337294459343, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003113060665782541, "grad_norm": 7.679951190948486, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8500605821609497, "num_tokens": 593179827.0, "step": 15551 }, { "epoch": 1.9783742526396133, "ewc_loss": 0.06406751275062561, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031108525581657887, "grad_norm": 7.6509623527526855, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.865587592124939, "num_tokens": 593220936.0, "step": 15552 }, { "epoch": 1.9785014629182038, "ewc_loss": 0.06444862484931946, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003124549693893641, "grad_norm": 7.690489292144775, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8598564267158508, "num_tokens": 593258503.0, "step": 15553 }, { "epoch": 1.9786286731967944, "ewc_loss": 0.06405889242887497, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003109990793745965, "grad_norm": 7.674482822418213, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8445611000061035, "num_tokens": 593293738.0, "step": 15554 }, { "epoch": 1.978755883475385, "ewc_loss": 0.06435392796993256, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003115080180577934, "grad_norm": 7.650317668914795, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8549525141716003, "num_tokens": 593338037.0, "step": 15555 }, { "epoch": 1.9788830937539754, "ewc_loss": 0.0643477588891983, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031144634704105556, "grad_norm": 7.655750274658203, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8502918481826782, "num_tokens": 593383509.0, "step": 15556 }, { "epoch": 1.979010304032566, "ewc_loss": 0.0643409788608551, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031137853511609137, "grad_norm": 7.694101810455322, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8674405813217163, "num_tokens": 593417734.0, "step": 15557 }, { "epoch": 1.9791375143111565, "ewc_loss": 0.06427799165248871, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003107486409135163, "grad_norm": 7.6183600425720215, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8763418197631836, "num_tokens": 593454502.0, "step": 15558 }, { "epoch": 1.979264724589747, "ewc_loss": 0.06441733241081238, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031214210321195424, "grad_norm": 7.668568134307861, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8596609830856323, "num_tokens": 593498864.0, "step": 15559 }, { "epoch": 1.9793919348683373, "ewc_loss": 0.06432618200778961, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003112305421382189, "grad_norm": 7.6541032791137695, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8621083498001099, "num_tokens": 593540125.0, "step": 15560 }, { "epoch": 1.9795191451469278, "ewc_loss": 0.06434345990419388, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003114033315796405, "grad_norm": 7.760208606719971, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.87227463722229, "num_tokens": 593566937.0, "step": 15561 }, { "epoch": 1.9796463554255184, "ewc_loss": 0.06421362608671188, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003101050097029656, "grad_norm": 7.660677909851074, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8584258556365967, "num_tokens": 593600109.0, "step": 15562 }, { "epoch": 1.979773565704109, "ewc_loss": 0.06442852318286896, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003122539783362299, "grad_norm": 7.6645588874816895, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8635919094085693, "num_tokens": 593638982.0, "step": 15563 }, { "epoch": 1.9799007759826994, "ewc_loss": 0.06423577666282654, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031032648985274136, "grad_norm": 7.647312641143799, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8539658188819885, "num_tokens": 593677270.0, "step": 15564 }, { "epoch": 1.98002798626129, "ewc_loss": 0.06447216123342514, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003126903611700982, "grad_norm": 7.695377826690674, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8726295232772827, "num_tokens": 593714056.0, "step": 15565 }, { "epoch": 1.9801551965398803, "ewc_loss": 0.06421253085136414, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031009403755888343, "grad_norm": 7.653591156005859, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8516494035720825, "num_tokens": 593753516.0, "step": 15566 }, { "epoch": 1.9802824068184708, "ewc_loss": 0.06437493860721588, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003117180895060301, "grad_norm": 7.698493003845215, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8656230568885803, "num_tokens": 593789276.0, "step": 15567 }, { "epoch": 1.9804096170970613, "ewc_loss": 0.06423789262771606, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003103477065451443, "grad_norm": 7.64589786529541, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8667070865631104, "num_tokens": 593828112.0, "step": 15568 }, { "epoch": 1.9805368273756518, "ewc_loss": 0.06440630555152893, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003120317996945232, "grad_norm": 7.679177284240723, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8521201610565186, "num_tokens": 593866370.0, "step": 15569 }, { "epoch": 1.9806640376542424, "ewc_loss": 0.06423726677894592, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003103413910139352, "grad_norm": 7.620903015136719, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8691009283065796, "num_tokens": 593907863.0, "step": 15570 }, { "epoch": 1.980791247932833, "ewc_loss": 0.06439840793609619, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003119528410024941, "grad_norm": 7.676946640014648, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8752144575119019, "num_tokens": 593945254.0, "step": 15571 }, { "epoch": 1.9809184582114234, "ewc_loss": 0.06425561755895615, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031052491976879537, "grad_norm": 7.655980110168457, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8694758415222168, "num_tokens": 593987105.0, "step": 15572 }, { "epoch": 1.981045668490014, "ewc_loss": 0.06428318470716476, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031080059125088155, "grad_norm": 7.6771464347839355, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8755857944488525, "num_tokens": 594024091.0, "step": 15573 }, { "epoch": 1.9811728787686045, "ewc_loss": 0.06423356384038925, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003103044000454247, "grad_norm": 7.662537574768066, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8751438856124878, "num_tokens": 594056018.0, "step": 15574 }, { "epoch": 1.981300089047195, "ewc_loss": 0.06423916667699814, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003103604249190539, "grad_norm": 7.6121826171875, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8677088022232056, "num_tokens": 594102297.0, "step": 15575 }, { "epoch": 1.9814272993257855, "ewc_loss": 0.0642913281917572, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003108820237684995, "grad_norm": 7.690215110778809, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8654836416244507, "num_tokens": 594140278.0, "step": 15576 }, { "epoch": 1.981554509604376, "ewc_loss": 0.06420432031154633, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031001196475699544, "grad_norm": 7.616022109985352, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8702419996261597, "num_tokens": 594175025.0, "step": 15577 }, { "epoch": 1.9816817198829666, "ewc_loss": 0.06443329900503159, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003123017377220094, "grad_norm": 7.680698394775391, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8637686967849731, "num_tokens": 594217906.0, "step": 15578 }, { "epoch": 1.9818089301615571, "ewc_loss": 0.0641978532075882, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030994723783805966, "grad_norm": 7.644229412078857, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8467130064964294, "num_tokens": 594264331.0, "step": 15579 }, { "epoch": 1.9819361404401477, "ewc_loss": 0.06443602591753006, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003123290080111474, "grad_norm": 7.685760021209717, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.881064772605896, "num_tokens": 594300220.0, "step": 15580 }, { "epoch": 1.9820633507187382, "ewc_loss": 0.06428220868110657, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.000310790870571509, "grad_norm": 7.686494827270508, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8710560202598572, "num_tokens": 594337849.0, "step": 15581 }, { "epoch": 1.9821905609973287, "ewc_loss": 0.0642772689461708, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.000310741423163563, "grad_norm": 7.663060188293457, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8577668070793152, "num_tokens": 594379295.0, "step": 15582 }, { "epoch": 1.9823177712759192, "ewc_loss": 0.06435543298721313, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003115230647381395, "grad_norm": 7.7534708976745605, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.860929012298584, "num_tokens": 594408350.0, "step": 15583 }, { "epoch": 1.9824449815545098, "ewc_loss": 0.0641176849603653, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030914557282812893, "grad_norm": 7.664066791534424, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8537098169326782, "num_tokens": 594455110.0, "step": 15584 }, { "epoch": 1.9825721918331, "ewc_loss": 0.06427813321352005, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031075006700120866, "grad_norm": 7.731976509094238, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8688491582870483, "num_tokens": 594499231.0, "step": 15585 }, { "epoch": 1.9826994021116906, "ewc_loss": 0.06415968388319016, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003095655993092805, "grad_norm": 7.6576995849609375, "learning_rate": 1e-06, "loss": 0.5293, "mean_token_accuracy": 0.8459203839302063, "num_tokens": 594541478.0, "step": 15586 }, { "epoch": 1.9828266123902811, "ewc_loss": 0.06407871842384338, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003111973055638373, "grad_norm": 7.738637924194336, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.867444634437561, "num_tokens": 594579075.0, "step": 15587 }, { "epoch": 1.9829538226688717, "ewc_loss": 0.06411096453666687, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030907843029126525, "grad_norm": 7.645816802978516, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.881321132183075, "num_tokens": 594616189.0, "step": 15588 }, { "epoch": 1.9830810329474622, "ewc_loss": 0.06410744041204453, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031148456037044525, "grad_norm": 7.729813098907471, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8719188570976257, "num_tokens": 594656633.0, "step": 15589 }, { "epoch": 1.9832082432260527, "ewc_loss": 0.06380381435155869, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030844827415421605, "grad_norm": 7.685317039489746, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8467717170715332, "num_tokens": 594696833.0, "step": 15590 }, { "epoch": 1.983335453504643, "ewc_loss": 0.06403353810310364, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003107455268036574, "grad_norm": 7.6731414794921875, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8523097038269043, "num_tokens": 594735138.0, "step": 15591 }, { "epoch": 1.9834626637832335, "ewc_loss": 0.06392142176628113, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003096244181506336, "grad_norm": 7.706367015838623, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.859235405921936, "num_tokens": 594769587.0, "step": 15592 }, { "epoch": 1.983589874061824, "ewc_loss": 0.06394834071397781, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030989357037469745, "grad_norm": 7.723071098327637, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8700923919677734, "num_tokens": 594803635.0, "step": 15593 }, { "epoch": 1.9837170843404146, "ewc_loss": 0.06415882706642151, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030955704278312624, "grad_norm": 7.670519828796387, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8800482153892517, "num_tokens": 594839872.0, "step": 15594 }, { "epoch": 1.9838442946190051, "ewc_loss": 0.06404862552881241, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003108964301645756, "grad_norm": 7.688791751861572, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8525757789611816, "num_tokens": 594879208.0, "step": 15595 }, { "epoch": 1.9839715048975957, "ewc_loss": 0.06416484713554382, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003096172586083412, "grad_norm": 7.654369354248047, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8715844750404358, "num_tokens": 594910377.0, "step": 15596 }, { "epoch": 1.9840987151761862, "ewc_loss": 0.0640334039926529, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031074415892362595, "grad_norm": 7.684196472167969, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8670080304145813, "num_tokens": 594956574.0, "step": 15597 }, { "epoch": 1.9842259254547767, "ewc_loss": 0.06394749879837036, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000309885130263865, "grad_norm": 7.677674293518066, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8498070240020752, "num_tokens": 594998036.0, "step": 15598 }, { "epoch": 1.9843531357333672, "ewc_loss": 0.06396947056055069, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003101048350799829, "grad_norm": 7.698273181915283, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8597507476806641, "num_tokens": 595031404.0, "step": 15599 }, { "epoch": 1.9844803460119578, "ewc_loss": 0.06412769854068756, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031168718123808503, "grad_norm": 7.671751499176025, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8726218938827515, "num_tokens": 595069800.0, "step": 15600 }, { "epoch": 1.9846075562905483, "ewc_loss": 0.06404231488704681, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031083327485248446, "grad_norm": 7.724481582641602, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8703442811965942, "num_tokens": 595110987.0, "step": 15601 }, { "epoch": 1.9847347665691388, "ewc_loss": 0.06402319669723511, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003106420917902142, "grad_norm": 7.689968585968018, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8687188029289246, "num_tokens": 595149192.0, "step": 15602 }, { "epoch": 1.9848619768477294, "ewc_loss": 0.06408153474330902, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003112254780717194, "grad_norm": 7.793076038360596, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8625754714012146, "num_tokens": 595182993.0, "step": 15603 }, { "epoch": 1.9849891871263199, "ewc_loss": 0.06396938860416412, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000310104078380391, "grad_norm": 7.739124774932861, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8627076148986816, "num_tokens": 595216177.0, "step": 15604 }, { "epoch": 1.9851163974049104, "ewc_loss": 0.06397613883018494, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003101715410593897, "grad_norm": 7.730111122131348, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8650670051574707, "num_tokens": 595253379.0, "step": 15605 }, { "epoch": 1.985243607683501, "ewc_loss": 0.06403094530105591, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003107196243945509, "grad_norm": 7.782138347625732, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.868372917175293, "num_tokens": 595292878.0, "step": 15606 }, { "epoch": 1.9853708179620915, "ewc_loss": 0.06379514932632446, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003083616029471159, "grad_norm": 7.672593116760254, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8655524849891663, "num_tokens": 595329082.0, "step": 15607 }, { "epoch": 1.985498028240682, "ewc_loss": 0.06422539055347443, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003102226764895022, "grad_norm": 7.783293724060059, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8562915921211243, "num_tokens": 595367246.0, "step": 15608 }, { "epoch": 1.9856252385192723, "ewc_loss": 0.06390418857336044, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030701066134497523, "grad_norm": 7.6669230461120605, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8714357018470764, "num_tokens": 595404450.0, "step": 15609 }, { "epoch": 1.9857524487978628, "ewc_loss": 0.06403416395187378, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003107518423348665, "grad_norm": 7.775142192840576, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8774725198745728, "num_tokens": 595439718.0, "step": 15610 }, { "epoch": 1.9858796590764534, "ewc_loss": 0.06364578008651733, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003068679361604154, "grad_norm": 7.707197666168213, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8671838045120239, "num_tokens": 595473617.0, "step": 15611 }, { "epoch": 1.986006869355044, "ewc_loss": 0.06430146098136902, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003109833924099803, "grad_norm": 7.779372692108154, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8621504306793213, "num_tokens": 595516432.0, "step": 15612 }, { "epoch": 1.9861340796336344, "ewc_loss": 0.06390149891376495, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00030698374030180275, "grad_norm": 7.617103576660156, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8655962347984314, "num_tokens": 595556278.0, "step": 15613 }, { "epoch": 1.986261289912225, "ewc_loss": 0.06437137722969055, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003116825537290424, "grad_norm": 7.80402946472168, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8707119226455688, "num_tokens": 595598713.0, "step": 15614 }, { "epoch": 1.9863885001908153, "ewc_loss": 0.06375308334827423, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003079409943893552, "grad_norm": 7.71164608001709, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8677629232406616, "num_tokens": 595630597.0, "step": 15615 }, { "epoch": 1.9865157104694058, "ewc_loss": 0.06397745013237, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031018463778309524, "grad_norm": 7.739285469055176, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8830733299255371, "num_tokens": 595664163.0, "step": 15616 }, { "epoch": 1.9866429207479963, "ewc_loss": 0.06393588334321976, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030976897687651217, "grad_norm": 7.711792469024658, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8743883371353149, "num_tokens": 595698809.0, "step": 15617 }, { "epoch": 1.9867701310265868, "ewc_loss": 0.0641089603304863, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003090583486482501, "grad_norm": 7.646948337554932, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8590611815452576, "num_tokens": 595741166.0, "step": 15618 }, { "epoch": 1.9868973413051774, "ewc_loss": 0.06407316029071808, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003111418045591563, "grad_norm": 7.73221492767334, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8593133091926575, "num_tokens": 595787079.0, "step": 15619 }, { "epoch": 1.987024551583768, "ewc_loss": 0.06417694687843323, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003097382141277194, "grad_norm": 7.7026753425598145, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8402299880981445, "num_tokens": 595829085.0, "step": 15620 }, { "epoch": 1.9871517618623584, "ewc_loss": 0.0642486959695816, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003104557399637997, "grad_norm": 7.7105913162231445, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8634659051895142, "num_tokens": 595870646.0, "step": 15621 }, { "epoch": 1.987278972140949, "ewc_loss": 0.06424309313297272, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003103996568825096, "grad_norm": 7.60728645324707, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8543128371238708, "num_tokens": 595918473.0, "step": 15622 }, { "epoch": 1.9874061824195395, "ewc_loss": 0.06427523493766785, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031316254171542823, "grad_norm": 7.820123672485352, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8533965349197388, "num_tokens": 595954097.0, "step": 15623 }, { "epoch": 1.98753339269813, "ewc_loss": 0.0638384073972702, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003087942022830248, "grad_norm": 7.62087869644165, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8474169969558716, "num_tokens": 595990191.0, "step": 15624 }, { "epoch": 1.9876606029767205, "ewc_loss": 0.06441925466060638, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031460265745408833, "grad_norm": 7.832154273986816, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8571837544441223, "num_tokens": 596027109.0, "step": 15625 }, { "epoch": 1.987787813255311, "ewc_loss": 0.06388326734304428, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030924283782951534, "grad_norm": 7.648329734802246, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8727916479110718, "num_tokens": 596067930.0, "step": 15626 }, { "epoch": 1.9879150235339016, "ewc_loss": 0.06430573761463165, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031346757896244526, "grad_norm": 7.765406131744385, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8749359846115112, "num_tokens": 596105809.0, "step": 15627 }, { "epoch": 1.9880422338124921, "ewc_loss": 0.06397148221731186, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003101250040344894, "grad_norm": 7.729905605316162, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8683632612228394, "num_tokens": 596144220.0, "step": 15628 }, { "epoch": 1.9881694440910826, "ewc_loss": 0.06419343501329422, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003123445203527808, "grad_norm": 7.826247692108154, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.8419445753097534, "num_tokens": 596183825.0, "step": 15629 }, { "epoch": 1.9882966543696732, "ewc_loss": 0.06399049609899521, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003103151102550328, "grad_norm": 7.699284553527832, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8733680248260498, "num_tokens": 596232333.0, "step": 15630 }, { "epoch": 1.9884238646482637, "ewc_loss": 0.06413483619689941, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031175854383036494, "grad_norm": 7.787637710571289, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8734039068222046, "num_tokens": 596272138.0, "step": 15631 }, { "epoch": 1.9885510749268542, "ewc_loss": 0.0639345645904541, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003097557637374848, "grad_norm": 7.711938381195068, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8744314312934875, "num_tokens": 596310254.0, "step": 15632 }, { "epoch": 1.9886782852054448, "ewc_loss": 0.06415756046772003, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003119857283309102, "grad_norm": 7.758594989776611, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8620260953903198, "num_tokens": 596350396.0, "step": 15633 }, { "epoch": 1.988805495484035, "ewc_loss": 0.06395880877971649, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030999822774901986, "grad_norm": 7.7027764320373535, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8683212995529175, "num_tokens": 596388809.0, "step": 15634 }, { "epoch": 1.9889327057626256, "ewc_loss": 0.06407184898853302, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031112864962778986, "grad_norm": 7.766506671905518, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8721404075622559, "num_tokens": 596427366.0, "step": 15635 }, { "epoch": 1.9890599160412161, "ewc_loss": 0.06395977735519409, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031000791932456195, "grad_norm": 7.70814323425293, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.856782078742981, "num_tokens": 596469999.0, "step": 15636 }, { "epoch": 1.9891871263198067, "ewc_loss": 0.06399780511856079, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003103882190771401, "grad_norm": 7.7838544845581055, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8490935564041138, "num_tokens": 596506773.0, "step": 15637 }, { "epoch": 1.9893143365983972, "ewc_loss": 0.06402046978473663, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031061487970873713, "grad_norm": 7.7311811447143555, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8668015003204346, "num_tokens": 596547440.0, "step": 15638 }, { "epoch": 1.9894415468769877, "ewc_loss": 0.06402698904275894, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003106800722889602, "grad_norm": 7.834205627441406, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8716052770614624, "num_tokens": 596580212.0, "step": 15639 }, { "epoch": 1.989568757155578, "ewc_loss": 0.0639001727104187, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003094119019806385, "grad_norm": 7.754635334014893, "learning_rate": 1e-06, "loss": 0.3955, "mean_token_accuracy": 0.8866266012191772, "num_tokens": 596611790.0, "step": 15640 }, { "epoch": 1.9896959674341685, "ewc_loss": 0.06389954686164856, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003094056446570903, "grad_norm": 7.698109149932861, "learning_rate": 1e-06, "loss": 0.3964, "mean_token_accuracy": 0.8809506893157959, "num_tokens": 596646984.0, "step": 15641 }, { "epoch": 1.989823177712759, "ewc_loss": 0.06394434720277786, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00030985361081548035, "grad_norm": 7.748237609863281, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8609620928764343, "num_tokens": 596688362.0, "step": 15642 }, { "epoch": 1.9899503879913496, "ewc_loss": 0.0640205442905426, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003106155782006681, "grad_norm": 7.754236221313477, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8672246932983398, "num_tokens": 596725298.0, "step": 15643 }, { "epoch": 1.9900775982699401, "ewc_loss": 0.06393447518348694, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000309754948830232, "grad_norm": 7.774751663208008, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8700104355812073, "num_tokens": 596757400.0, "step": 15644 }, { "epoch": 1.9902048085485307, "ewc_loss": 0.06387387961149216, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003091489488724619, "grad_norm": 7.725885391235352, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.877455472946167, "num_tokens": 596790533.0, "step": 15645 }, { "epoch": 1.9903320188271212, "ewc_loss": 0.06403344869613647, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003107446536887437, "grad_norm": 7.667302131652832, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8592267036437988, "num_tokens": 596834467.0, "step": 15646 }, { "epoch": 1.9904592291057117, "ewc_loss": 0.06404409557580948, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000310851086396724, "grad_norm": 7.755709648132324, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8536889553070068, "num_tokens": 596873262.0, "step": 15647 }, { "epoch": 1.9905864393843022, "ewc_loss": 0.06399679183959961, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031037803273648024, "grad_norm": 7.70426607131958, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8657823801040649, "num_tokens": 596912833.0, "step": 15648 }, { "epoch": 1.9907136496628928, "ewc_loss": 0.06409117579460144, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003113219572696835, "grad_norm": 7.6777753829956055, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8720088005065918, "num_tokens": 596953554.0, "step": 15649 }, { "epoch": 1.9908408599414833, "ewc_loss": 0.06400153040885925, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003104254137724638, "grad_norm": 7.713750839233398, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8596936464309692, "num_tokens": 596991206.0, "step": 15650 }, { "epoch": 1.9909680702200738, "ewc_loss": 0.0640529990196228, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031094011501409113, "grad_norm": 7.661383152008057, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8672034740447998, "num_tokens": 597027222.0, "step": 15651 }, { "epoch": 1.9910952804986644, "ewc_loss": 0.06416404247283936, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031205054256133735, "grad_norm": 7.696015357971191, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.874814510345459, "num_tokens": 597069040.0, "step": 15652 }, { "epoch": 1.9912224907772549, "ewc_loss": 0.06404104828834534, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003108206728938967, "grad_norm": 7.71598482131958, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8573663234710693, "num_tokens": 597112529.0, "step": 15653 }, { "epoch": 1.9913497010558454, "ewc_loss": 0.06443706154823303, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003123393398709595, "grad_norm": 7.733065605163574, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8651432991027832, "num_tokens": 597148304.0, "step": 15654 }, { "epoch": 1.991476911334436, "ewc_loss": 0.06401725113391876, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003105826326645911, "grad_norm": 7.675314903259277, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8738412857055664, "num_tokens": 597184244.0, "step": 15655 }, { "epoch": 1.9916041216130265, "ewc_loss": 0.0645071342587471, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031304010190069675, "grad_norm": 7.748022079467773, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8578211069107056, "num_tokens": 597218178.0, "step": 15656 }, { "epoch": 1.991731331891617, "ewc_loss": 0.06430529803037643, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003110217221546918, "grad_norm": 7.658114910125732, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8630077242851257, "num_tokens": 597259256.0, "step": 15657 }, { "epoch": 1.9918585421702073, "ewc_loss": 0.06426311284303665, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003130412951577455, "grad_norm": 7.7240495681762695, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8757070302963257, "num_tokens": 597296584.0, "step": 15658 }, { "epoch": 1.9919857524487978, "ewc_loss": 0.0641685202717781, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003120953624602407, "grad_norm": 7.642200469970703, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8497760891914368, "num_tokens": 597341116.0, "step": 15659 }, { "epoch": 1.9921129627273884, "ewc_loss": 0.06434257328510284, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031383585883304477, "grad_norm": 7.789157867431641, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8633928298950195, "num_tokens": 597375595.0, "step": 15660 }, { "epoch": 1.9922401730059789, "ewc_loss": 0.06415408849716187, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031195106566883624, "grad_norm": 7.637664794921875, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8505565524101257, "num_tokens": 597419226.0, "step": 15661 }, { "epoch": 1.9923673832845694, "ewc_loss": 0.06446664035320282, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031507652602158487, "grad_norm": 7.8090596199035645, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8455902338027954, "num_tokens": 597461242.0, "step": 15662 }, { "epoch": 1.99249459356316, "ewc_loss": 0.06416092813014984, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031201940146274865, "grad_norm": 7.6617326736450195, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8644857406616211, "num_tokens": 597500030.0, "step": 15663 }, { "epoch": 1.9926218038417502, "ewc_loss": 0.06449170410633087, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031532722641713917, "grad_norm": 7.815620422363281, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8485183715820312, "num_tokens": 597540428.0, "step": 15664 }, { "epoch": 1.9927490141203408, "ewc_loss": 0.06412144005298615, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031162454979494214, "grad_norm": 7.701601505279541, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8759638071060181, "num_tokens": 597575070.0, "step": 15665 }, { "epoch": 1.9928762243989313, "ewc_loss": 0.06446722149848938, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031508240499533713, "grad_norm": 7.786794185638428, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8624715209007263, "num_tokens": 597617425.0, "step": 15666 }, { "epoch": 1.9930034346775218, "ewc_loss": 0.06409963965415955, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003114065621048212, "grad_norm": 7.639301300048828, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8716220259666443, "num_tokens": 597660489.0, "step": 15667 }, { "epoch": 1.9931306449561124, "ewc_loss": 0.06440933048725128, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031450341339223087, "grad_norm": 7.683788776397705, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8702718019485474, "num_tokens": 597703626.0, "step": 15668 }, { "epoch": 1.993257855234703, "ewc_loss": 0.06427767872810364, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003131869016215205, "grad_norm": 7.690036773681641, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8604262471199036, "num_tokens": 597741176.0, "step": 15669 }, { "epoch": 1.9933850655132934, "ewc_loss": 0.06440973281860352, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031450751703232527, "grad_norm": 7.738303184509277, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8516612648963928, "num_tokens": 597779763.0, "step": 15670 }, { "epoch": 1.993512275791884, "ewc_loss": 0.06426753103733063, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031308550387620926, "grad_norm": 7.709715366363525, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8687105178833008, "num_tokens": 597813300.0, "step": 15671 }, { "epoch": 1.9936394860704745, "ewc_loss": 0.06445357203483582, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031494584982283413, "grad_norm": 7.7607293128967285, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8545289635658264, "num_tokens": 597849062.0, "step": 15672 }, { "epoch": 1.993766696349065, "ewc_loss": 0.06417546421289444, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031216477509588003, "grad_norm": 7.700509548187256, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8659377694129944, "num_tokens": 597886565.0, "step": 15673 }, { "epoch": 1.9938939066276555, "ewc_loss": 0.0645713359117508, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031368210329674184, "grad_norm": 7.735508441925049, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8662473559379578, "num_tokens": 597925097.0, "step": 15674 }, { "epoch": 1.994021116906246, "ewc_loss": 0.06427143514156342, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003131244739051908, "grad_norm": 7.679892539978027, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8768793344497681, "num_tokens": 597960786.0, "step": 15675 }, { "epoch": 1.9941483271848366, "ewc_loss": 0.06429626047611237, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031337272957898676, "grad_norm": 7.711571216583252, "learning_rate": 1e-06, "loss": 0.517, "mean_token_accuracy": 0.8466798067092896, "num_tokens": 598000442.0, "step": 15676 }, { "epoch": 1.9942755374634271, "ewc_loss": 0.06421023607254028, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003125125076621771, "grad_norm": 7.657804489135742, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8620643615722656, "num_tokens": 598042946.0, "step": 15677 }, { "epoch": 1.9944027477420176, "ewc_loss": 0.0643693208694458, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003141034103464335, "grad_norm": 7.754486560821533, "learning_rate": 1e-06, "loss": 0.5343, "mean_token_accuracy": 0.842937707901001, "num_tokens": 598081659.0, "step": 15678 }, { "epoch": 1.9945299580206082, "ewc_loss": 0.06417536735534668, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003121638437733054, "grad_norm": 7.648865699768066, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8591088056564331, "num_tokens": 598120203.0, "step": 15679 }, { "epoch": 1.9946571682991987, "ewc_loss": 0.06439097970724106, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003143199428450316, "grad_norm": 7.781462669372559, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8510074019432068, "num_tokens": 598154779.0, "step": 15680 }, { "epoch": 1.9947843785777892, "ewc_loss": 0.06422895193099976, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003126996452920139, "grad_norm": 7.626204013824463, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8647956848144531, "num_tokens": 598197075.0, "step": 15681 }, { "epoch": 1.9949115888563798, "ewc_loss": 0.0645379051566124, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031578922062180936, "grad_norm": 7.7611188888549805, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8559484481811523, "num_tokens": 598240539.0, "step": 15682 }, { "epoch": 1.99503879913497, "ewc_loss": 0.06440334767103195, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003120022010989487, "grad_norm": 7.64140510559082, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8654721975326538, "num_tokens": 598281776.0, "step": 15683 }, { "epoch": 1.9951660094135606, "ewc_loss": 0.06463688611984253, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003167790418956429, "grad_norm": 7.759712219238281, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8717718124389648, "num_tokens": 598322715.0, "step": 15684 }, { "epoch": 1.9952932196921511, "ewc_loss": 0.06420272588729858, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003124374197795987, "grad_norm": 7.638960838317871, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8582067489624023, "num_tokens": 598364495.0, "step": 15685 }, { "epoch": 1.9954204299707416, "ewc_loss": 0.06484049558639526, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031637371284887195, "grad_norm": 7.772728443145752, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8611353039741516, "num_tokens": 598407102.0, "step": 15686 }, { "epoch": 1.9955476402493322, "ewc_loss": 0.06429298222064972, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.000313339929562062, "grad_norm": 7.678174018859863, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8500291705131531, "num_tokens": 598448600.0, "step": 15687 }, { "epoch": 1.9956748505279227, "ewc_loss": 0.0646250993013382, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003166611713822931, "grad_norm": 7.764301776885986, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8545872569084167, "num_tokens": 598491140.0, "step": 15688 }, { "epoch": 1.995802060806513, "ewc_loss": 0.06425732374191284, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003129834367427975, "grad_norm": 7.69489049911499, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8600644469261169, "num_tokens": 598527831.0, "step": 15689 }, { "epoch": 1.9959292710851035, "ewc_loss": 0.06452737748622894, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031568395206704736, "grad_norm": 7.778135299682617, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8512387871742249, "num_tokens": 598568680.0, "step": 15690 }, { "epoch": 1.996056481363694, "ewc_loss": 0.06434542685747147, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003138644096907228, "grad_norm": 7.712625503540039, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8526888489723206, "num_tokens": 598606254.0, "step": 15691 }, { "epoch": 1.9961836916422846, "ewc_loss": 0.06448112428188324, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031522143399342895, "grad_norm": 7.830924987792969, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8658608198165894, "num_tokens": 598642400.0, "step": 15692 }, { "epoch": 1.9963109019208751, "ewc_loss": 0.06416060030460358, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003120162000413984, "grad_norm": 7.675131797790527, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.842115044593811, "num_tokens": 598678379.0, "step": 15693 }, { "epoch": 1.9964381121994657, "ewc_loss": 0.0645422637462616, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.00031583281815983355, "grad_norm": 7.773298740386963, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.862712025642395, "num_tokens": 598718234.0, "step": 15694 }, { "epoch": 1.9965653224780562, "ewc_loss": 0.06416294723749161, "ewc_loss_diag": 3.2901763916015625e-05, "ewc_loss_parallel": 0.0003120396286249161, "grad_norm": 7.673656463623047, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8680034875869751, "num_tokens": 598754897.0, "step": 15695 }, { "epoch": 1.9966925327566467, "ewc_loss": 0.0647900402545929, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031586919794790447, "grad_norm": 7.8211164474487305, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8581659197807312, "num_tokens": 598785268.0, "step": 15696 }, { "epoch": 1.9968197430352372, "ewc_loss": 0.06443086266517639, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003122774069197476, "grad_norm": 7.646030426025391, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8692069053649902, "num_tokens": 598827131.0, "step": 15697 }, { "epoch": 1.9969469533138278, "ewc_loss": 0.0648568868637085, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031653762562200427, "grad_norm": 7.844440937042236, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8646170496940613, "num_tokens": 598859507.0, "step": 15698 }, { "epoch": 1.9970741635924183, "ewc_loss": 0.06439428776502609, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031191162997856736, "grad_norm": 7.674957275390625, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8551139831542969, "num_tokens": 598893280.0, "step": 15699 }, { "epoch": 1.9972013738710088, "ewc_loss": 0.06473366916179657, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031530545675195754, "grad_norm": 7.733999252319336, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.877458930015564, "num_tokens": 598932759.0, "step": 15700 }, { "epoch": 1.9973285841495994, "ewc_loss": 0.06447988003492355, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031276754452846944, "grad_norm": 7.704586505889893, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8737075924873352, "num_tokens": 598972899.0, "step": 15701 }, { "epoch": 1.9974557944281899, "ewc_loss": 0.06464087218046188, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003143774811178446, "grad_norm": 7.719819068908691, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8657065629959106, "num_tokens": 599012811.0, "step": 15702 }, { "epoch": 1.9975830047067804, "ewc_loss": 0.06458204239606857, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031378917628899217, "grad_norm": 7.735429286956787, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.855199933052063, "num_tokens": 599055632.0, "step": 15703 }, { "epoch": 1.997710214985371, "ewc_loss": 0.0646282210946083, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031425096676684916, "grad_norm": 7.753391742706299, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8464146256446838, "num_tokens": 599090924.0, "step": 15704 }, { "epoch": 1.9978374252639615, "ewc_loss": 0.06487038731575012, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.00031423120526596904, "grad_norm": 7.691542625427246, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8779229521751404, "num_tokens": 599130434.0, "step": 15705 }, { "epoch": 1.997964635542552, "ewc_loss": 0.06481478363275528, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.00031367517658509314, "grad_norm": 7.699324607849121, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8627508878707886, "num_tokens": 599171018.0, "step": 15706 }, { "epoch": 1.9980918458211423, "ewc_loss": 0.06463181972503662, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031428696820512414, "grad_norm": 7.778489589691162, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8559268712997437, "num_tokens": 599212445.0, "step": 15707 }, { "epoch": 1.9982190560997328, "ewc_loss": 0.06451047956943512, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003130735130980611, "grad_norm": 7.704432964324951, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8674305081367493, "num_tokens": 599250591.0, "step": 15708 }, { "epoch": 1.9983462663783234, "ewc_loss": 0.06469052284955978, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031487399246543646, "grad_norm": 7.832070350646973, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8598779439926147, "num_tokens": 599284987.0, "step": 15709 }, { "epoch": 1.9984734766569139, "ewc_loss": 0.06472964584827423, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003128238022327423, "grad_norm": 7.739401340484619, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.859678328037262, "num_tokens": 599324656.0, "step": 15710 }, { "epoch": 1.9986006869355044, "ewc_loss": 0.06461609154939651, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003141296620015055, "grad_norm": 7.729491710662842, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8744762539863586, "num_tokens": 599364014.0, "step": 15711 }, { "epoch": 1.998727897214095, "ewc_loss": 0.06453514844179153, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003133202262688428, "grad_norm": 7.842795372009277, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8568563461303711, "num_tokens": 599396781.0, "step": 15712 }, { "epoch": 1.9988551074926852, "ewc_loss": 0.06447891145944595, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031275785295292735, "grad_norm": 7.7286906242370605, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8741706013679504, "num_tokens": 599433240.0, "step": 15713 }, { "epoch": 1.9989823177712758, "ewc_loss": 0.06455004215240479, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031346912146545947, "grad_norm": 7.712469577789307, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8717117309570312, "num_tokens": 599474883.0, "step": 15714 }, { "epoch": 1.9991095280498663, "ewc_loss": 0.0645240843296051, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031320960260927677, "grad_norm": 7.780529499053955, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.865738570690155, "num_tokens": 599508492.0, "step": 15715 }, { "epoch": 1.9992367383284568, "ewc_loss": 0.0644955113530159, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031292386120185256, "grad_norm": 7.771520614624023, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8664262890815735, "num_tokens": 599545946.0, "step": 15716 }, { "epoch": 1.9993639486070474, "ewc_loss": 0.06441536545753479, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031212237081490457, "grad_norm": 7.614823818206787, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8769944310188293, "num_tokens": 599586217.0, "step": 15717 }, { "epoch": 1.9994911588856379, "ewc_loss": 0.06471848487854004, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003151535929646343, "grad_norm": 7.7993950843811035, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8731526732444763, "num_tokens": 599620431.0, "step": 15718 }, { "epoch": 1.9996183691642284, "ewc_loss": 0.06437376886606216, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003117064479738474, "grad_norm": 7.687308311462402, "learning_rate": 1e-06, "loss": 0.5118, "mean_token_accuracy": 0.8491688966751099, "num_tokens": 599659959.0, "step": 15719 }, { "epoch": 1.999745579442819, "ewc_loss": 0.06472896039485931, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.000315258395858109, "grad_norm": 7.750012397766113, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8614015579223633, "num_tokens": 599696558.0, "step": 15720 }, { "epoch": 1.9998727897214095, "ewc_loss": 0.06439127027988434, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031188142020255327, "grad_norm": 7.731957912445068, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8525295853614807, "num_tokens": 599734925.0, "step": 15721 }, { "epoch": 2.0, "ewc_loss": 0.06469601392745972, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031492894049733877, "grad_norm": 7.76314640045166, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8680512309074402, "num_tokens": 599772613.0, "step": 15722 }, { "epoch": 2.0001272102785905, "ewc_loss": 0.06452742218971252, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003132430138066411, "grad_norm": 7.646549701690674, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8678981065750122, "num_tokens": 599813892.0, "step": 15723 }, { "epoch": 2.000254420557181, "ewc_loss": 0.06471157819032669, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031508452957496047, "grad_norm": 7.83230447769165, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.866028904914856, "num_tokens": 599852807.0, "step": 15724 }, { "epoch": 2.0003816308357716, "ewc_loss": 0.06447690725326538, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003127377713099122, "grad_norm": 7.681462287902832, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8720214366912842, "num_tokens": 599893712.0, "step": 15725 }, { "epoch": 2.000508841114362, "ewc_loss": 0.06475095450878143, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031547824619337916, "grad_norm": 7.785099983215332, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8670797348022461, "num_tokens": 599929954.0, "step": 15726 }, { "epoch": 2.0006360513929526, "ewc_loss": 0.06447497010231018, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031271850457414985, "grad_norm": 7.710317611694336, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8640213012695312, "num_tokens": 599967763.0, "step": 15727 }, { "epoch": 2.000763261671543, "ewc_loss": 0.06475363671779633, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031550507992506027, "grad_norm": 7.769542217254639, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8742069602012634, "num_tokens": 600008938.0, "step": 15728 }, { "epoch": 2.0008904719501337, "ewc_loss": 0.06453676521778107, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003133364371024072, "grad_norm": 10.544320106506348, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8618618249893188, "num_tokens": 600051849.0, "step": 15729 }, { "epoch": 2.0010176822287242, "ewc_loss": 0.06631413102149963, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003311100590508431, "grad_norm": 7.807872772216797, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8726938366889954, "num_tokens": 600088568.0, "step": 15730 }, { "epoch": 2.0011448925073148, "ewc_loss": 0.0661812275648117, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.00032733965781517327, "grad_norm": 8.144436836242676, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8697777390480042, "num_tokens": 600131553.0, "step": 15731 }, { "epoch": 2.0012721027859053, "ewc_loss": 0.06433147192001343, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003112834529019892, "grad_norm": 7.624927520751953, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8662812113761902, "num_tokens": 600173198.0, "step": 15732 }, { "epoch": 2.001399313064496, "ewc_loss": 0.06627893447875977, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003307581355329603, "grad_norm": 8.168006896972656, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8701677322387695, "num_tokens": 600211347.0, "step": 15733 }, { "epoch": 2.0015265233430863, "ewc_loss": 0.06454409658908844, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031340974965132773, "grad_norm": 7.737121105194092, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8593673706054688, "num_tokens": 600249953.0, "step": 15734 }, { "epoch": 2.0016537336216764, "ewc_loss": 0.06567076593637466, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003246764244977385, "grad_norm": 7.985464096069336, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8579117059707642, "num_tokens": 600291211.0, "step": 15735 }, { "epoch": 2.001780943900267, "ewc_loss": 0.06463306397199631, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031429939554072917, "grad_norm": 7.794005393981934, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8699634671211243, "num_tokens": 600332906.0, "step": 15736 }, { "epoch": 2.0019081541788575, "ewc_loss": 0.06517845392227173, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031975333695299923, "grad_norm": 7.902771949768066, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8667387366294861, "num_tokens": 600370570.0, "step": 15737 }, { "epoch": 2.002035364457448, "ewc_loss": 0.0647810846567154, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031577955815009773, "grad_norm": 7.819403171539307, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8570284843444824, "num_tokens": 600409368.0, "step": 15738 }, { "epoch": 2.0021625747360385, "ewc_loss": 0.06481873989105225, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003161561908200383, "grad_norm": 7.8264546394348145, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8699088096618652, "num_tokens": 600442019.0, "step": 15739 }, { "epoch": 2.002289785014629, "ewc_loss": 0.06479331851005554, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031590191065333784, "grad_norm": 7.969903945922852, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8737727403640747, "num_tokens": 600471665.0, "step": 15740 }, { "epoch": 2.0024169952932196, "ewc_loss": 0.06441409140825272, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031210968154482543, "grad_norm": 7.7400665283203125, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8631407022476196, "num_tokens": 600508804.0, "step": 15741 }, { "epoch": 2.00254420557181, "ewc_loss": 0.06485702097415924, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003165389643982053, "grad_norm": 7.878146648406982, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8542866706848145, "num_tokens": 600543895.0, "step": 15742 }, { "epoch": 2.0026714158504006, "ewc_loss": 0.06432792544364929, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003112480044364929, "grad_norm": 10.57528018951416, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8680546283721924, "num_tokens": 600585818.0, "step": 15743 }, { "epoch": 2.002798626128991, "ewc_loss": 0.06630436331033707, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003310123865958303, "grad_norm": 7.82097864151001, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8688997030258179, "num_tokens": 600623106.0, "step": 15744 }, { "epoch": 2.0029258364075817, "ewc_loss": 0.06562311947345734, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00032419999479316175, "grad_norm": 8.054177284240723, "learning_rate": 1e-06, "loss": 0.3832, "mean_token_accuracy": 0.8875719308853149, "num_tokens": 600662662.0, "step": 15745 }, { "epoch": 2.0030530466861722, "ewc_loss": 0.06444871425628662, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003124559298157692, "grad_norm": 7.825053691864014, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.853232204914093, "num_tokens": 600699416.0, "step": 15746 }, { "epoch": 2.0031802569647628, "ewc_loss": 0.06553782522678375, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00032334699062630534, "grad_norm": 7.958627223968506, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8597277402877808, "num_tokens": 600739351.0, "step": 15747 }, { "epoch": 2.0033074672433533, "ewc_loss": 0.06465970724821091, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031456584110856056, "grad_norm": 7.855745792388916, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8681275844573975, "num_tokens": 600781198.0, "step": 15748 }, { "epoch": 2.003434677521944, "ewc_loss": 0.06480782479047775, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031604699324816465, "grad_norm": 7.826470375061035, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8635028600692749, "num_tokens": 600821743.0, "step": 15749 }, { "epoch": 2.0035618878005343, "ewc_loss": 0.06473641097545624, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031533281435258687, "grad_norm": 7.809650897979736, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8800891041755676, "num_tokens": 600862463.0, "step": 15750 }, { "epoch": 2.003689098079125, "ewc_loss": 0.06468042731285095, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031477297306992114, "grad_norm": 7.793133735656738, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.856359601020813, "num_tokens": 600902629.0, "step": 15751 }, { "epoch": 2.0038163083577154, "ewc_loss": 0.06479576230049133, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031592632876709104, "grad_norm": 7.7755446434021, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8634986877441406, "num_tokens": 600941778.0, "step": 15752 }, { "epoch": 2.003943518636306, "ewc_loss": 0.0645749643445015, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003137183957733214, "grad_norm": 7.783166408538818, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8742112517356873, "num_tokens": 600979425.0, "step": 15753 }, { "epoch": 2.0040707289148965, "ewc_loss": 0.06481948494911194, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003161636122968048, "grad_norm": 7.881781578063965, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8609458208084106, "num_tokens": 601017882.0, "step": 15754 }, { "epoch": 2.004197939193487, "ewc_loss": 0.06452929973602295, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003132617275696248, "grad_norm": 7.752193927764893, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8608125448226929, "num_tokens": 601053394.0, "step": 15755 }, { "epoch": 2.0043251494720775, "ewc_loss": 0.06462937593460083, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031426255009137094, "grad_norm": 7.713622570037842, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8693351149559021, "num_tokens": 601094940.0, "step": 15756 }, { "epoch": 2.004452359750668, "ewc_loss": 0.06463071703910828, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003142759669572115, "grad_norm": 7.740135669708252, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8669226765632629, "num_tokens": 601132963.0, "step": 15757 }, { "epoch": 2.0045795700292586, "ewc_loss": 0.0647081583738327, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031505036167800426, "grad_norm": 7.774460315704346, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8633636236190796, "num_tokens": 601168511.0, "step": 15758 }, { "epoch": 2.0047067803078487, "ewc_loss": 0.064878448843956, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003143118228763342, "grad_norm": 7.7500739097595215, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8673002123832703, "num_tokens": 601208922.0, "step": 15759 }, { "epoch": 2.004833990586439, "ewc_loss": 0.06463342905044556, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031430303351953626, "grad_norm": 7.714354038238525, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8604063987731934, "num_tokens": 601253814.0, "step": 15760 }, { "epoch": 2.0049612008650297, "ewc_loss": 0.06473404169082642, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031530921114608645, "grad_norm": 7.68705940246582, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8676129579544067, "num_tokens": 601295584.0, "step": 15761 }, { "epoch": 2.0050884111436202, "ewc_loss": 0.06471675634384155, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031513627618551254, "grad_norm": 7.818583965301514, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8578511476516724, "num_tokens": 601329134.0, "step": 15762 }, { "epoch": 2.0052156214222108, "ewc_loss": 0.06449666619300842, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003129354154225439, "grad_norm": 7.693593978881836, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8743353486061096, "num_tokens": 601369100.0, "step": 15763 }, { "epoch": 2.0053428317008013, "ewc_loss": 0.06486326456069946, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031660142121836543, "grad_norm": 7.742237091064453, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8801140785217285, "num_tokens": 601407876.0, "step": 15764 }, { "epoch": 2.005470041979392, "ewc_loss": 0.06473599374294281, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003153287398163229, "grad_norm": 7.715371608734131, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.87941575050354, "num_tokens": 601446996.0, "step": 15765 }, { "epoch": 2.0055972522579824, "ewc_loss": 0.06487254798412323, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003166942624375224, "grad_norm": 7.801795959472656, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8578921556472778, "num_tokens": 601482158.0, "step": 15766 }, { "epoch": 2.005724462536573, "ewc_loss": 0.06616196036338806, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003149399417452514, "grad_norm": 35.850669860839844, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8732824325561523, "num_tokens": 601517940.0, "step": 15767 }, { "epoch": 2.0058516728151634, "ewc_loss": 0.0972948968410492, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0006384763401001692, "grad_norm": 11.695784568786621, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8871712684631348, "num_tokens": 601553823.0, "step": 15768 }, { "epoch": 2.005978883093754, "ewc_loss": 0.06252068281173706, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0002907341404352337, "grad_norm": 6.397303581237793, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8546965718269348, "num_tokens": 601595482.0, "step": 15769 }, { "epoch": 2.0061060933723445, "ewc_loss": 0.08336815237998962, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0005016502691432834, "grad_norm": 10.852136611938477, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8736583590507507, "num_tokens": 601632451.0, "step": 15770 }, { "epoch": 2.006233303650935, "ewc_loss": 0.08539002388715744, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0005194275872781873, "grad_norm": 10.443733215332031, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.863657534122467, "num_tokens": 601667573.0, "step": 15771 }, { "epoch": 2.0063605139295255, "ewc_loss": 0.07121589034795761, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00038012766162864864, "grad_norm": 7.958497047424316, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8738212585449219, "num_tokens": 601704494.0, "step": 15772 }, { "epoch": 2.006487724208116, "ewc_loss": 0.07338638603687286, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003993911959696561, "grad_norm": 9.22846508026123, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.8562759160995483, "num_tokens": 601745307.0, "step": 15773 }, { "epoch": 2.0066149344867066, "ewc_loss": 0.07387011498212814, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00040666988934390247, "grad_norm": 8.500571250915527, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8813078999519348, "num_tokens": 601782551.0, "step": 15774 }, { "epoch": 2.006742144765297, "ewc_loss": 0.07009537518024445, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003664810792542994, "grad_norm": 8.396042823791504, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8679801821708679, "num_tokens": 601823378.0, "step": 15775 }, { "epoch": 2.0068693550438876, "ewc_loss": 0.07031027972698212, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003686301060952246, "grad_norm": 8.3560209274292, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8671364784240723, "num_tokens": 601860881.0, "step": 15776 }, { "epoch": 2.006996565322478, "ewc_loss": 0.06896135210990906, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003551409172359854, "grad_norm": 8.113859176635742, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.864535927772522, "num_tokens": 601903502.0, "step": 15777 }, { "epoch": 2.0071237756010687, "ewc_loss": 0.06859289109706879, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.00035145628498867154, "grad_norm": 8.226393699645996, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8846518993377686, "num_tokens": 601937097.0, "step": 15778 }, { "epoch": 2.007250985879659, "ewc_loss": 0.06787736713886261, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.00034430099185556173, "grad_norm": 7.9959187507629395, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8704152703285217, "num_tokens": 601978050.0, "step": 15779 }, { "epoch": 2.0073781961582498, "ewc_loss": 0.06755618005990982, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003410891513340175, "grad_norm": 8.155585289001465, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8655666708946228, "num_tokens": 602018167.0, "step": 15780 }, { "epoch": 2.0075054064368403, "ewc_loss": 0.0669337585568428, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.00033486494794487953, "grad_norm": 7.956119060516357, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8534351587295532, "num_tokens": 602054485.0, "step": 15781 }, { "epoch": 2.007632616715431, "ewc_loss": 0.06655513495206833, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003335200890433043, "grad_norm": 7.980958938598633, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8723312616348267, "num_tokens": 602092881.0, "step": 15782 }, { "epoch": 2.0077598269940213, "ewc_loss": 0.06607633084058762, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003287320723757148, "grad_norm": 7.961907863616943, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8677511811256409, "num_tokens": 602127424.0, "step": 15783 }, { "epoch": 2.0078870372726114, "ewc_loss": 0.0659489706158638, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00032745845965109766, "grad_norm": 7.8563151359558105, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8729647994041443, "num_tokens": 602168340.0, "step": 15784 }, { "epoch": 2.008014247551202, "ewc_loss": 0.06559743732213974, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003239431243855506, "grad_norm": 7.8800530433654785, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8578807711601257, "num_tokens": 602207894.0, "step": 15785 }, { "epoch": 2.0081414578297925, "ewc_loss": 0.06554378569126129, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00032340665347874165, "grad_norm": 7.828364849090576, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8689132928848267, "num_tokens": 602245924.0, "step": 15786 }, { "epoch": 2.008268668108383, "ewc_loss": 0.06543926149606705, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003223613603040576, "grad_norm": 7.816274642944336, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.865788459777832, "num_tokens": 602278417.0, "step": 15787 }, { "epoch": 2.0083958783869735, "ewc_loss": 0.06529852747917175, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003209540154784918, "grad_norm": 7.790579319000244, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8694455027580261, "num_tokens": 602318431.0, "step": 15788 }, { "epoch": 2.008523088665564, "ewc_loss": 0.06525087356567383, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00032047752756625414, "grad_norm": 7.861636161804199, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8575348258018494, "num_tokens": 602356315.0, "step": 15789 }, { "epoch": 2.0086502989441546, "ewc_loss": 0.06520220637321472, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003199908242095262, "grad_norm": 7.780029296875, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8812979459762573, "num_tokens": 602398865.0, "step": 15790 }, { "epoch": 2.008777509222745, "ewc_loss": 0.06530971080064774, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.00031862445757724345, "grad_norm": 7.7522783279418945, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8768763542175293, "num_tokens": 602435835.0, "step": 15791 }, { "epoch": 2.0089047195013356, "ewc_loss": 0.06505465507507324, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031851534731686115, "grad_norm": 7.77077579498291, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8740023970603943, "num_tokens": 602470359.0, "step": 15792 }, { "epoch": 2.009031929779926, "ewc_loss": 0.06513553857803345, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031932408455759287, "grad_norm": 7.833530426025391, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8697254657745361, "num_tokens": 602512132.0, "step": 15793 }, { "epoch": 2.0091591400585167, "ewc_loss": 0.06493442505598068, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031731300987303257, "grad_norm": 7.748427391052246, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8640424013137817, "num_tokens": 602552700.0, "step": 15794 }, { "epoch": 2.0092863503371072, "ewc_loss": 0.06508581340312958, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031882687471807003, "grad_norm": 7.783748149871826, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.866508960723877, "num_tokens": 602587779.0, "step": 15795 }, { "epoch": 2.0094135606156978, "ewc_loss": 0.06500019878149033, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003179707273375243, "grad_norm": 7.842658042907715, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8572226762771606, "num_tokens": 602623834.0, "step": 15796 }, { "epoch": 2.0095407708942883, "ewc_loss": 0.0649455338716507, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003174241282977164, "grad_norm": 7.728507995605469, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8791751861572266, "num_tokens": 602660141.0, "step": 15797 }, { "epoch": 2.009667981172879, "ewc_loss": 0.06526533514261246, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.00031818068237043917, "grad_norm": 7.758875370025635, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8721877336502075, "num_tokens": 602698990.0, "step": 15798 }, { "epoch": 2.0097951914514693, "ewc_loss": 0.06497019529342651, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031767066684551537, "grad_norm": 7.844491481781006, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.847503662109375, "num_tokens": 602732053.0, "step": 15799 }, { "epoch": 2.00992240173006, "ewc_loss": 0.06479077786207199, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031587653211317956, "grad_norm": 7.686944484710693, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8672275543212891, "num_tokens": 602770975.0, "step": 15800 }, { "epoch": 2.0100496120086504, "ewc_loss": 0.06500154733657837, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003179842315148562, "grad_norm": 7.738243103027344, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8592451810836792, "num_tokens": 602813207.0, "step": 15801 }, { "epoch": 2.010176822287241, "ewc_loss": 0.0648300051689148, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003162687935400754, "grad_norm": 7.766558647155762, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8715057373046875, "num_tokens": 602849951.0, "step": 15802 }, { "epoch": 2.0103040325658315, "ewc_loss": 0.0648408830165863, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.00031637761276215315, "grad_norm": 7.702930927276611, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.857906699180603, "num_tokens": 602891418.0, "step": 15803 }, { "epoch": 2.010431242844422, "ewc_loss": 0.06509813666343689, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003165086964145303, "grad_norm": 7.760757923126221, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8733112215995789, "num_tokens": 602933039.0, "step": 15804 }, { "epoch": 2.0105584531230125, "ewc_loss": 0.06478974223136902, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003158661420457065, "grad_norm": 7.736395835876465, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8822683095932007, "num_tokens": 602965546.0, "step": 15805 }, { "epoch": 2.010685663401603, "ewc_loss": 0.06487586349248886, "ewc_loss_diag": 3.314018249511719e-05, "ewc_loss_parallel": 0.0003167273825965822, "grad_norm": 7.731544017791748, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8668805956840515, "num_tokens": 603004473.0, "step": 15806 }, { "epoch": 2.0108128736801936, "ewc_loss": 0.06498312950134277, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.00031535860034637153, "grad_norm": 7.700674057006836, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8588789701461792, "num_tokens": 603041644.0, "step": 15807 }, { "epoch": 2.0109400839587837, "ewc_loss": 0.06511621177196503, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003166894312016666, "grad_norm": 7.768098831176758, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8536680936813354, "num_tokens": 603084698.0, "step": 15808 }, { "epoch": 2.011067294237374, "ewc_loss": 0.06504124402999878, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.00031593977473676205, "grad_norm": 7.794949531555176, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8506155014038086, "num_tokens": 603121059.0, "step": 15809 }, { "epoch": 2.0111945045159647, "ewc_loss": 0.06497733294963837, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003153006255161017, "grad_norm": 7.722662448883057, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8652873039245605, "num_tokens": 603156604.0, "step": 15810 }, { "epoch": 2.0113217147945552, "ewc_loss": 0.06511528044939041, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003166801470797509, "grad_norm": 7.822461128234863, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.867684006690979, "num_tokens": 603183743.0, "step": 15811 }, { "epoch": 2.0114489250731458, "ewc_loss": 0.06486402451992035, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003141675842925906, "grad_norm": 7.709335803985596, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8453145623207092, "num_tokens": 603222099.0, "step": 15812 }, { "epoch": 2.0115761353517363, "ewc_loss": 0.06502427160739899, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003157700411975384, "grad_norm": 7.703455924987793, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8464657068252563, "num_tokens": 603263467.0, "step": 15813 }, { "epoch": 2.011703345630327, "ewc_loss": 0.06500346958637238, "ewc_loss_diag": 3.337860107421875e-05, "ewc_loss_parallel": 0.0003155620361212641, "grad_norm": 7.844829082489014, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8468296527862549, "num_tokens": 603296726.0, "step": 15814 }, { "epoch": 2.0118305559089174, "ewc_loss": 0.06506506353616714, "ewc_loss_diag": 3.361701965332031e-05, "ewc_loss_parallel": 0.00031373658566735685, "grad_norm": 7.717091083526611, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8726948499679565, "num_tokens": 603329691.0, "step": 15815 }, { "epoch": 2.011957766187508, "ewc_loss": 0.06539996713399887, "ewc_loss_diag": 3.361701965332031e-05, "ewc_loss_parallel": 0.0003170856216456741, "grad_norm": 7.711606979370117, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8744372129440308, "num_tokens": 603371619.0, "step": 15816 }, { "epoch": 2.0120849764660984, "ewc_loss": 0.06523854285478592, "ewc_loss_diag": 3.361701965332031e-05, "ewc_loss_parallel": 0.0003154713776893914, "grad_norm": 7.831676483154297, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8697941303253174, "num_tokens": 603400018.0, "step": 15817 }, { "epoch": 2.012212186744689, "ewc_loss": 0.0656355619430542, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003145587688777596, "grad_norm": 7.6867876052856445, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8692184686660767, "num_tokens": 603436971.0, "step": 15818 }, { "epoch": 2.0123393970232795, "ewc_loss": 0.06534793972969055, "ewc_loss_diag": 3.361701965332031e-05, "ewc_loss_parallel": 0.0003165653033647686, "grad_norm": 7.797958850860596, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8665527701377869, "num_tokens": 603472802.0, "step": 15819 }, { "epoch": 2.01246660730187, "ewc_loss": 0.06509776413440704, "ewc_loss_diag": 3.361701965332031e-05, "ewc_loss_parallel": 0.00031406356720253825, "grad_norm": 7.698652267456055, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8717945218086243, "num_tokens": 603506547.0, "step": 15820 }, { "epoch": 2.0125938175804605, "ewc_loss": 0.0652991384267807, "ewc_loss_diag": 3.361701965332031e-05, "ewc_loss_parallel": 0.00031607731943950057, "grad_norm": 7.755074501037598, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8497836589813232, "num_tokens": 603544967.0, "step": 15821 }, { "epoch": 2.012721027859051, "ewc_loss": 0.06542254984378815, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003148700052406639, "grad_norm": 7.738619804382324, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8612498044967651, "num_tokens": 603582522.0, "step": 15822 }, { "epoch": 2.0128482381376416, "ewc_loss": 0.06540847569704056, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003147292882204056, "grad_norm": 7.728274822235107, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.87654048204422, "num_tokens": 603619584.0, "step": 15823 }, { "epoch": 2.012975448416232, "ewc_loss": 0.0654197484254837, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003148420655634254, "grad_norm": 7.704880237579346, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8538304567337036, "num_tokens": 603660542.0, "step": 15824 }, { "epoch": 2.0131026586948226, "ewc_loss": 0.06543932110071182, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003150377597194165, "grad_norm": 7.71505069732666, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8682901263237, "num_tokens": 603698900.0, "step": 15825 }, { "epoch": 2.013229868973413, "ewc_loss": 0.0653756633400917, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031440117163583636, "grad_norm": 7.721930980682373, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8470635414123535, "num_tokens": 603733019.0, "step": 15826 }, { "epoch": 2.0133570792520037, "ewc_loss": 0.06546778976917267, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031532245338894427, "grad_norm": 7.6873860359191895, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8742232322692871, "num_tokens": 603772071.0, "step": 15827 }, { "epoch": 2.013484289530594, "ewc_loss": 0.06549983471632004, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003156428865622729, "grad_norm": 7.727595329284668, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8597893118858337, "num_tokens": 603811008.0, "step": 15828 }, { "epoch": 2.0136114998091847, "ewc_loss": 0.06539839506149292, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.000314628443447873, "grad_norm": 10.582886695861816, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8560682535171509, "num_tokens": 603852647.0, "step": 15829 }, { "epoch": 2.0137387100877753, "ewc_loss": 0.06747105717658997, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003353550855536014, "grad_norm": 7.807596206665039, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8662853240966797, "num_tokens": 603894198.0, "step": 15830 }, { "epoch": 2.013865920366366, "ewc_loss": 0.06651949882507324, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00032583947177045047, "grad_norm": 8.053224563598633, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8607890605926514, "num_tokens": 603930168.0, "step": 15831 }, { "epoch": 2.0139931306449563, "ewc_loss": 0.06536372005939484, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031428170041181147, "grad_norm": 7.672882080078125, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8771847486495972, "num_tokens": 603966189.0, "step": 15832 }, { "epoch": 2.0141203409235464, "ewc_loss": 0.0668046697974205, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003286912105977535, "grad_norm": 8.041040420532227, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8629379272460938, "num_tokens": 604011110.0, "step": 15833 }, { "epoch": 2.014247551202137, "ewc_loss": 0.0654468983411789, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031511354609392583, "grad_norm": 7.977631092071533, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8497132062911987, "num_tokens": 604044256.0, "step": 15834 }, { "epoch": 2.0143747614807275, "ewc_loss": 0.06577649712562561, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031840946758165956, "grad_norm": 7.839156627655029, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8755519390106201, "num_tokens": 604078652.0, "step": 15835 }, { "epoch": 2.014501971759318, "ewc_loss": 0.06549347937107086, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031557929469272494, "grad_norm": 7.804403781890869, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8753756284713745, "num_tokens": 604112302.0, "step": 15836 }, { "epoch": 2.0146291820379085, "ewc_loss": 0.06546226143836975, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031526709790341556, "grad_norm": 8.405969619750977, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8832183480262756, "num_tokens": 604150011.0, "step": 15837 }, { "epoch": 2.014756392316499, "ewc_loss": 0.06503757834434509, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031102029606699944, "grad_norm": 7.593918323516846, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8668789863586426, "num_tokens": 604188859.0, "step": 15838 }, { "epoch": 2.0148836025950896, "ewc_loss": 0.06588411331176758, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003194856981281191, "grad_norm": 7.894379615783691, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8624225854873657, "num_tokens": 604232273.0, "step": 15839 }, { "epoch": 2.01501081287368, "ewc_loss": 0.06496932357549667, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003103377821389586, "grad_norm": 7.611029148101807, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8740907907485962, "num_tokens": 604265236.0, "step": 15840 }, { "epoch": 2.0151380231522706, "ewc_loss": 0.06585802882909775, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031922481139190495, "grad_norm": 7.8498358726501465, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8806250691413879, "num_tokens": 604304784.0, "step": 15841 }, { "epoch": 2.015265233430861, "ewc_loss": 0.06516432017087936, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003122877096757293, "grad_norm": 7.656620025634766, "learning_rate": 1e-06, "loss": 0.3725, "mean_token_accuracy": 0.8921089172363281, "num_tokens": 604341461.0, "step": 15842 }, { "epoch": 2.0153924437094517, "ewc_loss": 0.06594084203243256, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031761149875819683, "grad_norm": 7.859342098236084, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8725540041923523, "num_tokens": 604378628.0, "step": 15843 }, { "epoch": 2.0155196539880422, "ewc_loss": 0.06520132720470428, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031265776487998664, "grad_norm": 7.653894901275635, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8760641813278198, "num_tokens": 604418070.0, "step": 15844 }, { "epoch": 2.0156468642666328, "ewc_loss": 0.06571248173713684, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031776935793459415, "grad_norm": 7.854025363922119, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8785034418106079, "num_tokens": 604454445.0, "step": 15845 }, { "epoch": 2.0157740745452233, "ewc_loss": 0.06555067747831345, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.000313709897454828, "grad_norm": 8.397798538208008, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8701971769332886, "num_tokens": 604488559.0, "step": 15846 }, { "epoch": 2.015901284823814, "ewc_loss": 0.06487231701612473, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003093677223660052, "grad_norm": 7.638969421386719, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8585591316223145, "num_tokens": 604522538.0, "step": 15847 }, { "epoch": 2.0160284951024043, "ewc_loss": 0.06604884564876556, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031869160011410713, "grad_norm": 7.891593933105469, "learning_rate": 1e-06, "loss": 0.3807, "mean_token_accuracy": 0.8853492736816406, "num_tokens": 604561750.0, "step": 15848 }, { "epoch": 2.016155705380995, "ewc_loss": 0.06475115567445755, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003081561008002609, "grad_norm": 7.607439994812012, "learning_rate": 1e-06, "loss": 0.4131, "mean_token_accuracy": 0.8801267743110657, "num_tokens": 604594961.0, "step": 15849 }, { "epoch": 2.0162829156595854, "ewc_loss": 0.06575571745634079, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003182017244398594, "grad_norm": 7.884449481964111, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8573795557022095, "num_tokens": 604631301.0, "step": 15850 }, { "epoch": 2.016410125938176, "ewc_loss": 0.06503794342279434, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.000311023963149637, "grad_norm": 7.69927978515625, "learning_rate": 1e-06, "loss": 0.5112, "mean_token_accuracy": 0.8469347953796387, "num_tokens": 604666049.0, "step": 15851 }, { "epoch": 2.0165373362167665, "ewc_loss": 0.06559044122695923, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003165489761158824, "grad_norm": 7.7999701499938965, "learning_rate": 1e-06, "loss": 0.3907, "mean_token_accuracy": 0.887661337852478, "num_tokens": 604701984.0, "step": 15852 }, { "epoch": 2.016664546495357, "ewc_loss": 0.06520449370145798, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.000312689458951354, "grad_norm": 7.697449207305908, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8786793351173401, "num_tokens": 604738414.0, "step": 15853 }, { "epoch": 2.0167917567739475, "ewc_loss": 0.06550929695367813, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031573750311508775, "grad_norm": 7.8010172843933105, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8700687885284424, "num_tokens": 604779961.0, "step": 15854 }, { "epoch": 2.016918967052538, "ewc_loss": 0.06526948511600494, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031333937658928335, "grad_norm": 7.736207485198975, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8605040311813354, "num_tokens": 604819729.0, "step": 15855 }, { "epoch": 2.0170461773311286, "ewc_loss": 0.06553466618061066, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031599123030900955, "grad_norm": 7.781556129455566, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8778769969940186, "num_tokens": 604864108.0, "step": 15856 }, { "epoch": 2.0171733876097186, "ewc_loss": 0.06535342335700989, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031417873105965555, "grad_norm": 7.736639976501465, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8610765337944031, "num_tokens": 604905582.0, "step": 15857 }, { "epoch": 2.017300597888309, "ewc_loss": 0.06548871099948883, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003155316808260977, "grad_norm": 7.8130927085876465, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8754978179931641, "num_tokens": 604943931.0, "step": 15858 }, { "epoch": 2.0174278081668997, "ewc_loss": 0.06521128118038177, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031275732908397913, "grad_norm": 7.709083557128906, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8634288311004639, "num_tokens": 604982100.0, "step": 15859 }, { "epoch": 2.0175550184454902, "ewc_loss": 0.0653943419456482, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031458790181204677, "grad_norm": 7.751430988311768, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.856613039970398, "num_tokens": 605026289.0, "step": 15860 }, { "epoch": 2.0176822287240808, "ewc_loss": 0.06542313098907471, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003148758551105857, "grad_norm": 7.706230163574219, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8725622296333313, "num_tokens": 605070214.0, "step": 15861 }, { "epoch": 2.0178094390026713, "ewc_loss": 0.0654509961605072, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.000315154466079548, "grad_norm": 7.757746696472168, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8628756999969482, "num_tokens": 605113495.0, "step": 15862 }, { "epoch": 2.017936649281262, "ewc_loss": 0.06544888764619827, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003151333949062973, "grad_norm": 7.79411506652832, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8713237047195435, "num_tokens": 605145717.0, "step": 15863 }, { "epoch": 2.0180638595598523, "ewc_loss": 0.06534828990697861, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003141274210065603, "grad_norm": 7.751768112182617, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8591519594192505, "num_tokens": 605180930.0, "step": 15864 }, { "epoch": 2.018191069838443, "ewc_loss": 0.06546156853437424, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003152602002955973, "grad_norm": 7.812788486480713, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8743528127670288, "num_tokens": 605214004.0, "step": 15865 }, { "epoch": 2.0183182801170334, "ewc_loss": 0.06525619328022003, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031320651760324836, "grad_norm": 7.754880428314209, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8687570095062256, "num_tokens": 605253244.0, "step": 15866 }, { "epoch": 2.018445490395624, "ewc_loss": 0.06551823019981384, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031582682277075946, "grad_norm": 7.730226516723633, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8631755113601685, "num_tokens": 605293832.0, "step": 15867 }, { "epoch": 2.0185727006742145, "ewc_loss": 0.06538935750722885, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003145381051581353, "grad_norm": 7.730314254760742, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8772785663604736, "num_tokens": 605333811.0, "step": 15868 }, { "epoch": 2.018699910952805, "ewc_loss": 0.0653497502207756, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003141420311294496, "grad_norm": 10.662612915039062, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8674237728118896, "num_tokens": 605371560.0, "step": 15869 }, { "epoch": 2.0188271212313955, "ewc_loss": 0.06750046461820602, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003356491797603667, "grad_norm": 7.82299280166626, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8652769923210144, "num_tokens": 605410555.0, "step": 15870 }, { "epoch": 2.018954331509986, "ewc_loss": 0.06677334010601044, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003283779078628868, "grad_norm": 8.119463920593262, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8617421984672546, "num_tokens": 605451851.0, "step": 15871 }, { "epoch": 2.0190815417885766, "ewc_loss": 0.06540004163980484, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031464494531974196, "grad_norm": 7.726190567016602, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.858077347278595, "num_tokens": 605490442.0, "step": 15872 }, { "epoch": 2.019208752067167, "ewc_loss": 0.06699693948030472, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003306139260530472, "grad_norm": 8.094290733337402, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8648561239242554, "num_tokens": 605525237.0, "step": 15873 }, { "epoch": 2.0193359623457576, "ewc_loss": 0.06549514830112457, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.000315596058499068, "grad_norm": 7.771146297454834, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8714174032211304, "num_tokens": 605563624.0, "step": 15874 }, { "epoch": 2.019463172624348, "ewc_loss": 0.066199392080307, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003226384869776666, "grad_norm": 7.950931072235107, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8769036531448364, "num_tokens": 605597419.0, "step": 15875 }, { "epoch": 2.0195903829029387, "ewc_loss": 0.06566289067268372, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031727345776744187, "grad_norm": 7.814370155334473, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8584418296813965, "num_tokens": 605639954.0, "step": 15876 }, { "epoch": 2.019717593181529, "ewc_loss": 0.06590235233306885, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.000319668062729761, "grad_norm": 7.890552997589111, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8680340051651001, "num_tokens": 605678226.0, "step": 15877 }, { "epoch": 2.0198448034601197, "ewc_loss": 0.06558755785226822, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003165201051160693, "grad_norm": 7.76935338973999, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8762271404266357, "num_tokens": 605719925.0, "step": 15878 }, { "epoch": 2.0199720137387103, "ewc_loss": 0.06580573320388794, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003187018446624279, "grad_norm": 7.85047721862793, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8495115637779236, "num_tokens": 605761282.0, "step": 15879 }, { "epoch": 2.020099224017301, "ewc_loss": 0.06552937626838684, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003159383195452392, "grad_norm": 7.791630744934082, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8765833973884583, "num_tokens": 605800681.0, "step": 15880 }, { "epoch": 2.0202264342958913, "ewc_loss": 0.06561046838760376, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003167492104694247, "grad_norm": 7.845311641693115, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8500522375106812, "num_tokens": 605836282.0, "step": 15881 }, { "epoch": 2.0203536445744814, "ewc_loss": 0.06542262434959412, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003148708201479167, "grad_norm": 7.734095573425293, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8693476319313049, "num_tokens": 605880360.0, "step": 15882 }, { "epoch": 2.020480854853072, "ewc_loss": 0.06564148515462875, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003170593990944326, "grad_norm": 7.8153557777404785, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8766917586326599, "num_tokens": 605918363.0, "step": 15883 }, { "epoch": 2.0206080651316625, "ewc_loss": 0.06543867290019989, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003150312986690551, "grad_norm": 7.739953517913818, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8805376291275024, "num_tokens": 605953032.0, "step": 15884 }, { "epoch": 2.020735275410253, "ewc_loss": 0.0655914768576622, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031655930797569454, "grad_norm": 7.817842483520508, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8743441104888916, "num_tokens": 605992633.0, "step": 15885 }, { "epoch": 2.0208624856888435, "ewc_loss": 0.06546416133642197, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031528613180853426, "grad_norm": 7.698270797729492, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8690190315246582, "num_tokens": 606031776.0, "step": 15886 }, { "epoch": 2.020989695967434, "ewc_loss": 0.06565645337104797, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003172090800944716, "grad_norm": 7.823211193084717, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8603077530860901, "num_tokens": 606070614.0, "step": 15887 }, { "epoch": 2.0211169062460246, "ewc_loss": 0.0653456300497055, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003141007910016924, "grad_norm": 7.690138816833496, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8671871423721313, "num_tokens": 606115898.0, "step": 15888 }, { "epoch": 2.021244116524615, "ewc_loss": 0.06571508944034576, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031779540586285293, "grad_norm": 7.812521457672119, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8698852062225342, "num_tokens": 606150231.0, "step": 15889 }, { "epoch": 2.0213713268032056, "ewc_loss": 0.06538645178079605, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.000314509030431509, "grad_norm": 7.721948146820068, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8599814176559448, "num_tokens": 606191527.0, "step": 15890 }, { "epoch": 2.021498537081796, "ewc_loss": 0.06568922102451324, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003175367310177535, "grad_norm": 7.853929042816162, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8600428700447083, "num_tokens": 606229070.0, "step": 15891 }, { "epoch": 2.0216257473603867, "ewc_loss": 0.06532327830791473, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003138773317914456, "grad_norm": 7.664859294891357, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8668999671936035, "num_tokens": 606269555.0, "step": 15892 }, { "epoch": 2.021752957638977, "ewc_loss": 0.06581433117389679, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.000318787875585258, "grad_norm": 7.886936664581299, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8620452880859375, "num_tokens": 606302040.0, "step": 15893 }, { "epoch": 2.0218801679175677, "ewc_loss": 0.06533719599246979, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003140165063086897, "grad_norm": 7.749256610870361, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8759471774101257, "num_tokens": 606341769.0, "step": 15894 }, { "epoch": 2.0220073781961583, "ewc_loss": 0.06571777164936066, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031782229780219495, "grad_norm": 7.825254917144775, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.853380024433136, "num_tokens": 606379989.0, "step": 15895 }, { "epoch": 2.022134588474749, "ewc_loss": 0.0655137449502945, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031578203197568655, "grad_norm": 7.820810794830322, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.861580491065979, "num_tokens": 606414396.0, "step": 15896 }, { "epoch": 2.0222617987533393, "ewc_loss": 0.06549030542373657, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031554754241369665, "grad_norm": 7.834850788116455, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8712142705917358, "num_tokens": 606445949.0, "step": 15897 }, { "epoch": 2.02238900903193, "ewc_loss": 0.06553515046834946, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003159960324410349, "grad_norm": 7.718997478485107, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8705976009368896, "num_tokens": 606484751.0, "step": 15898 }, { "epoch": 2.0225162193105204, "ewc_loss": 0.06558054685592651, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003164499648846686, "grad_norm": 7.806295394897461, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8712160587310791, "num_tokens": 606525390.0, "step": 15899 }, { "epoch": 2.022643429589111, "ewc_loss": 0.06550168991088867, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003156613965984434, "grad_norm": 7.777864456176758, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8636568784713745, "num_tokens": 606563364.0, "step": 15900 }, { "epoch": 2.0227706398677014, "ewc_loss": 0.06562405824661255, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003168850962538272, "grad_norm": 7.806407928466797, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8695056438446045, "num_tokens": 606598140.0, "step": 15901 }, { "epoch": 2.022897850146292, "ewc_loss": 0.06537458300590515, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031439040321856737, "grad_norm": 7.7691731452941895, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8599879741668701, "num_tokens": 606636480.0, "step": 15902 }, { "epoch": 2.0230250604248825, "ewc_loss": 0.06559120118618011, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031655654311180115, "grad_norm": 7.786237716674805, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8697726726531982, "num_tokens": 606676044.0, "step": 15903 }, { "epoch": 2.023152270703473, "ewc_loss": 0.06542176008224487, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003148621181026101, "grad_norm": 7.7325921058654785, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.860267162322998, "num_tokens": 606713262.0, "step": 15904 }, { "epoch": 2.0232794809820636, "ewc_loss": 0.0655527412891388, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031617196509614587, "grad_norm": 7.767210960388184, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8634895086288452, "num_tokens": 606754844.0, "step": 15905 }, { "epoch": 2.0234066912606536, "ewc_loss": 0.0654178336262703, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031482288613915443, "grad_norm": 7.722714424133301, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.86794513463974, "num_tokens": 606800910.0, "step": 15906 }, { "epoch": 2.023533901539244, "ewc_loss": 0.06557802855968475, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031642481917515397, "grad_norm": 7.779467582702637, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8619587421417236, "num_tokens": 606841855.0, "step": 15907 }, { "epoch": 2.0236611118178347, "ewc_loss": 0.06548019498586655, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003154464648105204, "grad_norm": 10.568910598754883, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8559738397598267, "num_tokens": 606888261.0, "step": 15908 }, { "epoch": 2.0237883220964252, "ewc_loss": 0.06731841713190079, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00033382870606146753, "grad_norm": 7.801181316375732, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8597689270973206, "num_tokens": 606926235.0, "step": 15909 }, { "epoch": 2.0239155323750158, "ewc_loss": 0.06700386106967926, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00033068316406570375, "grad_norm": 8.185140609741211, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8630288243293762, "num_tokens": 606962527.0, "step": 15910 }, { "epoch": 2.0240427426536063, "ewc_loss": 0.06549590826034546, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031560356728732586, "grad_norm": 7.708240032196045, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8730199337005615, "num_tokens": 607008924.0, "step": 15911 }, { "epoch": 2.024169952932197, "ewc_loss": 0.06734855473041534, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00033413010532967746, "grad_norm": 8.132023811340332, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8636634349822998, "num_tokens": 607049788.0, "step": 15912 }, { "epoch": 2.0242971632107873, "ewc_loss": 0.06557922065258026, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003164367226418108, "grad_norm": 7.783112049102783, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8598737716674805, "num_tokens": 607087592.0, "step": 15913 }, { "epoch": 2.024424373489378, "ewc_loss": 0.06651318073272705, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003257763455621898, "grad_norm": 8.005233764648438, "learning_rate": 1e-06, "loss": 0.3974, "mean_token_accuracy": 0.88192218542099, "num_tokens": 607119954.0, "step": 15914 }, { "epoch": 2.0245515837679684, "ewc_loss": 0.06569377332925797, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003175822494085878, "grad_norm": 7.805602550506592, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8584732413291931, "num_tokens": 607156511.0, "step": 15915 }, { "epoch": 2.024678794046559, "ewc_loss": 0.06617389619350433, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00032238353742286563, "grad_norm": 7.985057830810547, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8752599358558655, "num_tokens": 607198292.0, "step": 15916 }, { "epoch": 2.0248060043251495, "ewc_loss": 0.06567308306694031, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031737532117404044, "grad_norm": 7.793946266174316, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8634012937545776, "num_tokens": 607237572.0, "step": 15917 }, { "epoch": 2.02493321460374, "ewc_loss": 0.06599707156419754, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003206152468919754, "grad_norm": 7.9097490310668945, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8582153916358948, "num_tokens": 607282321.0, "step": 15918 }, { "epoch": 2.0250604248823305, "ewc_loss": 0.06566622853279114, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003173068689648062, "grad_norm": 7.848592758178711, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8643590807914734, "num_tokens": 607313355.0, "step": 15919 }, { "epoch": 2.025187635160921, "ewc_loss": 0.0656331330537796, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031697581289336085, "grad_norm": 7.77049446105957, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8632279634475708, "num_tokens": 607351961.0, "step": 15920 }, { "epoch": 2.0253148454395116, "ewc_loss": 0.06586308777332306, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003192753647454083, "grad_norm": 7.861592769622803, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8691468238830566, "num_tokens": 607391368.0, "step": 15921 }, { "epoch": 2.025442055718102, "ewc_loss": 0.06557597219944, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031640424276702106, "grad_norm": 7.760775089263916, "learning_rate": 1e-06, "loss": 0.3922, "mean_token_accuracy": 0.8854649066925049, "num_tokens": 607427328.0, "step": 15922 }, { "epoch": 2.0255692659966926, "ewc_loss": 0.06605733931064606, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031877655419521034, "grad_norm": 8.766809463500977, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8617360591888428, "num_tokens": 607468243.0, "step": 15923 }, { "epoch": 2.025696476275283, "ewc_loss": 0.065071702003479, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003113615675829351, "grad_norm": 7.655699253082275, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8503655195236206, "num_tokens": 607502466.0, "step": 15924 }, { "epoch": 2.0258236865538737, "ewc_loss": 0.06643559783697128, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00032500052475370467, "grad_norm": 8.012866020202637, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8572610020637512, "num_tokens": 607544093.0, "step": 15925 }, { "epoch": 2.025950896832464, "ewc_loss": 0.0650639459490776, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031128397677093744, "grad_norm": 7.642545223236084, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8721646666526794, "num_tokens": 607576054.0, "step": 15926 }, { "epoch": 2.0260781071110547, "ewc_loss": 0.06636427342891693, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00032428724807687104, "grad_norm": 8.008586883544922, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8792786598205566, "num_tokens": 607608462.0, "step": 15927 }, { "epoch": 2.0262053173896453, "ewc_loss": 0.06536048650741577, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003142493951600045, "grad_norm": 7.73600959777832, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8593054413795471, "num_tokens": 607645272.0, "step": 15928 }, { "epoch": 2.026332527668236, "ewc_loss": 0.06600704789161682, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00032071495661512017, "grad_norm": 7.877370357513428, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8694891929626465, "num_tokens": 607682585.0, "step": 15929 }, { "epoch": 2.0264597379468263, "ewc_loss": 0.06558696925640106, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003165142552461475, "grad_norm": 7.747796058654785, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.8843349814414978, "num_tokens": 607721302.0, "step": 15930 }, { "epoch": 2.0265869482254164, "ewc_loss": 0.06577029824256897, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003183475637342781, "grad_norm": 7.855167865753174, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8630622625350952, "num_tokens": 607758357.0, "step": 15931 }, { "epoch": 2.026714158504007, "ewc_loss": 0.06566093862056732, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003172539290972054, "grad_norm": 7.786962509155273, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8675040006637573, "num_tokens": 607795467.0, "step": 15932 }, { "epoch": 2.0268413687825975, "ewc_loss": 0.06570833921432495, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003177278849761933, "grad_norm": 7.807377338409424, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8609509468078613, "num_tokens": 607833192.0, "step": 15933 }, { "epoch": 2.026968579061188, "ewc_loss": 0.06571562588214874, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003178007900714874, "grad_norm": 7.829841136932373, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8760740756988525, "num_tokens": 607866672.0, "step": 15934 }, { "epoch": 2.0270957893397785, "ewc_loss": 0.06563295423984528, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031697406666353345, "grad_norm": 7.785880088806152, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8663787841796875, "num_tokens": 607907976.0, "step": 15935 }, { "epoch": 2.027222999618369, "ewc_loss": 0.06563164293766022, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.000316960911732167, "grad_norm": 7.818199634552002, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8708703517913818, "num_tokens": 607946009.0, "step": 15936 }, { "epoch": 2.0273502098969596, "ewc_loss": 0.06557272374629974, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003163717337884009, "grad_norm": 7.813770294189453, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.870902419090271, "num_tokens": 607986800.0, "step": 15937 }, { "epoch": 2.02747742017555, "ewc_loss": 0.06559629738330841, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031660753302276134, "grad_norm": 7.742089748382568, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8603609800338745, "num_tokens": 608021807.0, "step": 15938 }, { "epoch": 2.0276046304541406, "ewc_loss": 0.0655674934387207, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003163194633089006, "grad_norm": 7.848319053649902, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8658413887023926, "num_tokens": 608060294.0, "step": 15939 }, { "epoch": 2.027731840732731, "ewc_loss": 0.0657421201467514, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031562434742227197, "grad_norm": 7.7581329345703125, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8661390542984009, "num_tokens": 608100790.0, "step": 15940 }, { "epoch": 2.0278590510113217, "ewc_loss": 0.06580965220928192, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031629970180802047, "grad_norm": 7.73898983001709, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8618181943893433, "num_tokens": 608142164.0, "step": 15941 }, { "epoch": 2.027986261289912, "ewc_loss": 0.06558509916067123, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031649554148316383, "grad_norm": 7.801499843597412, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8674449920654297, "num_tokens": 608184175.0, "step": 15942 }, { "epoch": 2.0281134715685027, "ewc_loss": 0.06557226181030273, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003163671353831887, "grad_norm": 7.7873454093933105, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8610886931419373, "num_tokens": 608226801.0, "step": 15943 }, { "epoch": 2.0282406818470933, "ewc_loss": 0.06549259275197983, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003155704471282661, "grad_norm": 7.698615550994873, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8751055002212524, "num_tokens": 608265155.0, "step": 15944 }, { "epoch": 2.028367892125684, "ewc_loss": 0.06570760905742645, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031772066722624004, "grad_norm": 7.879159927368164, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8776548504829407, "num_tokens": 608303817.0, "step": 15945 }, { "epoch": 2.0284951024042743, "ewc_loss": 0.06572990119457245, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003155021113343537, "grad_norm": 7.757424831390381, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.873744547367096, "num_tokens": 608338845.0, "step": 15946 }, { "epoch": 2.028622312682865, "ewc_loss": 0.06587927788496017, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.000316995894536376, "grad_norm": 7.756236553192139, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8677297234535217, "num_tokens": 608377661.0, "step": 15947 }, { "epoch": 2.0287495229614554, "ewc_loss": 0.06578731536865234, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031607632990926504, "grad_norm": 7.823249816894531, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8665138483047485, "num_tokens": 608412897.0, "step": 15948 }, { "epoch": 2.028876733240046, "ewc_loss": 0.06579412519931793, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031614440376870334, "grad_norm": 7.7250075340271, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8587618470191956, "num_tokens": 608453584.0, "step": 15949 }, { "epoch": 2.0290039435186364, "ewc_loss": 0.06566893309354782, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003173338482156396, "grad_norm": 7.812295913696289, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8635076284408569, "num_tokens": 608492211.0, "step": 15950 }, { "epoch": 2.029131153797227, "ewc_loss": 0.06550692021846771, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003157137834932655, "grad_norm": 7.78724479675293, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8625200390815735, "num_tokens": 608534317.0, "step": 15951 }, { "epoch": 2.0292583640758175, "ewc_loss": 0.0657205581665039, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031785009196028113, "grad_norm": 7.807333946228027, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8771394491195679, "num_tokens": 608569763.0, "step": 15952 }, { "epoch": 2.029385574354408, "ewc_loss": 0.06574349105358124, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003156379971187562, "grad_norm": 7.704295635223389, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8697324395179749, "num_tokens": 608612145.0, "step": 15953 }, { "epoch": 2.0295127846329986, "ewc_loss": 0.06571243703365326, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.0003177688631694764, "grad_norm": 7.792065620422363, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8664078712463379, "num_tokens": 608649528.0, "step": 15954 }, { "epoch": 2.0296399949115886, "ewc_loss": 0.06593388319015503, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003175419697072357, "grad_norm": 7.792687892913818, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8608081340789795, "num_tokens": 608686973.0, "step": 15955 }, { "epoch": 2.029767205190179, "ewc_loss": 0.06590785086154938, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003172816359438002, "grad_norm": 7.75391960144043, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8687136769294739, "num_tokens": 608729169.0, "step": 15956 }, { "epoch": 2.0298944154687697, "ewc_loss": 0.06592977046966553, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031750081689096987, "grad_norm": 7.765573024749756, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8648210763931274, "num_tokens": 608764502.0, "step": 15957 }, { "epoch": 2.0300216257473602, "ewc_loss": 0.0657113790512085, "ewc_loss_diag": 3.3855438232421875e-05, "ewc_loss_parallel": 0.00031775826937519014, "grad_norm": 7.756453514099121, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8669816851615906, "num_tokens": 608805190.0, "step": 15958 }, { "epoch": 2.0301488360259508, "ewc_loss": 0.0660119503736496, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003183225926477462, "grad_norm": 7.8072075843811035, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8728827238082886, "num_tokens": 608840006.0, "step": 15959 }, { "epoch": 2.0302760463045413, "ewc_loss": 0.06586109101772308, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031681405380368233, "grad_norm": 7.746740341186523, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8741517066955566, "num_tokens": 608871551.0, "step": 15960 }, { "epoch": 2.030403256583132, "ewc_loss": 0.06602183729410172, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031842151656746864, "grad_norm": 7.8623480796813965, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.8456498384475708, "num_tokens": 608909610.0, "step": 15961 }, { "epoch": 2.0305304668617223, "ewc_loss": 0.06579750776290894, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003161781933158636, "grad_norm": 7.722306728363037, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8641561269760132, "num_tokens": 608950758.0, "step": 15962 }, { "epoch": 2.030657677140313, "ewc_loss": 0.06620531529188156, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00032025628024712205, "grad_norm": 7.858405590057373, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8539682626724243, "num_tokens": 608990695.0, "step": 15963 }, { "epoch": 2.0307848874189034, "ewc_loss": 0.06605640053749084, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031632574973627925, "grad_norm": 7.742284774780273, "learning_rate": 1e-06, "loss": 0.407, "mean_token_accuracy": 0.8813923597335815, "num_tokens": 609030978.0, "step": 15964 }, { "epoch": 2.030912097697494, "ewc_loss": 0.06612752377986908, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031947839306667447, "grad_norm": 7.790078163146973, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8767093420028687, "num_tokens": 609071974.0, "step": 15965 }, { "epoch": 2.0310393079760845, "ewc_loss": 0.06589433550834656, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031714647775515914, "grad_norm": 7.73193883895874, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8688597083091736, "num_tokens": 609111268.0, "step": 15966 }, { "epoch": 2.031166518254675, "ewc_loss": 0.06595942378044128, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003177973267156631, "grad_norm": 7.830685615539551, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8624133467674255, "num_tokens": 609149681.0, "step": 15967 }, { "epoch": 2.0312937285332655, "ewc_loss": 0.06589479744434357, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003171510761603713, "grad_norm": 7.701326847076416, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8728383779525757, "num_tokens": 609189121.0, "step": 15968 }, { "epoch": 2.031420938811856, "ewc_loss": 0.06605285406112671, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003187316469848156, "grad_norm": 7.819094181060791, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8759969472885132, "num_tokens": 609228070.0, "step": 15969 }, { "epoch": 2.0315481490904466, "ewc_loss": 0.06594669073820114, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003176700265612453, "grad_norm": 7.7938761711120605, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.869454026222229, "num_tokens": 609269718.0, "step": 15970 }, { "epoch": 2.031675359369037, "ewc_loss": 0.06592091917991638, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003174123412463814, "grad_norm": 7.7657575607299805, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.869599461555481, "num_tokens": 609307236.0, "step": 15971 }, { "epoch": 2.0318025696476276, "ewc_loss": 0.0659596174955368, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003177993348799646, "grad_norm": 7.843934535980225, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8470593690872192, "num_tokens": 609342993.0, "step": 15972 }, { "epoch": 2.031929779926218, "ewc_loss": 0.0658106729388237, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031630985904484987, "grad_norm": 7.807274341583252, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8743443489074707, "num_tokens": 609376817.0, "step": 15973 }, { "epoch": 2.0320569902048087, "ewc_loss": 0.0663195326924324, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.000316515623126179, "grad_norm": 7.816164970397949, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8652019500732422, "num_tokens": 609411209.0, "step": 15974 }, { "epoch": 2.032184200483399, "ewc_loss": 0.06579720973968506, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003161752538289875, "grad_norm": 7.788026809692383, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8848589658737183, "num_tokens": 609449055.0, "step": 15975 }, { "epoch": 2.0323114107619897, "ewc_loss": 0.06576655805110931, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003158687031827867, "grad_norm": 7.778698921203613, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8657114505767822, "num_tokens": 609486332.0, "step": 15976 }, { "epoch": 2.0324386210405803, "ewc_loss": 0.0657694935798645, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003158980398438871, "grad_norm": 7.805560111999512, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8533186316490173, "num_tokens": 609526924.0, "step": 15977 }, { "epoch": 2.032565831319171, "ewc_loss": 0.0657268613576889, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031547178514301777, "grad_norm": 7.791585922241211, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.880347490310669, "num_tokens": 609559542.0, "step": 15978 }, { "epoch": 2.032693041597761, "ewc_loss": 0.06577169895172119, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031592012965120375, "grad_norm": 7.753567695617676, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8605345487594604, "num_tokens": 609598989.0, "step": 15979 }, { "epoch": 2.0328202518763514, "ewc_loss": 0.0660286694765091, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003160483902320266, "grad_norm": 7.815639019012451, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8547483086585999, "num_tokens": 609635790.0, "step": 15980 }, { "epoch": 2.032947462154942, "ewc_loss": 0.06565628945827484, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031476604635827243, "grad_norm": 7.785251617431641, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.860137939453125, "num_tokens": 609670120.0, "step": 15981 }, { "epoch": 2.0330746724335325, "ewc_loss": 0.06590791046619415, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031728227622807026, "grad_norm": 7.733786106109619, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.866307258605957, "num_tokens": 609713260.0, "step": 15982 }, { "epoch": 2.033201882712123, "ewc_loss": 0.06571325659751892, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003153356665279716, "grad_norm": 7.745473861694336, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8724427223205566, "num_tokens": 609753236.0, "step": 15983 }, { "epoch": 2.0333290929907135, "ewc_loss": 0.06589831411838531, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031718623358756304, "grad_norm": 7.801199436187744, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8698616027832031, "num_tokens": 609791629.0, "step": 15984 }, { "epoch": 2.033456303269304, "ewc_loss": 0.06578565388917923, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031605965341441333, "grad_norm": 7.764678001403809, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8519561290740967, "num_tokens": 609833457.0, "step": 15985 }, { "epoch": 2.0335835135478946, "ewc_loss": 0.0659046322107315, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003172494180034846, "grad_norm": 7.878763675689697, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8593873977661133, "num_tokens": 609866007.0, "step": 15986 }, { "epoch": 2.033710723826485, "ewc_loss": 0.06563347578048706, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003145379014313221, "grad_norm": 7.678143501281738, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8716453313827515, "num_tokens": 609903346.0, "step": 15987 }, { "epoch": 2.0338379341050756, "ewc_loss": 0.06599804759025574, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031818365096114576, "grad_norm": 7.89523458480835, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.871242344379425, "num_tokens": 609935885.0, "step": 15988 }, { "epoch": 2.033965144383666, "ewc_loss": 0.06560659408569336, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031426906934939325, "grad_norm": 7.765000820159912, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8605186939239502, "num_tokens": 609971127.0, "step": 15989 }, { "epoch": 2.0340923546622567, "ewc_loss": 0.0659678727388382, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031788184423930943, "grad_norm": 7.7863945960998535, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8589848279953003, "num_tokens": 610016719.0, "step": 15990 }, { "epoch": 2.034219564940847, "ewc_loss": 0.0660201758146286, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003159634943585843, "grad_norm": 7.9246697425842285, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8560917377471924, "num_tokens": 610057002.0, "step": 15991 }, { "epoch": 2.0343467752194377, "ewc_loss": 0.06576086580753326, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003158118051942438, "grad_norm": 7.996520519256592, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8715754747390747, "num_tokens": 610092779.0, "step": 15992 }, { "epoch": 2.0344739854980283, "ewc_loss": 0.06566610932350159, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031486418447457254, "grad_norm": 7.946658134460449, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8559713959693909, "num_tokens": 610137092.0, "step": 15993 }, { "epoch": 2.034601195776619, "ewc_loss": 0.06542354077100754, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031243852572515607, "grad_norm": 7.700015544891357, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8762721419334412, "num_tokens": 610178632.0, "step": 15994 }, { "epoch": 2.0347284060552093, "ewc_loss": 0.06582041084766388, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031640721135772765, "grad_norm": 8.100154876708984, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8652042150497437, "num_tokens": 610219789.0, "step": 15995 }, { "epoch": 2.0348556163338, "ewc_loss": 0.06525509804487228, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031075411243364215, "grad_norm": 7.955882549285889, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.852806806564331, "num_tokens": 610259104.0, "step": 15996 }, { "epoch": 2.0349828266123904, "ewc_loss": 0.0655110776424408, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003133138525299728, "grad_norm": 7.801774501800537, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8599579334259033, "num_tokens": 610296317.0, "step": 15997 }, { "epoch": 2.035110036890981, "ewc_loss": 0.0653567686676979, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003117708256468177, "grad_norm": 7.848769187927246, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8657031059265137, "num_tokens": 610339300.0, "step": 15998 }, { "epoch": 2.0352372471695714, "ewc_loss": 0.06533984839916229, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003116015868727118, "grad_norm": 7.690750598907471, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8616417646408081, "num_tokens": 610373917.0, "step": 15999 }, { "epoch": 2.035364457448162, "ewc_loss": 0.06578609347343445, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003160640480928123, "grad_norm": 7.89826774597168, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8654069900512695, "num_tokens": 610419571.0, "step": 16000 }, { "epoch": 2.0354916677267525, "ewc_loss": 0.06536108255386353, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003118139284197241, "grad_norm": 7.67639684677124, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.859196662902832, "num_tokens": 610458073.0, "step": 16001 }, { "epoch": 2.035618878005343, "ewc_loss": 0.06600430607795715, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003182461659889668, "grad_norm": 7.842747211456299, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8560499548912048, "num_tokens": 610497223.0, "step": 16002 }, { "epoch": 2.0357460882839336, "ewc_loss": 0.06563714891672134, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031457460136152804, "grad_norm": 7.87630033493042, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8475049138069153, "num_tokens": 610532563.0, "step": 16003 }, { "epoch": 2.0358732985625236, "ewc_loss": 0.0658039078116417, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003162422217428684, "grad_norm": 7.853620529174805, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8786540031433105, "num_tokens": 610566644.0, "step": 16004 }, { "epoch": 2.036000508841114, "ewc_loss": 0.06621995568275452, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031551989377476275, "grad_norm": 7.775135517120361, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8541151285171509, "num_tokens": 610604755.0, "step": 16005 }, { "epoch": 2.0361277191197047, "ewc_loss": 0.06594549119472504, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003176580648869276, "grad_norm": 7.8413262367248535, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8701320886611938, "num_tokens": 610645905.0, "step": 16006 }, { "epoch": 2.036254929398295, "ewc_loss": 0.06582269072532654, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031642999965697527, "grad_norm": 7.7568135261535645, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8840566873550415, "num_tokens": 610686800.0, "step": 16007 }, { "epoch": 2.0363821396768857, "ewc_loss": 0.06601165235042572, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031831968226470053, "grad_norm": 7.804128170013428, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8657612800598145, "num_tokens": 610728291.0, "step": 16008 }, { "epoch": 2.0365093499554763, "ewc_loss": 0.0658736526966095, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031693969503976405, "grad_norm": 7.866135120391846, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8636691570281982, "num_tokens": 610759818.0, "step": 16009 }, { "epoch": 2.036636560234067, "ewc_loss": 0.06597177684307098, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003179208724759519, "grad_norm": 7.824123859405518, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8522272109985352, "num_tokens": 610800125.0, "step": 16010 }, { "epoch": 2.0367637705126573, "ewc_loss": 0.06627026200294495, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003184643283020705, "grad_norm": 7.7902116775512695, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8570234179496765, "num_tokens": 610839261.0, "step": 16011 }, { "epoch": 2.036890980791248, "ewc_loss": 0.065972700715065, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003179300983902067, "grad_norm": 7.842320442199707, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8702962398529053, "num_tokens": 610879092.0, "step": 16012 }, { "epoch": 2.0370181910698384, "ewc_loss": 0.06623922288417816, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031815399415791035, "grad_norm": 7.778127193450928, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8654910326004028, "num_tokens": 610914494.0, "step": 16013 }, { "epoch": 2.037145401348429, "ewc_loss": 0.06619436293840408, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003177053586114198, "grad_norm": 7.816359519958496, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8758458495140076, "num_tokens": 610950784.0, "step": 16014 }, { "epoch": 2.0372726116270194, "ewc_loss": 0.06647758185863495, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031809607753530145, "grad_norm": 7.801433086395264, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8554990291595459, "num_tokens": 610989254.0, "step": 16015 }, { "epoch": 2.03739982190561, "ewc_loss": 0.06658993661403656, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031921963091008365, "grad_norm": 7.913827419281006, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8594505786895752, "num_tokens": 611022668.0, "step": 16016 }, { "epoch": 2.0375270321842005, "ewc_loss": 0.06636594980955124, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003169798001181334, "grad_norm": 7.77558708190918, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8664093613624573, "num_tokens": 611062110.0, "step": 16017 }, { "epoch": 2.037654242462791, "ewc_loss": 0.06624047458171844, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031816650880500674, "grad_norm": 7.817796230316162, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8717490434646606, "num_tokens": 611098663.0, "step": 16018 }, { "epoch": 2.0377814527413816, "ewc_loss": 0.06653336435556412, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003186539688613266, "grad_norm": 7.851816654205322, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8635764122009277, "num_tokens": 611134058.0, "step": 16019 }, { "epoch": 2.037908663019972, "ewc_loss": 0.06640318036079407, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003173521254211664, "grad_norm": 7.772363185882568, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8658647537231445, "num_tokens": 611174748.0, "step": 16020 }, { "epoch": 2.0380358732985626, "ewc_loss": 0.06655418872833252, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031886217766441405, "grad_norm": 7.849430084228516, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8820556402206421, "num_tokens": 611216792.0, "step": 16021 }, { "epoch": 2.038163083577153, "ewc_loss": 0.06631754338741302, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031649574520997703, "grad_norm": 7.805216312408447, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8761446475982666, "num_tokens": 611253996.0, "step": 16022 }, { "epoch": 2.0382902938557437, "ewc_loss": 0.06620517373085022, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031781342113390565, "grad_norm": 7.778858661651611, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8759782314300537, "num_tokens": 611295351.0, "step": 16023 }, { "epoch": 2.038417504134334, "ewc_loss": 0.06640128791332245, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031733314972370863, "grad_norm": 7.869830131530762, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8653548359870911, "num_tokens": 611330513.0, "step": 16024 }, { "epoch": 2.0385447144129247, "ewc_loss": 0.06626884639263153, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003160088090226054, "grad_norm": 7.74170446395874, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8748742938041687, "num_tokens": 611369679.0, "step": 16025 }, { "epoch": 2.0386719246915153, "ewc_loss": 0.06653939187526703, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031871424289420247, "grad_norm": 7.859650611877441, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8743175864219666, "num_tokens": 611410398.0, "step": 16026 }, { "epoch": 2.038799134970106, "ewc_loss": 0.06619873642921448, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003153076977469027, "grad_norm": 7.7878618240356445, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8587899208068848, "num_tokens": 611447151.0, "step": 16027 }, { "epoch": 2.0389263452486963, "ewc_loss": 0.06648875772953033, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003182079235557467, "grad_norm": 7.805986404418945, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8582902550697327, "num_tokens": 611486923.0, "step": 16028 }, { "epoch": 2.0390535555272864, "ewc_loss": 0.0663362666964531, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003166829701513052, "grad_norm": 7.890702724456787, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8721388578414917, "num_tokens": 611519502.0, "step": 16029 }, { "epoch": 2.039180765805877, "ewc_loss": 0.06635527312755585, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003168730472680181, "grad_norm": 7.802842140197754, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8529316186904907, "num_tokens": 611557647.0, "step": 16030 }, { "epoch": 2.0393079760844675, "ewc_loss": 0.06639483571052551, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003172687138430774, "grad_norm": 7.798123836517334, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8653794527053833, "num_tokens": 611597405.0, "step": 16031 }, { "epoch": 2.039435186363058, "ewc_loss": 0.06644633412361145, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003177837061230093, "grad_norm": 7.916278839111328, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.859489381313324, "num_tokens": 611635996.0, "step": 16032 }, { "epoch": 2.0395623966416485, "ewc_loss": 0.06631232798099518, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.000316443620249629, "grad_norm": 7.7565016746521, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8747662901878357, "num_tokens": 611677900.0, "step": 16033 }, { "epoch": 2.039689606920239, "ewc_loss": 0.06644247472286224, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003177450271323323, "grad_norm": 7.815335750579834, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8694688081741333, "num_tokens": 611719184.0, "step": 16034 }, { "epoch": 2.0398168171988296, "ewc_loss": 0.06632398813962936, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031656018109060824, "grad_norm": 7.77232027053833, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8642155528068542, "num_tokens": 611761270.0, "step": 16035 }, { "epoch": 2.03994402747742, "ewc_loss": 0.06661923229694366, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031951264827512205, "grad_norm": 7.924656391143799, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.864380955696106, "num_tokens": 611794939.0, "step": 16036 }, { "epoch": 2.0400712377560106, "ewc_loss": 0.06623729318380356, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031569323618896306, "grad_norm": 7.763253211975098, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.877826988697052, "num_tokens": 611834093.0, "step": 16037 }, { "epoch": 2.040198448034601, "ewc_loss": 0.06650882214307785, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031840853625908494, "grad_norm": 7.92709493637085, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8616869449615479, "num_tokens": 611871327.0, "step": 16038 }, { "epoch": 2.0403256583131917, "ewc_loss": 0.06619517505168915, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031527201645076275, "grad_norm": 7.785397052764893, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8586958646774292, "num_tokens": 611907837.0, "step": 16039 }, { "epoch": 2.040452868591782, "ewc_loss": 0.06600846350193024, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003182877553626895, "grad_norm": 7.853644371032715, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8483781218528748, "num_tokens": 611950964.0, "step": 16040 }, { "epoch": 2.0405800788703727, "ewc_loss": 0.06575728207826614, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003157759492751211, "grad_norm": 7.852324485778809, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8611553311347961, "num_tokens": 611985867.0, "step": 16041 }, { "epoch": 2.0407072891489633, "ewc_loss": 0.06578324735164642, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031603555544279516, "grad_norm": 7.820624828338623, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8696494102478027, "num_tokens": 612017819.0, "step": 16042 }, { "epoch": 2.040834499427554, "ewc_loss": 0.06642011553049088, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031752148061059415, "grad_norm": 7.933452129364014, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.8511381149291992, "num_tokens": 612060669.0, "step": 16043 }, { "epoch": 2.0409617097061443, "ewc_loss": 0.06561106443405151, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003143137728329748, "grad_norm": 7.817385196685791, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8509484529495239, "num_tokens": 612099801.0, "step": 16044 }, { "epoch": 2.041088919984735, "ewc_loss": 0.06589175760746002, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003171207499690354, "grad_norm": 7.844155788421631, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8690427541732788, "num_tokens": 612139454.0, "step": 16045 }, { "epoch": 2.0412161302633254, "ewc_loss": 0.06579442322254181, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003161474014632404, "grad_norm": 7.90345573425293, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8469209671020508, "num_tokens": 612181695.0, "step": 16046 }, { "epoch": 2.041343340541916, "ewc_loss": 0.0656227171421051, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.000314430333673954, "grad_norm": 7.774374485015869, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8809090256690979, "num_tokens": 612223200.0, "step": 16047 }, { "epoch": 2.0414705508205064, "ewc_loss": 0.06574687361717224, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031567190308123827, "grad_norm": 7.848837852478027, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8677301406860352, "num_tokens": 612265946.0, "step": 16048 }, { "epoch": 2.041597761099097, "ewc_loss": 0.06584901362657547, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031425183988176286, "grad_norm": 7.873025417327881, "learning_rate": 1e-06, "loss": 0.405, "mean_token_accuracy": 0.8830438852310181, "num_tokens": 612301246.0, "step": 16049 }, { "epoch": 2.0417249713776875, "ewc_loss": 0.06561020016670227, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003143050998914987, "grad_norm": 7.845083236694336, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8511038422584534, "num_tokens": 612337789.0, "step": 16050 }, { "epoch": 2.041852181656278, "ewc_loss": 0.0656139999628067, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003143430803902447, "grad_norm": 7.779316425323486, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8794594407081604, "num_tokens": 612372391.0, "step": 16051 }, { "epoch": 2.0419793919348685, "ewc_loss": 0.06569421291351318, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031514529837295413, "grad_norm": 7.858457088470459, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8695441484451294, "num_tokens": 612408806.0, "step": 16052 }, { "epoch": 2.0421066022134586, "ewc_loss": 0.06559772789478302, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003141803899779916, "grad_norm": 7.760178089141846, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8768587112426758, "num_tokens": 612446349.0, "step": 16053 }, { "epoch": 2.042233812492049, "ewc_loss": 0.06587407737970352, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003169438859913498, "grad_norm": 7.780256748199463, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8786086440086365, "num_tokens": 612489899.0, "step": 16054 }, { "epoch": 2.0423610227706397, "ewc_loss": 0.06571722030639648, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031537527684122324, "grad_norm": 7.791866779327393, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8615813851356506, "num_tokens": 612532314.0, "step": 16055 }, { "epoch": 2.04248823304923, "ewc_loss": 0.06589408218860626, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031714397482573986, "grad_norm": 7.841864109039307, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8741234540939331, "num_tokens": 612568033.0, "step": 16056 }, { "epoch": 2.0426154433278207, "ewc_loss": 0.06571610271930695, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003153641300741583, "grad_norm": 7.778712749481201, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8724095225334167, "num_tokens": 612605160.0, "step": 16057 }, { "epoch": 2.0427426536064113, "ewc_loss": 0.06591859459877014, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031738911638967693, "grad_norm": 7.892277240753174, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.854461669921875, "num_tokens": 612647851.0, "step": 16058 }, { "epoch": 2.042869863885002, "ewc_loss": 0.06570059806108475, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003152090939693153, "grad_norm": 7.788628101348877, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8668133020401001, "num_tokens": 612684770.0, "step": 16059 }, { "epoch": 2.0429970741635923, "ewc_loss": 0.06597743928432465, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031797753763385117, "grad_norm": 7.882667064666748, "learning_rate": 1e-06, "loss": 0.4943, "mean_token_accuracy": 0.8563520908355713, "num_tokens": 612721689.0, "step": 16060 }, { "epoch": 2.043124284442183, "ewc_loss": 0.0657111257314682, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003153144207317382, "grad_norm": 7.748207092285156, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8648335933685303, "num_tokens": 612757256.0, "step": 16061 }, { "epoch": 2.0432514947207734, "ewc_loss": 0.06605316698551178, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003187347901985049, "grad_norm": 7.879806041717529, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.871789813041687, "num_tokens": 612797806.0, "step": 16062 }, { "epoch": 2.043378704999364, "ewc_loss": 0.06620603799819946, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031538071925751865, "grad_norm": 7.801109313964844, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8769998550415039, "num_tokens": 612833507.0, "step": 16063 }, { "epoch": 2.0435059152779544, "ewc_loss": 0.06597094237804413, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031791257788427174, "grad_norm": 7.856258392333984, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8725779056549072, "num_tokens": 612872920.0, "step": 16064 }, { "epoch": 2.043633125556545, "ewc_loss": 0.0657719075679779, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003159221669193357, "grad_norm": 7.725301742553711, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.873407244682312, "num_tokens": 612909534.0, "step": 16065 }, { "epoch": 2.0437603358351355, "ewc_loss": 0.06609812378883362, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031918438617140055, "grad_norm": 7.790443420410156, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8632110357284546, "num_tokens": 612950531.0, "step": 16066 }, { "epoch": 2.043887546113726, "ewc_loss": 0.06590765714645386, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003172796859871596, "grad_norm": 7.8366265296936035, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8688112497329712, "num_tokens": 612984638.0, "step": 16067 }, { "epoch": 2.0440147563923166, "ewc_loss": 0.06593193858861923, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003175225283484906, "grad_norm": 7.821560382843018, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8779816031455994, "num_tokens": 613020632.0, "step": 16068 }, { "epoch": 2.044141966670907, "ewc_loss": 0.06601159274578094, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003183190128766, "grad_norm": 7.821631908416748, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8705375790596008, "num_tokens": 613057381.0, "step": 16069 }, { "epoch": 2.0442691769494976, "ewc_loss": 0.06581519544124603, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003163550281897187, "grad_norm": 7.746454238891602, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8643045425415039, "num_tokens": 613094287.0, "step": 16070 }, { "epoch": 2.044396387228088, "ewc_loss": 0.06617975980043411, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00032000071951188147, "grad_norm": 7.884415149688721, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8737332820892334, "num_tokens": 613126315.0, "step": 16071 }, { "epoch": 2.0445235975066787, "ewc_loss": 0.065742626786232, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031562941148877144, "grad_norm": 7.774077892303467, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8624169826507568, "num_tokens": 613160970.0, "step": 16072 }, { "epoch": 2.044650807785269, "ewc_loss": 0.06623199582099915, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00032052313326857984, "grad_norm": 7.8247904777526855, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8694334030151367, "num_tokens": 613204685.0, "step": 16073 }, { "epoch": 2.0447780180638597, "ewc_loss": 0.06647955626249313, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.000318115868140012, "grad_norm": 7.787161827087402, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8761616945266724, "num_tokens": 613242394.0, "step": 16074 }, { "epoch": 2.0449052283424503, "ewc_loss": 0.0660678967833519, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031888208468444645, "grad_norm": 7.817960739135742, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.874879002571106, "num_tokens": 613283517.0, "step": 16075 }, { "epoch": 2.045032438621041, "ewc_loss": 0.06642776727676392, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.000317597936373204, "grad_norm": 7.823741912841797, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8709220290184021, "num_tokens": 613317237.0, "step": 16076 }, { "epoch": 2.045159648899631, "ewc_loss": 0.06605685502290726, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031877169385552406, "grad_norm": 7.812756538391113, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8702126145362854, "num_tokens": 613355741.0, "step": 16077 }, { "epoch": 2.0452868591782214, "ewc_loss": 0.0662078857421875, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031784060411155224, "grad_norm": 7.790399551391602, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8578463792800903, "num_tokens": 613392131.0, "step": 16078 }, { "epoch": 2.045414069456812, "ewc_loss": 0.06629498302936554, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003187115944456309, "grad_norm": 7.7911152839660645, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8575596809387207, "num_tokens": 613433687.0, "step": 16079 }, { "epoch": 2.0455412797354025, "ewc_loss": 0.06598019599914551, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003180050989612937, "grad_norm": 7.7360005378723145, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8712083101272583, "num_tokens": 613472882.0, "step": 16080 }, { "epoch": 2.045668490013993, "ewc_loss": 0.06618711352348328, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003200742066837847, "grad_norm": 7.782253742218018, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8715023398399353, "num_tokens": 613515869.0, "step": 16081 }, { "epoch": 2.0457957002925835, "ewc_loss": 0.0663529485464096, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003192912263330072, "grad_norm": 7.815183639526367, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8672335147857666, "num_tokens": 613551835.0, "step": 16082 }, { "epoch": 2.045922910571174, "ewc_loss": 0.06610693037509918, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003192724834661931, "grad_norm": 7.792663097381592, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8649982213973999, "num_tokens": 613592277.0, "step": 16083 }, { "epoch": 2.0460501208497646, "ewc_loss": 0.06638775020837784, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003196392208337784, "grad_norm": 7.8388166427612305, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8669004440307617, "num_tokens": 613625594.0, "step": 16084 }, { "epoch": 2.046177331128355, "ewc_loss": 0.06596367806196213, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003178399056196213, "grad_norm": 7.821765422821045, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8702292442321777, "num_tokens": 613659924.0, "step": 16085 }, { "epoch": 2.0463045414069456, "ewc_loss": 0.06602469086647034, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003184500674251467, "grad_norm": 7.7961883544921875, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8493516445159912, "num_tokens": 613697067.0, "step": 16086 }, { "epoch": 2.046431751685536, "ewc_loss": 0.06597310304641724, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003179342020303011, "grad_norm": 7.770482063293457, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8742779493331909, "num_tokens": 613738211.0, "step": 16087 }, { "epoch": 2.0465589619641267, "ewc_loss": 0.06596902012825012, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003178933693561703, "grad_norm": 7.8020806312561035, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.873092770576477, "num_tokens": 613782590.0, "step": 16088 }, { "epoch": 2.046686172242717, "ewc_loss": 0.06596006453037262, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.000317803758662194, "grad_norm": 7.717824459075928, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8747171759605408, "num_tokens": 613825770.0, "step": 16089 }, { "epoch": 2.0468133825213077, "ewc_loss": 0.06609329581260681, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003191361029166728, "grad_norm": 7.814623832702637, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8723934888839722, "num_tokens": 613864025.0, "step": 16090 }, { "epoch": 2.0469405927998983, "ewc_loss": 0.06653465330600739, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.000318666803650558, "grad_norm": 7.748122215270996, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8697835206985474, "num_tokens": 613907181.0, "step": 16091 }, { "epoch": 2.047067803078489, "ewc_loss": 0.06632071733474731, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00032141030533239245, "grad_norm": 7.824922561645508, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8582855463027954, "num_tokens": 613951258.0, "step": 16092 }, { "epoch": 2.0471950133570793, "ewc_loss": 0.06601259112358093, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003183289954904467, "grad_norm": 7.765994071960449, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8661200404167175, "num_tokens": 613991221.0, "step": 16093 }, { "epoch": 2.04732222363567, "ewc_loss": 0.06630100309848785, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.000321213185088709, "grad_norm": 7.831410884857178, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8781384229660034, "num_tokens": 614026862.0, "step": 16094 }, { "epoch": 2.0474494339142604, "ewc_loss": 0.06611928343772888, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031939594191499054, "grad_norm": 7.775245666503906, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.857123076915741, "num_tokens": 614067560.0, "step": 16095 }, { "epoch": 2.047576644192851, "ewc_loss": 0.06625084578990936, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003207115805707872, "grad_norm": 7.789477348327637, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8706737160682678, "num_tokens": 614106974.0, "step": 16096 }, { "epoch": 2.0477038544714414, "ewc_loss": 0.06623896956443787, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003205928369425237, "grad_norm": 7.800472259521484, "learning_rate": 1e-06, "loss": 0.367, "mean_token_accuracy": 0.8932886719703674, "num_tokens": 614145641.0, "step": 16097 }, { "epoch": 2.047831064750032, "ewc_loss": 0.06617707759141922, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003199739148840308, "grad_norm": 7.781203746795654, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8713184595108032, "num_tokens": 614181476.0, "step": 16098 }, { "epoch": 2.0479582750286225, "ewc_loss": 0.06632308661937714, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003214339376427233, "grad_norm": 7.84856653213501, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8616423606872559, "num_tokens": 614219212.0, "step": 16099 }, { "epoch": 2.048085485307213, "ewc_loss": 0.06604969501495361, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031870012753643095, "grad_norm": 7.734688758850098, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8385071754455566, "num_tokens": 614264017.0, "step": 16100 }, { "epoch": 2.0482126955858035, "ewc_loss": 0.06663781404495239, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003221398510504514, "grad_norm": 7.861424922943115, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8506519794464111, "num_tokens": 614299031.0, "step": 16101 }, { "epoch": 2.0483399058643936, "ewc_loss": 0.06635302305221558, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003192919248249382, "grad_norm": 7.727368354797363, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8730720281600952, "num_tokens": 614334387.0, "step": 16102 }, { "epoch": 2.048467116142984, "ewc_loss": 0.06651744246482849, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003233775496482849, "grad_norm": 7.8530120849609375, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8678545355796814, "num_tokens": 614372438.0, "step": 16103 }, { "epoch": 2.0485943264215747, "ewc_loss": 0.06625016033649445, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00032070474117062986, "grad_norm": 7.779018878936768, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8630869388580322, "num_tokens": 614409976.0, "step": 16104 }, { "epoch": 2.048721536700165, "ewc_loss": 0.06687282770872116, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003220485814381391, "grad_norm": 7.996657371520996, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8746762275695801, "num_tokens": 614448740.0, "step": 16105 }, { "epoch": 2.0488487469787557, "ewc_loss": 0.06598283350467682, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031803149613551795, "grad_norm": 7.7082014083862305, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8713271617889404, "num_tokens": 614484644.0, "step": 16106 }, { "epoch": 2.0489759572573463, "ewc_loss": 0.06667947769165039, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003249978763051331, "grad_norm": 7.9123430252075195, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8479657173156738, "num_tokens": 614521210.0, "step": 16107 }, { "epoch": 2.049103167535937, "ewc_loss": 0.06657975912094116, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003191178839188069, "grad_norm": 7.7807440757751465, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8617785573005676, "num_tokens": 614561783.0, "step": 16108 }, { "epoch": 2.0492303778145273, "ewc_loss": 0.06701868772506714, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032350720721296966, "grad_norm": 7.907102584838867, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8675029873847961, "num_tokens": 614598643.0, "step": 16109 }, { "epoch": 2.049357588093118, "ewc_loss": 0.06657300889492035, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031905039213597775, "grad_norm": 7.7875776290893555, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8671602606773376, "num_tokens": 614633520.0, "step": 16110 }, { "epoch": 2.0494847983717084, "ewc_loss": 0.0668204054236412, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003215243632439524, "grad_norm": 7.777605056762695, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8539294600486755, "num_tokens": 614673266.0, "step": 16111 }, { "epoch": 2.049612008650299, "ewc_loss": 0.06667810678482056, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032010141876526177, "grad_norm": 7.803341865539551, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8579238057136536, "num_tokens": 614707201.0, "step": 16112 }, { "epoch": 2.0497392189288894, "ewc_loss": 0.06681935489177704, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003215138567611575, "grad_norm": 7.784850597381592, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8718274831771851, "num_tokens": 614746542.0, "step": 16113 }, { "epoch": 2.04986642920748, "ewc_loss": 0.06675533950328827, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032087371801026165, "grad_norm": 7.785408020019531, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8568068742752075, "num_tokens": 614781804.0, "step": 16114 }, { "epoch": 2.0499936394860705, "ewc_loss": 0.0662301704287529, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00032050482695922256, "grad_norm": 7.754906177520752, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8665724992752075, "num_tokens": 614817281.0, "step": 16115 }, { "epoch": 2.050120849764661, "ewc_loss": 0.06678184866905212, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003211387956980616, "grad_norm": 7.8115339279174805, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.857897162437439, "num_tokens": 614853666.0, "step": 16116 }, { "epoch": 2.0502480600432516, "ewc_loss": 0.06623991578817368, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003206022665835917, "grad_norm": 7.816939830780029, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8691606521606445, "num_tokens": 614891221.0, "step": 16117 }, { "epoch": 2.050375270321842, "ewc_loss": 0.06628037989139557, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003210069553460926, "grad_norm": 7.781559944152832, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8748288154602051, "num_tokens": 614925956.0, "step": 16118 }, { "epoch": 2.0505024806004326, "ewc_loss": 0.06651954352855682, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00032095712958835065, "grad_norm": 7.863309860229492, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8733619451522827, "num_tokens": 614959934.0, "step": 16119 }, { "epoch": 2.050629690879023, "ewc_loss": 0.06656748056411743, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003189951239619404, "grad_norm": 7.75480842590332, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8634098768234253, "num_tokens": 615000579.0, "step": 16120 }, { "epoch": 2.0507569011576137, "ewc_loss": 0.06678509712219238, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032117124646902084, "grad_norm": 7.880540370941162, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8606947064399719, "num_tokens": 615037371.0, "step": 16121 }, { "epoch": 2.050884111436204, "ewc_loss": 0.06594111025333405, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031761423451825976, "grad_norm": 7.754878044128418, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8565827012062073, "num_tokens": 615075948.0, "step": 16122 }, { "epoch": 2.0510113217147947, "ewc_loss": 0.06672690808773041, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003205894026905298, "grad_norm": 7.746695518493652, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.8816331028938293, "num_tokens": 615114619.0, "step": 16123 }, { "epoch": 2.0511385319933853, "ewc_loss": 0.06648634374141693, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031818370916880667, "grad_norm": 7.732433795928955, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8674182891845703, "num_tokens": 615149260.0, "step": 16124 }, { "epoch": 2.051265742271976, "ewc_loss": 0.06664522737264633, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003197726036887616, "grad_norm": 7.82875394821167, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8690318465232849, "num_tokens": 615183284.0, "step": 16125 }, { "epoch": 2.0513929525505663, "ewc_loss": 0.06656467169523239, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031896703876554966, "grad_norm": 7.770125865936279, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8732823133468628, "num_tokens": 615220015.0, "step": 16126 }, { "epoch": 2.0515201628291564, "ewc_loss": 0.06656117737293243, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003189321141690016, "grad_norm": 7.769596099853516, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8595360517501831, "num_tokens": 615255614.0, "step": 16127 }, { "epoch": 2.051647373107747, "ewc_loss": 0.0661110132932663, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031931325793266296, "grad_norm": 7.80584192276001, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8684300184249878, "num_tokens": 615290232.0, "step": 16128 }, { "epoch": 2.0517745833863374, "ewc_loss": 0.06603659689426422, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031856910209171474, "grad_norm": 7.721770763397217, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8721340894699097, "num_tokens": 615328204.0, "step": 16129 }, { "epoch": 2.051901793664928, "ewc_loss": 0.06640753895044327, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031983712688088417, "grad_norm": 7.789529800415039, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8586847186088562, "num_tokens": 615362846.0, "step": 16130 }, { "epoch": 2.0520290039435185, "ewc_loss": 0.06639376282691956, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031969932024367154, "grad_norm": 7.728600978851318, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8664157390594482, "num_tokens": 615405404.0, "step": 16131 }, { "epoch": 2.052156214222109, "ewc_loss": 0.06641873717308044, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003199490893166512, "grad_norm": 7.837238788604736, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8554907441139221, "num_tokens": 615439835.0, "step": 16132 }, { "epoch": 2.0522834245006996, "ewc_loss": 0.06656014919281006, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003189218114130199, "grad_norm": 7.786813735961914, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8771460056304932, "num_tokens": 615477832.0, "step": 16133 }, { "epoch": 2.05241063477929, "ewc_loss": 0.0666014701128006, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031933499849401414, "grad_norm": 7.773270606994629, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8576793074607849, "num_tokens": 615518089.0, "step": 16134 }, { "epoch": 2.0525378450578806, "ewc_loss": 0.06680356711149216, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00031891456455923617, "grad_norm": 7.761573791503906, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8711023330688477, "num_tokens": 615557286.0, "step": 16135 }, { "epoch": 2.052665055336471, "ewc_loss": 0.06661801040172577, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003195004537701607, "grad_norm": 7.797092437744141, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8712825775146484, "num_tokens": 615593284.0, "step": 16136 }, { "epoch": 2.0527922656150617, "ewc_loss": 0.06711187958717346, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00031955624581314623, "grad_norm": 7.788609504699707, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8632060885429382, "num_tokens": 615633579.0, "step": 16137 }, { "epoch": 2.052919475893652, "ewc_loss": 0.06650903820991516, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031841074815019965, "grad_norm": 7.73933219909668, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8722301125526428, "num_tokens": 615680135.0, "step": 16138 }, { "epoch": 2.0530466861722427, "ewc_loss": 0.06692443042993546, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032012321753427386, "grad_norm": 7.855450630187988, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8548417091369629, "num_tokens": 615714801.0, "step": 16139 }, { "epoch": 2.0531738964508333, "ewc_loss": 0.06647961586713791, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003181164793204516, "grad_norm": 7.775847911834717, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.859611988067627, "num_tokens": 615750538.0, "step": 16140 }, { "epoch": 2.053301106729424, "ewc_loss": 0.06665845215320587, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.000319904851494357, "grad_norm": 7.818511486053467, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8704200983047485, "num_tokens": 615789465.0, "step": 16141 }, { "epoch": 2.0534283170080143, "ewc_loss": 0.06651533395051956, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031847364152781665, "grad_norm": 7.788290023803711, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8524510860443115, "num_tokens": 615832553.0, "step": 16142 }, { "epoch": 2.053555527286605, "ewc_loss": 0.06655166298151016, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031883694464340806, "grad_norm": 7.754664421081543, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8631309270858765, "num_tokens": 615868341.0, "step": 16143 }, { "epoch": 2.0536827375651954, "ewc_loss": 0.06654451787471771, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031876543653197587, "grad_norm": 7.848311901092529, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8774956464767456, "num_tokens": 615904324.0, "step": 16144 }, { "epoch": 2.053809947843786, "ewc_loss": 0.06647300720214844, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031805041362531483, "grad_norm": 7.745458126068115, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.874454140663147, "num_tokens": 615947656.0, "step": 16145 }, { "epoch": 2.0539371581223764, "ewc_loss": 0.06656348705291748, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003189551644027233, "grad_norm": 7.796008586883545, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8604699373245239, "num_tokens": 615989050.0, "step": 16146 }, { "epoch": 2.054064368400967, "ewc_loss": 0.06652962416410446, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003186165413353592, "grad_norm": 7.767831325531006, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8706637620925903, "num_tokens": 616027638.0, "step": 16147 }, { "epoch": 2.0541915786795575, "ewc_loss": 0.06660325080156326, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031935281003825366, "grad_norm": 7.802091598510742, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8757950663566589, "num_tokens": 616066392.0, "step": 16148 }, { "epoch": 2.054318788958148, "ewc_loss": 0.06657852977514267, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031910560210235417, "grad_norm": 7.733516216278076, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8750275373458862, "num_tokens": 616108303.0, "step": 16149 }, { "epoch": 2.0544459992367385, "ewc_loss": 0.06670349091291428, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032035523327067494, "grad_norm": 7.793816566467285, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8590514659881592, "num_tokens": 616154174.0, "step": 16150 }, { "epoch": 2.0545732095153286, "ewc_loss": 0.06649983674287796, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031831866363063455, "grad_norm": 7.767113208770752, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8641527891159058, "num_tokens": 616193954.0, "step": 16151 }, { "epoch": 2.054700419793919, "ewc_loss": 0.06679143011569977, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032123466371558607, "grad_norm": 7.8288702964782715, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.870944082736969, "num_tokens": 616232771.0, "step": 16152 }, { "epoch": 2.0548276300725097, "ewc_loss": 0.06657643616199493, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003190846764482558, "grad_norm": 7.787432670593262, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8778706192970276, "num_tokens": 616265151.0, "step": 16153 }, { "epoch": 2.0549548403511, "ewc_loss": 0.06661941856145859, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003195145109202713, "grad_norm": 7.778852462768555, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8612496256828308, "num_tokens": 616303886.0, "step": 16154 }, { "epoch": 2.0550820506296907, "ewc_loss": 0.06668666005134583, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003201869549229741, "grad_norm": 7.808481216430664, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8768017292022705, "num_tokens": 616340040.0, "step": 16155 }, { "epoch": 2.0552092609082813, "ewc_loss": 0.06656625121831894, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003189828130416572, "grad_norm": 7.814913272857666, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.872042179107666, "num_tokens": 616371329.0, "step": 16156 }, { "epoch": 2.055336471186872, "ewc_loss": 0.06672799587249756, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003206003166269511, "grad_norm": 7.807301998138428, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8799117207527161, "num_tokens": 616406383.0, "step": 16157 }, { "epoch": 2.0554636814654623, "ewc_loss": 0.06668001413345337, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032012048177421093, "grad_norm": 7.887813091278076, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8534355163574219, "num_tokens": 616445600.0, "step": 16158 }, { "epoch": 2.055590891744053, "ewc_loss": 0.06646386533975601, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003179589693900198, "grad_norm": 7.795225620269775, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8589581847190857, "num_tokens": 616485376.0, "step": 16159 }, { "epoch": 2.0557181020226434, "ewc_loss": 0.06710392981767654, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003219181962776929, "grad_norm": 7.879574298858643, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8564531803131104, "num_tokens": 616527104.0, "step": 16160 }, { "epoch": 2.055845312301234, "ewc_loss": 0.06668907403945923, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003177696489728987, "grad_norm": 7.7286200523376465, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8692154884338379, "num_tokens": 616567557.0, "step": 16161 }, { "epoch": 2.0559725225798244, "ewc_loss": 0.06677062809467316, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003210265713278204, "grad_norm": 7.8486151695251465, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8714728951454163, "num_tokens": 616601612.0, "step": 16162 }, { "epoch": 2.056099732858415, "ewc_loss": 0.0662851333618164, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031861307797953486, "grad_norm": 7.785999298095703, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8646745681762695, "num_tokens": 616638019.0, "step": 16163 }, { "epoch": 2.0562269431370055, "ewc_loss": 0.06647185981273651, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.000320480321533978, "grad_norm": 7.88358736038208, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8649228811264038, "num_tokens": 616675224.0, "step": 16164 }, { "epoch": 2.056354153415596, "ewc_loss": 0.0665275901556015, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003185961686540395, "grad_norm": 7.7991862297058105, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8648966550827026, "num_tokens": 616713628.0, "step": 16165 }, { "epoch": 2.0564813636941865, "ewc_loss": 0.0666925460100174, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003202458028681576, "grad_norm": 7.81540584564209, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8642005920410156, "num_tokens": 616755985.0, "step": 16166 }, { "epoch": 2.056608573972777, "ewc_loss": 0.06629618257284164, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031872352701611817, "grad_norm": 7.829342842102051, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8680934906005859, "num_tokens": 616786564.0, "step": 16167 }, { "epoch": 2.0567357842513676, "ewc_loss": 0.06636475771665573, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003194093005731702, "grad_norm": 7.804961204528809, "learning_rate": 1e-06, "loss": 0.5425, "mean_token_accuracy": 0.8383952975273132, "num_tokens": 616823285.0, "step": 16168 }, { "epoch": 2.056862994529958, "ewc_loss": 0.06641373038291931, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003198990016244352, "grad_norm": 7.7668843269348145, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8747029900550842, "num_tokens": 616864465.0, "step": 16169 }, { "epoch": 2.0569902048085487, "ewc_loss": 0.06639090925455093, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003196708275936544, "grad_norm": 7.824096202850342, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8570826053619385, "num_tokens": 616906670.0, "step": 16170 }, { "epoch": 2.057117415087139, "ewc_loss": 0.06662185490131378, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031953887082636356, "grad_norm": 7.815314769744873, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8662497401237488, "num_tokens": 616947656.0, "step": 16171 }, { "epoch": 2.0572446253657297, "ewc_loss": 0.06664817780256271, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031980208586901426, "grad_norm": 7.844734191894531, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.859734058380127, "num_tokens": 616980161.0, "step": 16172 }, { "epoch": 2.0573718356443202, "ewc_loss": 0.06628222018480301, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003185839159414172, "grad_norm": 7.805810451507568, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.86545729637146, "num_tokens": 617015281.0, "step": 16173 }, { "epoch": 2.0574990459229108, "ewc_loss": 0.0661819726228714, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00032002286752685905, "grad_norm": 7.79198694229126, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.868053138256073, "num_tokens": 617054174.0, "step": 16174 }, { "epoch": 2.057626256201501, "ewc_loss": 0.06652567535638809, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003185770765412599, "grad_norm": 7.761261463165283, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8672258257865906, "num_tokens": 617095160.0, "step": 16175 }, { "epoch": 2.0577534664800914, "ewc_loss": 0.06634639203548431, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003192255971953273, "grad_norm": 7.813168048858643, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8710979223251343, "num_tokens": 617130781.0, "step": 16176 }, { "epoch": 2.057880676758682, "ewc_loss": 0.06725810468196869, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00031857710564509034, "grad_norm": 7.892945766448975, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8646583557128906, "num_tokens": 617169473.0, "step": 16177 }, { "epoch": 2.0580078870372724, "ewc_loss": 0.06620330363512039, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003177947364747524, "grad_norm": 7.700331687927246, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8595377802848816, "num_tokens": 617212208.0, "step": 16178 }, { "epoch": 2.058135097315863, "ewc_loss": 0.06646537780761719, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003204155364073813, "grad_norm": 7.8452606201171875, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8680768609046936, "num_tokens": 617246055.0, "step": 16179 }, { "epoch": 2.0582623075944535, "ewc_loss": 0.06646670401096344, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031798731652088463, "grad_norm": 7.765641212463379, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8623981475830078, "num_tokens": 617285614.0, "step": 16180 }, { "epoch": 2.058389517873044, "ewc_loss": 0.0667182058095932, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003205023822374642, "grad_norm": 7.7814812660217285, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8729221820831299, "num_tokens": 617323401.0, "step": 16181 }, { "epoch": 2.0585167281516346, "ewc_loss": 0.06665584444999695, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031987871625460684, "grad_norm": 7.764801025390625, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8528589606285095, "num_tokens": 617372550.0, "step": 16182 }, { "epoch": 2.058643938430225, "ewc_loss": 0.06670242547988892, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032034452306106687, "grad_norm": 7.804741859436035, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8723912239074707, "num_tokens": 617407894.0, "step": 16183 }, { "epoch": 2.0587711487088156, "ewc_loss": 0.06648162752389908, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003205779939889908, "grad_norm": 7.754074573516846, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8580090999603271, "num_tokens": 617450117.0, "step": 16184 }, { "epoch": 2.058898358987406, "ewc_loss": 0.06645558029413223, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.000320317514706403, "grad_norm": 7.789474010467529, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8718521595001221, "num_tokens": 617485885.0, "step": 16185 }, { "epoch": 2.0590255692659967, "ewc_loss": 0.06646643579006195, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00032042613020166755, "grad_norm": 7.848005771636963, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.849951982498169, "num_tokens": 617520511.0, "step": 16186 }, { "epoch": 2.059152779544587, "ewc_loss": 0.06634243577718735, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031918607419356704, "grad_norm": 7.757270812988281, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8582521677017212, "num_tokens": 617558288.0, "step": 16187 }, { "epoch": 2.0592799898231777, "ewc_loss": 0.06639456748962402, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00032214884413406253, "grad_norm": 7.829091548919678, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8655303716659546, "num_tokens": 617602991.0, "step": 16188 }, { "epoch": 2.0594072001017683, "ewc_loss": 0.06630390137434006, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003188007394783199, "grad_norm": 7.759310245513916, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8646501898765564, "num_tokens": 617639629.0, "step": 16189 }, { "epoch": 2.059534410380359, "ewc_loss": 0.06656704843044281, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00032143216230906546, "grad_norm": 7.841456890106201, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8597695231437683, "num_tokens": 617676218.0, "step": 16190 }, { "epoch": 2.0596616206589493, "ewc_loss": 0.06638741493225098, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003196359029971063, "grad_norm": 7.806469440460205, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.858717679977417, "num_tokens": 617716270.0, "step": 16191 }, { "epoch": 2.05978883093754, "ewc_loss": 0.06621402502059937, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00032034338801167905, "grad_norm": 7.810707092285156, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8752626180648804, "num_tokens": 617755559.0, "step": 16192 }, { "epoch": 2.0599160412161304, "ewc_loss": 0.06655532866716385, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00032131499028764665, "grad_norm": 7.800429344177246, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8695613741874695, "num_tokens": 617793522.0, "step": 16193 }, { "epoch": 2.060043251494721, "ewc_loss": 0.06646639853715897, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003204256936442107, "grad_norm": 7.880522727966309, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8704134225845337, "num_tokens": 617828005.0, "step": 16194 }, { "epoch": 2.0601704617733114, "ewc_loss": 0.06639818847179413, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031974355806596577, "grad_norm": 7.82972526550293, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8618277311325073, "num_tokens": 617863824.0, "step": 16195 }, { "epoch": 2.060297672051902, "ewc_loss": 0.06641402840614319, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031990205752663314, "grad_norm": 7.790042400360107, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8636747598648071, "num_tokens": 617903648.0, "step": 16196 }, { "epoch": 2.0604248823304925, "ewc_loss": 0.06648468971252441, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00032060861121863127, "grad_norm": 7.829473495483398, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8624448180198669, "num_tokens": 617946229.0, "step": 16197 }, { "epoch": 2.060552092609083, "ewc_loss": 0.06623706221580505, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031813234090805054, "grad_norm": 7.855331897735596, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8672398924827576, "num_tokens": 617990486.0, "step": 16198 }, { "epoch": 2.0606793028876735, "ewc_loss": 0.06639344990253448, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003196962352376431, "grad_norm": 7.787983417510986, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8792943358421326, "num_tokens": 618031743.0, "step": 16199 }, { "epoch": 2.0608065131662636, "ewc_loss": 0.06637893617153168, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003195510944351554, "grad_norm": 7.799229621887207, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8632490634918213, "num_tokens": 618070181.0, "step": 16200 }, { "epoch": 2.060933723444854, "ewc_loss": 0.06658497452735901, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003191700088791549, "grad_norm": 7.817716121673584, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8725826144218445, "num_tokens": 618106396.0, "step": 16201 }, { "epoch": 2.0610609337234447, "ewc_loss": 0.06633792817592621, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031914102146402, "grad_norm": 7.7488484382629395, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8718171119689941, "num_tokens": 618148281.0, "step": 16202 }, { "epoch": 2.061188144002035, "ewc_loss": 0.06648187339305878, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003205804096069187, "grad_norm": 7.92640495300293, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8599804639816284, "num_tokens": 618182721.0, "step": 16203 }, { "epoch": 2.0613153542806257, "ewc_loss": 0.06646842509508133, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.000318004545988515, "grad_norm": 7.7614593505859375, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8823745250701904, "num_tokens": 618219497.0, "step": 16204 }, { "epoch": 2.0614425645592163, "ewc_loss": 0.0667470246553421, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032079056836664677, "grad_norm": 7.8238959312438965, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8832096457481384, "num_tokens": 618258643.0, "step": 16205 }, { "epoch": 2.061569774837807, "ewc_loss": 0.06625254452228546, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003182871441822499, "grad_norm": 7.769638538360596, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8819254040718079, "num_tokens": 618292682.0, "step": 16206 }, { "epoch": 2.0616969851163973, "ewc_loss": 0.06649097800254822, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00032067150459624827, "grad_norm": 7.848557472229004, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8514386415481567, "num_tokens": 618331375.0, "step": 16207 }, { "epoch": 2.061824195394988, "ewc_loss": 0.06641251593828201, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031744546140544116, "grad_norm": 7.749525547027588, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.870391309261322, "num_tokens": 618367452.0, "step": 16208 }, { "epoch": 2.0619514056735784, "ewc_loss": 0.06674838066101074, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032080415985547006, "grad_norm": 7.828346252441406, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8649544715881348, "num_tokens": 618410470.0, "step": 16209 }, { "epoch": 2.062078615952169, "ewc_loss": 0.06662888824939728, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003196091565769166, "grad_norm": 7.766842842102051, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.859531819820404, "num_tokens": 618447524.0, "step": 16210 }, { "epoch": 2.0622058262307594, "ewc_loss": 0.0668025091290474, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032134540379047394, "grad_norm": 7.87766695022583, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8479373455047607, "num_tokens": 618481793.0, "step": 16211 }, { "epoch": 2.06233303650935, "ewc_loss": 0.06667685508728027, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003200888750143349, "grad_norm": 7.82390832901001, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8705488443374634, "num_tokens": 618517097.0, "step": 16212 }, { "epoch": 2.0624602467879405, "ewc_loss": 0.06672197580337524, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032054007169790566, "grad_norm": 7.797445297241211, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8637338280677795, "num_tokens": 618554221.0, "step": 16213 }, { "epoch": 2.062587457066531, "ewc_loss": 0.06668590009212494, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032017932971939445, "grad_norm": 7.795426845550537, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8771795034408569, "num_tokens": 618591279.0, "step": 16214 }, { "epoch": 2.0627146673451215, "ewc_loss": 0.06674174964427948, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032073783222585917, "grad_norm": 7.8304948806762695, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8548247814178467, "num_tokens": 618628168.0, "step": 16215 }, { "epoch": 2.062841877623712, "ewc_loss": 0.06670519709587097, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003203722881153226, "grad_norm": 7.802257537841797, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8724481463432312, "num_tokens": 618664580.0, "step": 16216 }, { "epoch": 2.0629690879023026, "ewc_loss": 0.06666503846645355, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003199707134626806, "grad_norm": 7.861036777496338, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8551127314567566, "num_tokens": 618699007.0, "step": 16217 }, { "epoch": 2.063096298180893, "ewc_loss": 0.06669390946626663, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003202594234608114, "grad_norm": 7.854411602020264, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8682975769042969, "num_tokens": 618736630.0, "step": 16218 }, { "epoch": 2.0632235084594837, "ewc_loss": 0.06659409403800964, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003192612202838063, "grad_norm": 7.746832847595215, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8752325773239136, "num_tokens": 618777868.0, "step": 16219 }, { "epoch": 2.063350718738074, "ewc_loss": 0.06679102033376694, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032123050186783075, "grad_norm": 7.863590240478516, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8703238368034363, "num_tokens": 618819134.0, "step": 16220 }, { "epoch": 2.0634779290166647, "ewc_loss": 0.06654879450798035, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003188082482665777, "grad_norm": 7.79035758972168, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8656433820724487, "num_tokens": 618862231.0, "step": 16221 }, { "epoch": 2.0636051392952552, "ewc_loss": 0.0668184831738472, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032150515471585095, "grad_norm": 7.861974239349365, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8598339557647705, "num_tokens": 618906816.0, "step": 16222 }, { "epoch": 2.0637323495738458, "ewc_loss": 0.0665239691734314, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003185600508004427, "grad_norm": 7.739823818206787, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8660047054290771, "num_tokens": 618943239.0, "step": 16223 }, { "epoch": 2.0638595598524363, "ewc_loss": 0.06686760485172272, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032199634006246924, "grad_norm": 7.896854877471924, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.8796221017837524, "num_tokens": 618978641.0, "step": 16224 }, { "epoch": 2.0639867701310264, "ewc_loss": 0.0665765106678009, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031908537494018674, "grad_norm": 7.778196811676025, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8677783012390137, "num_tokens": 619020522.0, "step": 16225 }, { "epoch": 2.064113980409617, "ewc_loss": 0.06664792448282242, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003222409577574581, "grad_norm": 7.869039535522461, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8786535263061523, "num_tokens": 619060769.0, "step": 16226 }, { "epoch": 2.0642411906882074, "ewc_loss": 0.06654617190361023, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031878199661150575, "grad_norm": 7.741425037384033, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8774240016937256, "num_tokens": 619098711.0, "step": 16227 }, { "epoch": 2.064368400966798, "ewc_loss": 0.06695948541164398, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003229151479899883, "grad_norm": 7.896801471710205, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8630212545394897, "num_tokens": 619133673.0, "step": 16228 }, { "epoch": 2.0644956112453885, "ewc_loss": 0.06629858911037445, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00031874762498773634, "grad_norm": 7.73691987991333, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8615528345108032, "num_tokens": 619174554.0, "step": 16229 }, { "epoch": 2.064622821523979, "ewc_loss": 0.06679369509220123, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00032369865220971406, "grad_norm": 7.901438236236572, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8559150099754333, "num_tokens": 619211095.0, "step": 16230 }, { "epoch": 2.0647500318025696, "ewc_loss": 0.06660587340593338, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031937906169332564, "grad_norm": 7.772576808929443, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8767136931419373, "num_tokens": 619246346.0, "step": 16231 }, { "epoch": 2.06487724208116, "ewc_loss": 0.06647530198097229, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00032295615528710186, "grad_norm": 7.854947090148926, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8652654886245728, "num_tokens": 619280629.0, "step": 16232 }, { "epoch": 2.0650044523597506, "ewc_loss": 0.06609603762626648, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.00031916346051730216, "grad_norm": 7.785243988037109, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8463609218597412, "num_tokens": 619320928.0, "step": 16233 }, { "epoch": 2.065131662638341, "ewc_loss": 0.06631270796060562, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003213302115909755, "grad_norm": 7.76784610748291, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8690866231918335, "num_tokens": 619369747.0, "step": 16234 }, { "epoch": 2.0652588729169317, "ewc_loss": 0.06635118275880814, "ewc_loss_diag": 3.409385681152344e-05, "ewc_loss_parallel": 0.0003217149351257831, "grad_norm": 7.7796549797058105, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8690851926803589, "num_tokens": 619408694.0, "step": 16235 }, { "epoch": 2.065386083195522, "ewc_loss": 0.06677500158548355, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003210703143849969, "grad_norm": 7.806814670562744, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8746088743209839, "num_tokens": 619446858.0, "step": 16236 }, { "epoch": 2.0655132934741127, "ewc_loss": 0.06664125621318817, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003221742808818817, "grad_norm": 7.857212543487549, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8655252456665039, "num_tokens": 619482281.0, "step": 16237 }, { "epoch": 2.0656405037527032, "ewc_loss": 0.06657029688358307, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00032146467128768563, "grad_norm": 7.804245948791504, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8758164644241333, "num_tokens": 619520309.0, "step": 16238 }, { "epoch": 2.065767714031294, "ewc_loss": 0.06657842546701431, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003215459582861513, "grad_norm": 7.843111515045166, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.870378851890564, "num_tokens": 619554662.0, "step": 16239 }, { "epoch": 2.0658949243098843, "ewc_loss": 0.06678594648838043, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003211798029951751, "grad_norm": 7.792189121246338, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8621151447296143, "num_tokens": 619599632.0, "step": 16240 }, { "epoch": 2.066022134588475, "ewc_loss": 0.0666700005531311, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00032246173941530287, "grad_norm": 7.870250701904297, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8810622096061707, "num_tokens": 619639414.0, "step": 16241 }, { "epoch": 2.0661493448670654, "ewc_loss": 0.06639289855957031, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003196906764060259, "grad_norm": 7.793085098266602, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.869096040725708, "num_tokens": 619675878.0, "step": 16242 }, { "epoch": 2.066276555145656, "ewc_loss": 0.06661874055862427, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.0003219491336494684, "grad_norm": 7.859392166137695, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.869671106338501, "num_tokens": 619709486.0, "step": 16243 }, { "epoch": 2.0664037654242464, "ewc_loss": 0.06660193204879761, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003193396842107177, "grad_norm": 7.775660991668701, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8677147030830383, "num_tokens": 619753576.0, "step": 16244 }, { "epoch": 2.066530975702837, "ewc_loss": 0.06695888936519623, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003229091817047447, "grad_norm": 7.9246506690979, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8671845197677612, "num_tokens": 619792520.0, "step": 16245 }, { "epoch": 2.0666581859814275, "ewc_loss": 0.06660927832126617, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031941308407112956, "grad_norm": 7.852071285247803, "learning_rate": 1e-06, "loss": 0.5447, "mean_token_accuracy": 0.8454131484031677, "num_tokens": 619827871.0, "step": 16246 }, { "epoch": 2.066785396260018, "ewc_loss": 0.06668552756309509, "ewc_loss_diag": 3.4332275390625e-05, "ewc_loss_parallel": 0.00032261695014312863, "grad_norm": 7.876492500305176, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8620357513427734, "num_tokens": 619864527.0, "step": 16247 }, { "epoch": 2.0669126065386085, "ewc_loss": 0.06660465896129608, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031936686718836427, "grad_norm": 7.800820350646973, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8680523633956909, "num_tokens": 619904934.0, "step": 16248 }, { "epoch": 2.0670398168171986, "ewc_loss": 0.0668216198682785, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003215365286450833, "grad_norm": 7.876391410827637, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8758023977279663, "num_tokens": 619945472.0, "step": 16249 }, { "epoch": 2.067167027095789, "ewc_loss": 0.06672236323356628, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032054397161118686, "grad_norm": 7.87045431137085, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8800466060638428, "num_tokens": 619981738.0, "step": 16250 }, { "epoch": 2.0672942373743797, "ewc_loss": 0.06665769219398499, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003198972262907773, "grad_norm": 7.833019256591797, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8683103322982788, "num_tokens": 620024593.0, "step": 16251 }, { "epoch": 2.06742144765297, "ewc_loss": 0.06667517870664597, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003200720821041614, "grad_norm": 7.816797256469727, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8732904195785522, "num_tokens": 620064059.0, "step": 16252 }, { "epoch": 2.0675486579315607, "ewc_loss": 0.06669288873672485, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032024920801632106, "grad_norm": 7.8386006355285645, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8672338128089905, "num_tokens": 620100028.0, "step": 16253 }, { "epoch": 2.0676758682101513, "ewc_loss": 0.06672851741313934, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003206054389011115, "grad_norm": 7.87567663192749, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8796746730804443, "num_tokens": 620135181.0, "step": 16254 }, { "epoch": 2.067803078488742, "ewc_loss": 0.06662949919700623, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031961535569280386, "grad_norm": 7.846124649047852, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8678585290908813, "num_tokens": 620174887.0, "step": 16255 }, { "epoch": 2.0679302887673323, "ewc_loss": 0.06679120659828186, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032123236451298, "grad_norm": 7.8644795417785645, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8668347597122192, "num_tokens": 620212924.0, "step": 16256 }, { "epoch": 2.068057499045923, "ewc_loss": 0.06671903282403946, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003205106477253139, "grad_norm": 8.042326927185059, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.857388973236084, "num_tokens": 620251088.0, "step": 16257 }, { "epoch": 2.0681847093245134, "ewc_loss": 0.06644387543201447, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003177591133862734, "grad_norm": 7.830902576446533, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8658313751220703, "num_tokens": 620287888.0, "step": 16258 }, { "epoch": 2.068311919603104, "ewc_loss": 0.06681680679321289, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032148833270184696, "grad_norm": 7.866130828857422, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8656261563301086, "num_tokens": 620324937.0, "step": 16259 }, { "epoch": 2.0684391298816944, "ewc_loss": 0.0665351003408432, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031867134384810925, "grad_norm": 7.84124755859375, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.867477536201477, "num_tokens": 620363390.0, "step": 16260 }, { "epoch": 2.068566340160285, "ewc_loss": 0.06667928397655487, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032011320581659675, "grad_norm": 7.795794486999512, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8733118772506714, "num_tokens": 620403707.0, "step": 16261 }, { "epoch": 2.0686935504388755, "ewc_loss": 0.06678943336009979, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032121469848789275, "grad_norm": 7.837669372558594, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8715590238571167, "num_tokens": 620445994.0, "step": 16262 }, { "epoch": 2.068820760717466, "ewc_loss": 0.06675413250923157, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032086166902445257, "grad_norm": 7.879791736602783, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8750144243240356, "num_tokens": 620480883.0, "step": 16263 }, { "epoch": 2.0689479709960565, "ewc_loss": 0.06668934226036072, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032021375955082476, "grad_norm": 7.830286502838135, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8794030547142029, "num_tokens": 620518935.0, "step": 16264 }, { "epoch": 2.069075181274647, "ewc_loss": 0.06681225448846817, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032144287251867354, "grad_norm": 7.850253582000732, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8611626625061035, "num_tokens": 620551985.0, "step": 16265 }, { "epoch": 2.0692023915532376, "ewc_loss": 0.06673052906990051, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032062557875178754, "grad_norm": 7.857292652130127, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8624292612075806, "num_tokens": 620592809.0, "step": 16266 }, { "epoch": 2.069329601831828, "ewc_loss": 0.06673869490623474, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032070724410004914, "grad_norm": 7.835086822509766, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8520548343658447, "num_tokens": 620627951.0, "step": 16267 }, { "epoch": 2.0694568121104187, "ewc_loss": 0.06673464179039001, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003206667606718838, "grad_norm": 7.834143161773682, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8711118698120117, "num_tokens": 620660892.0, "step": 16268 }, { "epoch": 2.069584022389009, "ewc_loss": 0.06678570806980133, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032117741648107767, "grad_norm": 7.865142345428467, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8427510857582092, "num_tokens": 620698165.0, "step": 16269 }, { "epoch": 2.0697112326675997, "ewc_loss": 0.06667445600032806, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032006483525037766, "grad_norm": 7.791019439697266, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8574735522270203, "num_tokens": 620734758.0, "step": 16270 }, { "epoch": 2.0698384429461902, "ewc_loss": 0.06689844280481339, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003223047242499888, "grad_norm": 8.168893814086914, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8472837209701538, "num_tokens": 620770224.0, "step": 16271 }, { "epoch": 2.0699656532247808, "ewc_loss": 0.06641850620508194, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003175053861923516, "grad_norm": 7.668677806854248, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8749518394470215, "num_tokens": 620805724.0, "step": 16272 }, { "epoch": 2.070092863503371, "ewc_loss": 0.06729726493358612, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032629299676045775, "grad_norm": 7.923869609832764, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8549416065216064, "num_tokens": 620847622.0, "step": 16273 }, { "epoch": 2.0702200737819614, "ewc_loss": 0.06697845458984375, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00031822206801734865, "grad_norm": 7.814755916595459, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8641312122344971, "num_tokens": 620879042.0, "step": 16274 }, { "epoch": 2.070347284060552, "ewc_loss": 0.06701044738292694, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032342475606128573, "grad_norm": 7.8586955070495605, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8641084432601929, "num_tokens": 620922685.0, "step": 16275 }, { "epoch": 2.0704744943391424, "ewc_loss": 0.06666260957717896, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003199463535565883, "grad_norm": 7.804596424102783, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8612229228019714, "num_tokens": 620954854.0, "step": 16276 }, { "epoch": 2.070601704617733, "ewc_loss": 0.06680517643690109, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032137209200300276, "grad_norm": 7.808114051818848, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8639695644378662, "num_tokens": 620994581.0, "step": 16277 }, { "epoch": 2.0707289148963235, "ewc_loss": 0.06677323579788208, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003210526774637401, "grad_norm": 7.829445838928223, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8660529851913452, "num_tokens": 621030052.0, "step": 16278 }, { "epoch": 2.070856125174914, "ewc_loss": 0.06671092659235, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032042956445366144, "grad_norm": 7.749871730804443, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8752835392951965, "num_tokens": 621072474.0, "step": 16279 }, { "epoch": 2.0709833354535045, "ewc_loss": 0.06741609424352646, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032259844010695815, "grad_norm": 7.896885871887207, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8716026544570923, "num_tokens": 621104365.0, "step": 16280 }, { "epoch": 2.071110545732095, "ewc_loss": 0.06653530895709991, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031867335201241076, "grad_norm": 7.734599590301514, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8713667988777161, "num_tokens": 621140497.0, "step": 16281 }, { "epoch": 2.0712377560106856, "ewc_loss": 0.06748820841312408, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003233195748180151, "grad_norm": 7.810648441314697, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8697996735572815, "num_tokens": 621177431.0, "step": 16282 }, { "epoch": 2.071364966289276, "ewc_loss": 0.0667199045419693, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003205193788744509, "grad_norm": 7.784799575805664, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8715653419494629, "num_tokens": 621212874.0, "step": 16283 }, { "epoch": 2.0714921765678667, "ewc_loss": 0.06734561920166016, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003218936617486179, "grad_norm": 7.802847385406494, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.871452808380127, "num_tokens": 621249927.0, "step": 16284 }, { "epoch": 2.071619386846457, "ewc_loss": 0.06717623770236969, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032019984791986644, "grad_norm": 7.8441386222839355, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8437917232513428, "num_tokens": 621280794.0, "step": 16285 }, { "epoch": 2.0717465971250477, "ewc_loss": 0.06719167530536652, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003203542437404394, "grad_norm": 7.784064292907715, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8498471975326538, "num_tokens": 621321167.0, "step": 16286 }, { "epoch": 2.0718738074036382, "ewc_loss": 0.06673161685466766, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032063646358437836, "grad_norm": 7.750692844390869, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8783919215202332, "num_tokens": 621357962.0, "step": 16287 }, { "epoch": 2.0720010176822288, "ewc_loss": 0.06722074747085571, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032064493279904127, "grad_norm": 7.800975799560547, "learning_rate": 1e-06, "loss": 0.4, "mean_token_accuracy": 0.884863018989563, "num_tokens": 621398933.0, "step": 16288 }, { "epoch": 2.0721282279608193, "ewc_loss": 0.06715848296880722, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032002231455408037, "grad_norm": 7.765669345855713, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8863979578018188, "num_tokens": 621442334.0, "step": 16289 }, { "epoch": 2.07225543823941, "ewc_loss": 0.06720279902219772, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032046547858044505, "grad_norm": 7.796549320220947, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8706514835357666, "num_tokens": 621478706.0, "step": 16290 }, { "epoch": 2.0723826485180004, "ewc_loss": 0.06715285778045654, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003199660568498075, "grad_norm": 7.910827159881592, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.87318354845047, "num_tokens": 621513911.0, "step": 16291 }, { "epoch": 2.072509858796591, "ewc_loss": 0.06657058000564575, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003190260613337159, "grad_norm": 7.801215171813965, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8615051507949829, "num_tokens": 621554080.0, "step": 16292 }, { "epoch": 2.0726370690751814, "ewc_loss": 0.06715314090251923, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003199689381290227, "grad_norm": 7.765566349029541, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8694479465484619, "num_tokens": 621592059.0, "step": 16293 }, { "epoch": 2.072764279353772, "ewc_loss": 0.06660914421081543, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031941174529492855, "grad_norm": 7.8949713706970215, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8805980682373047, "num_tokens": 621629626.0, "step": 16294 }, { "epoch": 2.0728914896323625, "ewc_loss": 0.06696353107690811, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00031807279447093606, "grad_norm": 7.730960845947266, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8715491890907288, "num_tokens": 621671341.0, "step": 16295 }, { "epoch": 2.073018699910953, "ewc_loss": 0.06729619950056076, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032139947870746255, "grad_norm": 7.860345840454102, "learning_rate": 1e-06, "loss": 0.3977, "mean_token_accuracy": 0.8819893598556519, "num_tokens": 621708756.0, "step": 16296 }, { "epoch": 2.0731459101895435, "ewc_loss": 0.0664440244436264, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003177605976816267, "grad_norm": 7.782567501068115, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8726304769515991, "num_tokens": 621748009.0, "step": 16297 }, { "epoch": 2.0732731204681336, "ewc_loss": 0.06674294173717499, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032074973569251597, "grad_norm": 7.8377203941345215, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8759524822235107, "num_tokens": 621788695.0, "step": 16298 }, { "epoch": 2.073400330746724, "ewc_loss": 0.06648194044828415, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031813973328098655, "grad_norm": 7.82041597366333, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8655823469161987, "num_tokens": 621828099.0, "step": 16299 }, { "epoch": 2.0735275410253147, "ewc_loss": 0.06664884835481644, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003198087797500193, "grad_norm": 7.8846211433410645, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8518617153167725, "num_tokens": 621867033.0, "step": 16300 }, { "epoch": 2.073654751303905, "ewc_loss": 0.0666072815656662, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003193931479472667, "grad_norm": 7.828989028930664, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8576829433441162, "num_tokens": 621907435.0, "step": 16301 }, { "epoch": 2.0737819615824957, "ewc_loss": 0.06674657016992569, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032078599906526506, "grad_norm": 7.907871723175049, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8573114275932312, "num_tokens": 621948995.0, "step": 16302 }, { "epoch": 2.0739091718610863, "ewc_loss": 0.06653115153312683, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.000318631820846349, "grad_norm": 7.818263053894043, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8791669011116028, "num_tokens": 621991109.0, "step": 16303 }, { "epoch": 2.074036382139677, "ewc_loss": 0.06677640229463577, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032108434243127704, "grad_norm": 7.846573352813721, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8728857636451721, "num_tokens": 622026585.0, "step": 16304 }, { "epoch": 2.0741635924182673, "ewc_loss": 0.06668549031019211, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003201751969754696, "grad_norm": 7.877671241760254, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8773537278175354, "num_tokens": 622065208.0, "step": 16305 }, { "epoch": 2.074290802696858, "ewc_loss": 0.06672647595405579, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003205850953236222, "grad_norm": 7.861995220184326, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8567788004875183, "num_tokens": 622104838.0, "step": 16306 }, { "epoch": 2.0744180129754484, "ewc_loss": 0.066746786236763, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003207881236448884, "grad_norm": 7.928258419036865, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8587462902069092, "num_tokens": 622136879.0, "step": 16307 }, { "epoch": 2.074545223254039, "ewc_loss": 0.06663012504577637, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003196215839125216, "grad_norm": 7.837304592132568, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.879475474357605, "num_tokens": 622175938.0, "step": 16308 }, { "epoch": 2.0746724335326294, "ewc_loss": 0.0668085515499115, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003214058524463326, "grad_norm": 7.854691028594971, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8748469352722168, "num_tokens": 622216663.0, "step": 16309 }, { "epoch": 2.07479964381122, "ewc_loss": 0.06667858362197876, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032010613358579576, "grad_norm": 7.884039402008057, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8756104707717896, "num_tokens": 622250450.0, "step": 16310 }, { "epoch": 2.0749268540898105, "ewc_loss": 0.06674063205718994, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032072668545879424, "grad_norm": 7.897622108459473, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8597795963287354, "num_tokens": 622291919.0, "step": 16311 }, { "epoch": 2.075054064368401, "ewc_loss": 0.06672601401805878, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003205804678145796, "grad_norm": 7.904473781585693, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8682047128677368, "num_tokens": 622325892.0, "step": 16312 }, { "epoch": 2.0751812746469915, "ewc_loss": 0.06664222478866577, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003197425394318998, "grad_norm": 7.89564323425293, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8667786121368408, "num_tokens": 622356966.0, "step": 16313 }, { "epoch": 2.075308484925582, "ewc_loss": 0.06669563800096512, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003202767111361027, "grad_norm": 7.865062713623047, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8713819980621338, "num_tokens": 622389931.0, "step": 16314 }, { "epoch": 2.0754356952041726, "ewc_loss": 0.06668177247047424, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032013808959163725, "grad_norm": 7.841537952423096, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8654257655143738, "num_tokens": 622427817.0, "step": 16315 }, { "epoch": 2.075562905482763, "ewc_loss": 0.06658926606178284, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003192129952367395, "grad_norm": 7.809939861297607, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.858158528804779, "num_tokens": 622470353.0, "step": 16316 }, { "epoch": 2.0756901157613536, "ewc_loss": 0.06678850948810577, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003212053852621466, "grad_norm": 7.902787208557129, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8840569257736206, "num_tokens": 622507577.0, "step": 16317 }, { "epoch": 2.075817326039944, "ewc_loss": 0.06656783074140549, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003189986164215952, "grad_norm": 8.23302936553955, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8617038726806641, "num_tokens": 622544746.0, "step": 16318 }, { "epoch": 2.0759445363185347, "ewc_loss": 0.06622949242591858, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031561520881950855, "grad_norm": 7.765759468078613, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8601884841918945, "num_tokens": 622583688.0, "step": 16319 }, { "epoch": 2.0760717465971252, "ewc_loss": 0.06694382429122925, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032275854027830064, "grad_norm": 7.954558849334717, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8689002394676208, "num_tokens": 622619911.0, "step": 16320 }, { "epoch": 2.0761989568757158, "ewc_loss": 0.06619127094745636, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031523307552561164, "grad_norm": 7.650053977966309, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8751134872436523, "num_tokens": 622665732.0, "step": 16321 }, { "epoch": 2.0763261671543063, "ewc_loss": 0.0671592503786087, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.000324912864016369, "grad_norm": 7.971426963806152, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.875117301940918, "num_tokens": 622705746.0, "step": 16322 }, { "epoch": 2.0764533774328964, "ewc_loss": 0.06630818545818329, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003164021181873977, "grad_norm": 7.72704553604126, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8682304620742798, "num_tokens": 622745481.0, "step": 16323 }, { "epoch": 2.076580587711487, "ewc_loss": 0.06706545501947403, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032397484756074846, "grad_norm": 7.923879146575928, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8711102604866028, "num_tokens": 622786910.0, "step": 16324 }, { "epoch": 2.0767077979900774, "ewc_loss": 0.06644278764724731, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031774817034602165, "grad_norm": 7.775496006011963, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8634494543075562, "num_tokens": 622826976.0, "step": 16325 }, { "epoch": 2.076835008268668, "ewc_loss": 0.06691519916057587, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003224722750019282, "grad_norm": 7.90580940246582, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8679068088531494, "num_tokens": 622858945.0, "step": 16326 }, { "epoch": 2.0769622185472585, "ewc_loss": 0.06652862578630447, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031860655872151256, "grad_norm": 7.806504726409912, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8794004917144775, "num_tokens": 622893229.0, "step": 16327 }, { "epoch": 2.077089428825849, "ewc_loss": 0.06677928566932678, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032111318432725966, "grad_norm": 7.857892036437988, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8768871426582336, "num_tokens": 622933701.0, "step": 16328 }, { "epoch": 2.0772166391044395, "ewc_loss": 0.06665703654289246, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.000319890707032755, "grad_norm": 7.829592227935791, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.854320228099823, "num_tokens": 622971226.0, "step": 16329 }, { "epoch": 2.07734384938303, "ewc_loss": 0.06674143671989441, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003207346599083394, "grad_norm": 7.93382453918457, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8635667562484741, "num_tokens": 623009232.0, "step": 16330 }, { "epoch": 2.0774710596616206, "ewc_loss": 0.06661699712276459, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031949032563716173, "grad_norm": 7.813679218292236, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8602594137191772, "num_tokens": 623048005.0, "step": 16331 }, { "epoch": 2.077598269940211, "ewc_loss": 0.06662628054618835, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003195831086486578, "grad_norm": 7.803991794586182, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.866757869720459, "num_tokens": 623088474.0, "step": 16332 }, { "epoch": 2.0777254802188017, "ewc_loss": 0.06672771275043488, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003205974935553968, "grad_norm": 7.857293605804443, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8685470819473267, "num_tokens": 623123700.0, "step": 16333 }, { "epoch": 2.077852690497392, "ewc_loss": 0.06662967801094055, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003196170728188008, "grad_norm": 7.823270797729492, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8704389333724976, "num_tokens": 623161406.0, "step": 16334 }, { "epoch": 2.0779799007759827, "ewc_loss": 0.0667228251695633, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003205485118087381, "grad_norm": 7.87760066986084, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.85830157995224, "num_tokens": 623199770.0, "step": 16335 }, { "epoch": 2.0781071110545732, "ewc_loss": 0.06667270511388779, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003200473729521036, "grad_norm": 7.8623456954956055, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8714662790298462, "num_tokens": 623234635.0, "step": 16336 }, { "epoch": 2.0782343213331638, "ewc_loss": 0.06676091253757477, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003209294518455863, "grad_norm": 7.838977336883545, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8687896728515625, "num_tokens": 623273695.0, "step": 16337 }, { "epoch": 2.0783615316117543, "ewc_loss": 0.06664583086967468, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003197785699740052, "grad_norm": 7.801090240478516, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8522788286209106, "num_tokens": 623315278.0, "step": 16338 }, { "epoch": 2.078488741890345, "ewc_loss": 0.066790372133255, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003212240117136389, "grad_norm": 7.921849250793457, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8754252195358276, "num_tokens": 623351753.0, "step": 16339 }, { "epoch": 2.0786159521689354, "ewc_loss": 0.06664568185806274, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031977708567865193, "grad_norm": 7.890464782714844, "learning_rate": 1e-06, "loss": 0.5264, "mean_token_accuracy": 0.8507946729660034, "num_tokens": 623387812.0, "step": 16340 }, { "epoch": 2.078743162447526, "ewc_loss": 0.0668099895119667, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032142020063474774, "grad_norm": 7.82637357711792, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8705281615257263, "num_tokens": 623426784.0, "step": 16341 }, { "epoch": 2.0788703727261164, "ewc_loss": 0.06674502789974213, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.000320770574035123, "grad_norm": 7.935990333557129, "learning_rate": 1e-06, "loss": 0.5415, "mean_token_accuracy": 0.8445471525192261, "num_tokens": 623463362.0, "step": 16342 }, { "epoch": 2.078997583004707, "ewc_loss": 0.06659495085477829, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00031926980591379106, "grad_norm": 7.851428031921387, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8597083687782288, "num_tokens": 623500108.0, "step": 16343 }, { "epoch": 2.0791247932832975, "ewc_loss": 0.06674425303936005, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003207628324162215, "grad_norm": 7.8399481773376465, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8634854555130005, "num_tokens": 623537159.0, "step": 16344 }, { "epoch": 2.079252003561888, "ewc_loss": 0.06673634052276611, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032068375730887055, "grad_norm": 7.815654277801514, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8865113258361816, "num_tokens": 623576256.0, "step": 16345 }, { "epoch": 2.0793792138404785, "ewc_loss": 0.06694184243679047, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032029737485572696, "grad_norm": 7.865166664123535, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8624483942985535, "num_tokens": 623612454.0, "step": 16346 }, { "epoch": 2.0795064241190686, "ewc_loss": 0.06694340705871582, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032031297450885177, "grad_norm": 7.801539421081543, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8808407783508301, "num_tokens": 623654392.0, "step": 16347 }, { "epoch": 2.079633634397659, "ewc_loss": 0.06700111925601959, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003208900452591479, "grad_norm": 7.864821910858154, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8495180010795593, "num_tokens": 623697350.0, "step": 16348 }, { "epoch": 2.0797608446762497, "ewc_loss": 0.06691475212574005, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032002641819417477, "grad_norm": 7.844884872436523, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.871006190776825, "num_tokens": 623736272.0, "step": 16349 }, { "epoch": 2.07988805495484, "ewc_loss": 0.06693252921104431, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032020421349443495, "grad_norm": 7.854060173034668, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8584339618682861, "num_tokens": 623781319.0, "step": 16350 }, { "epoch": 2.0800152652334307, "ewc_loss": 0.06668627262115479, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003201830550096929, "grad_norm": 7.848728656768799, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8676812052726746, "num_tokens": 623821944.0, "step": 16351 }, { "epoch": 2.0801424755120212, "ewc_loss": 0.06684714555740356, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003217917401343584, "grad_norm": 7.88265323638916, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8623058795928955, "num_tokens": 623858399.0, "step": 16352 }, { "epoch": 2.0802696857906118, "ewc_loss": 0.0669621154665947, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032050005393102765, "grad_norm": 7.86042594909668, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8697368502616882, "num_tokens": 623897839.0, "step": 16353 }, { "epoch": 2.0803968960692023, "ewc_loss": 0.06678742170333862, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003211945586372167, "grad_norm": 7.931060791015625, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8507547378540039, "num_tokens": 623933127.0, "step": 16354 }, { "epoch": 2.080524106347793, "ewc_loss": 0.06670039892196655, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032032429589889944, "grad_norm": 7.8613176345825195, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8657327890396118, "num_tokens": 623972140.0, "step": 16355 }, { "epoch": 2.0806513166263834, "ewc_loss": 0.06677551567554474, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032107546576298773, "grad_norm": 7.868380069732666, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8563957214355469, "num_tokens": 624008875.0, "step": 16356 }, { "epoch": 2.080778526904974, "ewc_loss": 0.06706571578979492, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032153603387996554, "grad_norm": 7.919179916381836, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8654261827468872, "num_tokens": 624045162.0, "step": 16357 }, { "epoch": 2.0809057371835644, "ewc_loss": 0.06689290702342987, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003198079648427665, "grad_norm": 7.817078590393066, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8786311745643616, "num_tokens": 624085275.0, "step": 16358 }, { "epoch": 2.081032947462155, "ewc_loss": 0.06705442070960999, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003214231401216239, "grad_norm": 7.893486499786377, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8647441267967224, "num_tokens": 624127479.0, "step": 16359 }, { "epoch": 2.0811601577407455, "ewc_loss": 0.06663189828395844, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.0003196392790414393, "grad_norm": 7.819272994995117, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8709050416946411, "num_tokens": 624165231.0, "step": 16360 }, { "epoch": 2.081287368019336, "ewc_loss": 0.06685872375965118, "ewc_loss_diag": 3.457069396972656e-05, "ewc_loss_parallel": 0.00032190754427574575, "grad_norm": 7.943981647491455, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8573791980743408, "num_tokens": 624205109.0, "step": 16361 }, { "epoch": 2.0814145782979265, "ewc_loss": 0.06680688261985779, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003189477720297873, "grad_norm": 7.842281341552734, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8625639081001282, "num_tokens": 624241849.0, "step": 16362 }, { "epoch": 2.081541788576517, "ewc_loss": 0.06702631711959839, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003211420262232423, "grad_norm": 7.933150291442871, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8677687644958496, "num_tokens": 624274174.0, "step": 16363 }, { "epoch": 2.0816689988551076, "ewc_loss": 0.06685661524534225, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00031944504007697105, "grad_norm": 7.839047431945801, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8652702569961548, "num_tokens": 624311526.0, "step": 16364 }, { "epoch": 2.081796209133698, "ewc_loss": 0.06700831651687622, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003209620772395283, "grad_norm": 7.893667697906494, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8521896600723267, "num_tokens": 624346657.0, "step": 16365 }, { "epoch": 2.0819234194122886, "ewc_loss": 0.06686528027057648, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00031953174038790166, "grad_norm": 7.846624851226807, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8515772223472595, "num_tokens": 624386563.0, "step": 16366 }, { "epoch": 2.082050629690879, "ewc_loss": 0.06701343506574631, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003210132708773017, "grad_norm": 7.83475399017334, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8761814832687378, "num_tokens": 624427299.0, "step": 16367 }, { "epoch": 2.0821778399694697, "ewc_loss": 0.06699313223361969, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003208102425560355, "grad_norm": 7.893285274505615, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8499623537063599, "num_tokens": 624461253.0, "step": 16368 }, { "epoch": 2.0823050502480602, "ewc_loss": 0.06694644689559937, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003203433589078486, "grad_norm": 7.817765712738037, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8749606609344482, "num_tokens": 624498699.0, "step": 16369 }, { "epoch": 2.0824322605266508, "ewc_loss": 0.06732690334320068, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032170655322261155, "grad_norm": 7.86274528503418, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8646104335784912, "num_tokens": 624534655.0, "step": 16370 }, { "epoch": 2.082559470805241, "ewc_loss": 0.06693349778652191, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003202139341738075, "grad_norm": 7.813806056976318, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8715723752975464, "num_tokens": 624575342.0, "step": 16371 }, { "epoch": 2.0826866810838314, "ewc_loss": 0.06734426319599152, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003218801284674555, "grad_norm": 7.875864505767822, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8717986345291138, "num_tokens": 624608493.0, "step": 16372 }, { "epoch": 2.082813891362422, "ewc_loss": 0.06694930791854858, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032037196797318757, "grad_norm": 7.83902645111084, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.848278284072876, "num_tokens": 624655777.0, "step": 16373 }, { "epoch": 2.0829411016410124, "ewc_loss": 0.06700265407562256, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003209054993931204, "grad_norm": 7.792097091674805, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8764594793319702, "num_tokens": 624699349.0, "step": 16374 }, { "epoch": 2.083068311919603, "ewc_loss": 0.06711205095052719, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032199942506849766, "grad_norm": 7.918787956237793, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8743415474891663, "num_tokens": 624730146.0, "step": 16375 }, { "epoch": 2.0831955221981935, "ewc_loss": 0.06687180697917938, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00031959693296812475, "grad_norm": 7.861418724060059, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.858629584312439, "num_tokens": 624773186.0, "step": 16376 }, { "epoch": 2.083322732476784, "ewc_loss": 0.06702980399131775, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003211769799236208, "grad_norm": 7.8322978019714355, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8645106554031372, "num_tokens": 624810256.0, "step": 16377 }, { "epoch": 2.0834499427553745, "ewc_loss": 0.06704610586166382, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032133993227034807, "grad_norm": 7.875078201293945, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8564494848251343, "num_tokens": 624848166.0, "step": 16378 }, { "epoch": 2.083577153033965, "ewc_loss": 0.0670018345117569, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032089726300910115, "grad_norm": 7.833871841430664, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8649675846099854, "num_tokens": 624887601.0, "step": 16379 }, { "epoch": 2.0837043633125556, "ewc_loss": 0.06714460998773575, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032232500961981714, "grad_norm": 7.855325222015381, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8650026321411133, "num_tokens": 624921160.0, "step": 16380 }, { "epoch": 2.083831573591146, "ewc_loss": 0.0671984851360321, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003204223175998777, "grad_norm": 7.858761787414551, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8586585521697998, "num_tokens": 624959053.0, "step": 16381 }, { "epoch": 2.0839587838697367, "ewc_loss": 0.06725583225488663, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032099580857902765, "grad_norm": 7.834782123565674, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8808022737503052, "num_tokens": 624995272.0, "step": 16382 }, { "epoch": 2.084085994148327, "ewc_loss": 0.06731951236724854, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032163257128559053, "grad_norm": 7.826123237609863, "learning_rate": 1e-06, "loss": 0.4007, "mean_token_accuracy": 0.8829184174537659, "num_tokens": 625033388.0, "step": 16383 }, { "epoch": 2.0842132044269177, "ewc_loss": 0.0673273578286171, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032171106431633234, "grad_norm": 7.899057865142822, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8557038307189941, "num_tokens": 625071103.0, "step": 16384 }, { "epoch": 2.0843404147055082, "ewc_loss": 0.0672132670879364, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003205701650585979, "grad_norm": 7.8438944816589355, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8786571025848389, "num_tokens": 625109562.0, "step": 16385 }, { "epoch": 2.0844676249840988, "ewc_loss": 0.06734833121299744, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003219208447262645, "grad_norm": 7.894902229309082, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8719401359558105, "num_tokens": 625147438.0, "step": 16386 }, { "epoch": 2.0845948352626893, "ewc_loss": 0.06718532741069794, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003202907682862133, "grad_norm": 7.890296459197998, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8751890659332275, "num_tokens": 625184293.0, "step": 16387 }, { "epoch": 2.08472204554128, "ewc_loss": 0.06722219288349152, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032065939740277827, "grad_norm": 7.879456043243408, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8676460981369019, "num_tokens": 625218201.0, "step": 16388 }, { "epoch": 2.0848492558198704, "ewc_loss": 0.067233145236969, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032076897332444787, "grad_norm": 7.862218856811523, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.866558313369751, "num_tokens": 625252072.0, "step": 16389 }, { "epoch": 2.084976466098461, "ewc_loss": 0.06691564619541168, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.000320035353070125, "grad_norm": 7.898565292358398, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.879697859287262, "num_tokens": 625286990.0, "step": 16390 }, { "epoch": 2.0851036763770514, "ewc_loss": 0.06718967109918594, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003203342203050852, "grad_norm": 7.8387885093688965, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8773616552352905, "num_tokens": 625326579.0, "step": 16391 }, { "epoch": 2.085230886655642, "ewc_loss": 0.06732873618602753, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003217248304281384, "grad_norm": 7.9151835441589355, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8722851276397705, "num_tokens": 625367317.0, "step": 16392 }, { "epoch": 2.0853580969342325, "ewc_loss": 0.06690504401922226, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003199293278157711, "grad_norm": 7.941045761108398, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8607038855552673, "num_tokens": 625401047.0, "step": 16393 }, { "epoch": 2.085485307212823, "ewc_loss": 0.067209891974926, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032053643371909857, "grad_norm": 7.877830505371094, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8515827655792236, "num_tokens": 625441878.0, "step": 16394 }, { "epoch": 2.0856125174914135, "ewc_loss": 0.06702078133821487, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00031864529591985047, "grad_norm": 7.865278244018555, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.863928496837616, "num_tokens": 625477007.0, "step": 16395 }, { "epoch": 2.0857397277700036, "ewc_loss": 0.06717294454574585, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003201669896952808, "grad_norm": 7.882739543914795, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8786816000938416, "num_tokens": 625519908.0, "step": 16396 }, { "epoch": 2.085866938048594, "ewc_loss": 0.06678412109613419, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00031872012186795473, "grad_norm": 7.8420090675354, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8692567944526672, "num_tokens": 625554946.0, "step": 16397 }, { "epoch": 2.0859941483271847, "ewc_loss": 0.06690074503421783, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00031988637056201696, "grad_norm": 7.885184288024902, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8767321705818176, "num_tokens": 625592121.0, "step": 16398 }, { "epoch": 2.086121358605775, "ewc_loss": 0.06682686507701874, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00031914759892970324, "grad_norm": 7.805169105529785, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8651071786880493, "num_tokens": 625630596.0, "step": 16399 }, { "epoch": 2.0862485688843657, "ewc_loss": 0.06715820729732513, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032001957879401743, "grad_norm": 7.940457820892334, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8489958047866821, "num_tokens": 625669535.0, "step": 16400 }, { "epoch": 2.0863757791629562, "ewc_loss": 0.06680473685264587, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003189263225067407, "grad_norm": 7.888205051422119, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8651740550994873, "num_tokens": 625710072.0, "step": 16401 }, { "epoch": 2.0865029894415468, "ewc_loss": 0.06693879514932632, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032026684493757784, "grad_norm": 7.920636177062988, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8746371269226074, "num_tokens": 625746328.0, "step": 16402 }, { "epoch": 2.0866301997201373, "ewc_loss": 0.066679447889328, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00031767343170940876, "grad_norm": 7.769265651702881, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.867652952671051, "num_tokens": 625787108.0, "step": 16403 }, { "epoch": 2.086757409998728, "ewc_loss": 0.06694425642490387, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003203214437235147, "grad_norm": 7.880423069000244, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8709399104118347, "num_tokens": 625827887.0, "step": 16404 }, { "epoch": 2.0868846202773184, "ewc_loss": 0.06675421446561813, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00031842105090618134, "grad_norm": 7.812532901763916, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8671720623970032, "num_tokens": 625863475.0, "step": 16405 }, { "epoch": 2.087011830555909, "ewc_loss": 0.06697431951761246, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003206221153959632, "grad_norm": 7.8799896240234375, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8825984597206116, "num_tokens": 625902369.0, "step": 16406 }, { "epoch": 2.0871390408344994, "ewc_loss": 0.06715637445449829, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032000127248466015, "grad_norm": 7.8594279289245605, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8405393362045288, "num_tokens": 625941054.0, "step": 16407 }, { "epoch": 2.08726625111309, "ewc_loss": 0.06692388653755188, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032011777511797845, "grad_norm": 7.84307861328125, "learning_rate": 1e-06, "loss": 0.5278, "mean_token_accuracy": 0.8492975831031799, "num_tokens": 625981113.0, "step": 16408 }, { "epoch": 2.0873934613916805, "ewc_loss": 0.06695067882537842, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032038570498116314, "grad_norm": 7.8829145431518555, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.86236572265625, "num_tokens": 626014758.0, "step": 16409 }, { "epoch": 2.087520671670271, "ewc_loss": 0.06682534515857697, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00031913237762637436, "grad_norm": 7.8568806648254395, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.852307915687561, "num_tokens": 626054442.0, "step": 16410 }, { "epoch": 2.0876478819488615, "ewc_loss": 0.06706763803958893, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032155527151189744, "grad_norm": 7.990538120269775, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8482754230499268, "num_tokens": 626093993.0, "step": 16411 }, { "epoch": 2.087775092227452, "ewc_loss": 0.06673122942447662, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003181911597494036, "grad_norm": 7.831961631774902, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8495259284973145, "num_tokens": 626131242.0, "step": 16412 }, { "epoch": 2.0879023025060426, "ewc_loss": 0.06708058714866638, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.000321684725349769, "grad_norm": 7.9711408615112305, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8532099723815918, "num_tokens": 626162137.0, "step": 16413 }, { "epoch": 2.088029512784633, "ewc_loss": 0.06702479720115662, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003186855174135417, "grad_norm": 7.823748588562012, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8669388890266418, "num_tokens": 626201031.0, "step": 16414 }, { "epoch": 2.0881567230632236, "ewc_loss": 0.06736315786838531, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003220690996386111, "grad_norm": 7.913555145263672, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8754451870918274, "num_tokens": 626243647.0, "step": 16415 }, { "epoch": 2.088283933341814, "ewc_loss": 0.06718741357326508, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003203116648364812, "grad_norm": 7.886113166809082, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8726921081542969, "num_tokens": 626281346.0, "step": 16416 }, { "epoch": 2.0884111436204047, "ewc_loss": 0.0670933946967125, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032181284041143954, "grad_norm": 7.993135929107666, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8764759302139282, "num_tokens": 626315964.0, "step": 16417 }, { "epoch": 2.0885383538989952, "ewc_loss": 0.06705357134342194, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003189732669852674, "grad_norm": 7.763801574707031, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8597031831741333, "num_tokens": 626367920.0, "step": 16418 }, { "epoch": 2.0886655641775858, "ewc_loss": 0.06715060770511627, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032238499261438847, "grad_norm": 7.880443572998047, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8632944822311401, "num_tokens": 626412123.0, "step": 16419 }, { "epoch": 2.0887927744561763, "ewc_loss": 0.06709210574626923, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00031935860170051455, "grad_norm": 7.834713459014893, "learning_rate": 1e-06, "loss": 0.5468, "mean_token_accuracy": 0.8376619219779968, "num_tokens": 626451560.0, "step": 16420 }, { "epoch": 2.0889199847347664, "ewc_loss": 0.06737253069877625, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003221627848688513, "grad_norm": 7.881249904632568, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8603690266609192, "num_tokens": 626489361.0, "step": 16421 }, { "epoch": 2.089047195013357, "ewc_loss": 0.06730885058641434, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032152599305845797, "grad_norm": 7.849112510681152, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8811972141265869, "num_tokens": 626527150.0, "step": 16422 }, { "epoch": 2.0891744052919474, "ewc_loss": 0.06722355633974075, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032067307620309293, "grad_norm": 7.922612190246582, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8588486909866333, "num_tokens": 626568110.0, "step": 16423 }, { "epoch": 2.089301615570538, "ewc_loss": 0.06721073389053345, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003205448156222701, "grad_norm": 7.861208915710449, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8672201633453369, "num_tokens": 626607215.0, "step": 16424 }, { "epoch": 2.0894288258491285, "ewc_loss": 0.06724122166633606, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032084976555779576, "grad_norm": 7.84264612197876, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8711352348327637, "num_tokens": 626644424.0, "step": 16425 }, { "epoch": 2.089556036127719, "ewc_loss": 0.06730401515960693, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003214776806998998, "grad_norm": 7.869859218597412, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8450512290000916, "num_tokens": 626689509.0, "step": 16426 }, { "epoch": 2.0896832464063095, "ewc_loss": 0.06717240810394287, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003201615472789854, "grad_norm": 7.805837631225586, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8552658557891846, "num_tokens": 626733866.0, "step": 16427 }, { "epoch": 2.0898104566849, "ewc_loss": 0.06718836724758148, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032276258571073413, "grad_norm": 7.880622386932373, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8736145496368408, "num_tokens": 626767465.0, "step": 16428 }, { "epoch": 2.0899376669634906, "ewc_loss": 0.06695660203695297, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003204449312761426, "grad_norm": 7.837686061859131, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8523211479187012, "num_tokens": 626806049.0, "step": 16429 }, { "epoch": 2.090064877242081, "ewc_loss": 0.06734403967857361, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003218778583686799, "grad_norm": 7.813594341278076, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8515276908874512, "num_tokens": 626848917.0, "step": 16430 }, { "epoch": 2.0901920875206716, "ewc_loss": 0.06742997467517853, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032273726537823677, "grad_norm": 7.902446746826172, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8531186580657959, "num_tokens": 626884442.0, "step": 16431 }, { "epoch": 2.090319297799262, "ewc_loss": 0.06705767661333084, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003214556782040745, "grad_norm": 7.801745891571045, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8631751537322998, "num_tokens": 626922517.0, "step": 16432 }, { "epoch": 2.0904465080778527, "ewc_loss": 0.06721276044845581, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032300656312145293, "grad_norm": 7.849203586578369, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8637278079986572, "num_tokens": 626962880.0, "step": 16433 }, { "epoch": 2.0905737183564432, "ewc_loss": 0.06710753589868546, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003219542559236288, "grad_norm": 7.8071818351745605, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8726395964622498, "num_tokens": 627008115.0, "step": 16434 }, { "epoch": 2.0907009286350338, "ewc_loss": 0.06722799688577652, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032315889257006347, "grad_norm": 7.879441738128662, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8772916793823242, "num_tokens": 627043935.0, "step": 16435 }, { "epoch": 2.0908281389136243, "ewc_loss": 0.06710836291313171, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.000321962550515309, "grad_norm": 7.853277683258057, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8659012317657471, "num_tokens": 627078495.0, "step": 16436 }, { "epoch": 2.090955349192215, "ewc_loss": 0.06732645630836487, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032170210033655167, "grad_norm": 7.882633686065674, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8695287704467773, "num_tokens": 627116652.0, "step": 16437 }, { "epoch": 2.0910825594708053, "ewc_loss": 0.06709872931241989, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003218662168364972, "grad_norm": 7.8557281494140625, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8737950325012207, "num_tokens": 627152326.0, "step": 16438 }, { "epoch": 2.091209769749396, "ewc_loss": 0.06706923246383667, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003215711913071573, "grad_norm": 7.822765350341797, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8763956427574158, "num_tokens": 627189502.0, "step": 16439 }, { "epoch": 2.0913369800279864, "ewc_loss": 0.06730452179908752, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032148274476639926, "grad_norm": 7.822206497192383, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8793281316757202, "num_tokens": 627225438.0, "step": 16440 }, { "epoch": 2.091464190306577, "ewc_loss": 0.06732933968305588, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003217308840248734, "grad_norm": 7.829378128051758, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8618987798690796, "num_tokens": 627258154.0, "step": 16441 }, { "epoch": 2.0915914005851675, "ewc_loss": 0.06717821955680847, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032266112975776196, "grad_norm": 7.862490653991699, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8583219051361084, "num_tokens": 627301804.0, "step": 16442 }, { "epoch": 2.091718610863758, "ewc_loss": 0.06747332215309143, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032317073782905936, "grad_norm": 7.86025333404541, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.870572566986084, "num_tokens": 627344301.0, "step": 16443 }, { "epoch": 2.0918458211423485, "ewc_loss": 0.06730090081691742, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003214465396013111, "grad_norm": 7.840255260467529, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8739654421806335, "num_tokens": 627381068.0, "step": 16444 }, { "epoch": 2.0919730314209386, "ewc_loss": 0.06740154325962067, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000322452891850844, "grad_norm": 7.799023628234863, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8592056035995483, "num_tokens": 627423410.0, "step": 16445 }, { "epoch": 2.092100241699529, "ewc_loss": 0.06743308901786804, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003227684064768255, "grad_norm": 7.878564834594727, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8747363090515137, "num_tokens": 627456593.0, "step": 16446 }, { "epoch": 2.0922274519781197, "ewc_loss": 0.06749774515628815, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032341497717425227, "grad_norm": 7.8734893798828125, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8677647709846497, "num_tokens": 627493742.0, "step": 16447 }, { "epoch": 2.09235466225671, "ewc_loss": 0.06748432666063309, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032328077941201627, "grad_norm": 7.876364707946777, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8711516261100769, "num_tokens": 627529809.0, "step": 16448 }, { "epoch": 2.0924818725353007, "ewc_loss": 0.06740561872720718, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032249369542114437, "grad_norm": 7.8287434577941895, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8685323596000671, "num_tokens": 627570693.0, "step": 16449 }, { "epoch": 2.0926090828138912, "ewc_loss": 0.06758753955364227, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032431285944767296, "grad_norm": 7.907686710357666, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8742011189460754, "num_tokens": 627607608.0, "step": 16450 }, { "epoch": 2.0927362930924818, "ewc_loss": 0.06726899743080139, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032112750341184437, "grad_norm": 7.8523268699646, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8639813661575317, "num_tokens": 627642102.0, "step": 16451 }, { "epoch": 2.0928635033710723, "ewc_loss": 0.06744712591171265, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003229087160434574, "grad_norm": 7.903902053833008, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8594713807106018, "num_tokens": 627678838.0, "step": 16452 }, { "epoch": 2.092990713649663, "ewc_loss": 0.0673375055193901, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032181254937313497, "grad_norm": 7.84827995300293, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8697896003723145, "num_tokens": 627714952.0, "step": 16453 }, { "epoch": 2.0931179239282534, "ewc_loss": 0.06733550131320953, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003217924677301198, "grad_norm": 7.844699382781982, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8569706678390503, "num_tokens": 627754710.0, "step": 16454 }, { "epoch": 2.093245134206844, "ewc_loss": 0.06733927875757217, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003218302736058831, "grad_norm": 7.858185768127441, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8705666065216064, "num_tokens": 627791504.0, "step": 16455 }, { "epoch": 2.0933723444854344, "ewc_loss": 0.0673520639538765, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032195815583691, "grad_norm": 7.8228888511657715, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8799082636833191, "num_tokens": 627829683.0, "step": 16456 }, { "epoch": 2.093499554764025, "ewc_loss": 0.06732876598834991, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003217252087779343, "grad_norm": 7.838798522949219, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8669561743736267, "num_tokens": 627869826.0, "step": 16457 }, { "epoch": 2.0936267650426155, "ewc_loss": 0.0674605667591095, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032304320484399796, "grad_norm": 7.865922451019287, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8621535301208496, "num_tokens": 627904165.0, "step": 16458 }, { "epoch": 2.093753975321206, "ewc_loss": 0.06743741035461426, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003228116256650537, "grad_norm": 7.903556823730469, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.871774435043335, "num_tokens": 627936559.0, "step": 16459 }, { "epoch": 2.0938811855997965, "ewc_loss": 0.06737284362316132, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003221658989787102, "grad_norm": 7.869124412536621, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.871709942817688, "num_tokens": 627971742.0, "step": 16460 }, { "epoch": 2.094008395878387, "ewc_loss": 0.06734944880008698, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032193196238949895, "grad_norm": 7.847959041595459, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8752106428146362, "num_tokens": 628013104.0, "step": 16461 }, { "epoch": 2.0941356061569776, "ewc_loss": 0.06738272309303284, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032226473558694124, "grad_norm": 7.892207145690918, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8624025583267212, "num_tokens": 628049378.0, "step": 16462 }, { "epoch": 2.094262816435568, "ewc_loss": 0.06732594966888428, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032169700716622174, "grad_norm": 7.831343173980713, "learning_rate": 1e-06, "loss": 0.3999, "mean_token_accuracy": 0.8842260837554932, "num_tokens": 628087213.0, "step": 16463 }, { "epoch": 2.0943900267141586, "ewc_loss": 0.0674184113740921, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003226216649636626, "grad_norm": 7.92827033996582, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8702637553215027, "num_tokens": 628122060.0, "step": 16464 }, { "epoch": 2.094517236992749, "ewc_loss": 0.06720565259456635, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032049405854195356, "grad_norm": 7.796174049377441, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8691181540489197, "num_tokens": 628166554.0, "step": 16465 }, { "epoch": 2.0946444472713397, "ewc_loss": 0.06752756983041763, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003237132041249424, "grad_norm": 7.966371059417725, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8642547726631165, "num_tokens": 628199953.0, "step": 16466 }, { "epoch": 2.09477165754993, "ewc_loss": 0.06713810563087463, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00031981858774088323, "grad_norm": 7.774021148681641, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8681842088699341, "num_tokens": 628237044.0, "step": 16467 }, { "epoch": 2.0948988678285207, "ewc_loss": 0.06757351011037827, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032417260808870196, "grad_norm": 7.940607070922852, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8692564964294434, "num_tokens": 628275773.0, "step": 16468 }, { "epoch": 2.095026078107111, "ewc_loss": 0.06710667163133621, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003195042081642896, "grad_norm": 7.8100385665893555, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8508772253990173, "num_tokens": 628314549.0, "step": 16469 }, { "epoch": 2.0951532883857014, "ewc_loss": 0.06758569180965424, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032429437851533294, "grad_norm": 7.950676441192627, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8609929084777832, "num_tokens": 628349417.0, "step": 16470 }, { "epoch": 2.095280498664292, "ewc_loss": 0.0671524778008461, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003199622733518481, "grad_norm": 7.793802738189697, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8713527321815491, "num_tokens": 628384435.0, "step": 16471 }, { "epoch": 2.0954077089428824, "ewc_loss": 0.06749945878982544, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003234320611227304, "grad_norm": 7.911186695098877, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8534947633743286, "num_tokens": 628424124.0, "step": 16472 }, { "epoch": 2.095534919221473, "ewc_loss": 0.0671926960349083, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032036445918492973, "grad_norm": 7.78600549697876, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8687372207641602, "num_tokens": 628462412.0, "step": 16473 }, { "epoch": 2.0956621295000635, "ewc_loss": 0.06735673546791077, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003220048383809626, "grad_norm": 7.903274059295654, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8493415117263794, "num_tokens": 628499066.0, "step": 16474 }, { "epoch": 2.095789339778654, "ewc_loss": 0.06721368432044983, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003205743560101837, "grad_norm": 7.824919700622559, "learning_rate": 1e-06, "loss": 0.4125, "mean_token_accuracy": 0.880262017250061, "num_tokens": 628532379.0, "step": 16475 }, { "epoch": 2.0959165500572445, "ewc_loss": 0.06746400147676468, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032307751826010644, "grad_norm": 7.846871852874756, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8761000037193298, "num_tokens": 628570497.0, "step": 16476 }, { "epoch": 2.096043760335835, "ewc_loss": 0.06748355180025101, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032083160476759076, "grad_norm": 7.77919864654541, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8703750371932983, "num_tokens": 628608298.0, "step": 16477 }, { "epoch": 2.0961709706144256, "ewc_loss": 0.06763777136802673, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003223737876396626, "grad_norm": 7.8579607009887695, "learning_rate": 1e-06, "loss": 0.3954, "mean_token_accuracy": 0.88484787940979, "num_tokens": 628641217.0, "step": 16478 }, { "epoch": 2.096298180893016, "ewc_loss": 0.06725870072841644, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000321024504955858, "grad_norm": 7.789847373962402, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.867385983467102, "num_tokens": 628682480.0, "step": 16479 }, { "epoch": 2.0964253911716066, "ewc_loss": 0.06773119419813156, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032330804970115423, "grad_norm": 7.891392707824707, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8645275831222534, "num_tokens": 628718359.0, "step": 16480 }, { "epoch": 2.096552601450197, "ewc_loss": 0.06746994704008102, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000320695573464036, "grad_norm": 7.80976676940918, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8742063045501709, "num_tokens": 628752214.0, "step": 16481 }, { "epoch": 2.0966798117287877, "ewc_loss": 0.06771092116832733, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003231053124181926, "grad_norm": 7.879668712615967, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8794059157371521, "num_tokens": 628787882.0, "step": 16482 }, { "epoch": 2.0968070220073782, "ewc_loss": 0.06756577640771866, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003216538461856544, "grad_norm": 7.876816749572754, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8624840378761292, "num_tokens": 628824910.0, "step": 16483 }, { "epoch": 2.0969342322859688, "ewc_loss": 0.06733806431293488, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003218181955162436, "grad_norm": 7.800908088684082, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8789575099945068, "num_tokens": 628860657.0, "step": 16484 }, { "epoch": 2.0970614425645593, "ewc_loss": 0.06741563230752945, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000322593841701746, "grad_norm": 7.885242938995361, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8488473892211914, "num_tokens": 628900619.0, "step": 16485 }, { "epoch": 2.09718865284315, "ewc_loss": 0.06729383021593094, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003213758172933012, "grad_norm": 7.856754302978516, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8780348300933838, "num_tokens": 628944304.0, "step": 16486 }, { "epoch": 2.0973158631217403, "ewc_loss": 0.0675741583108902, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003217377234250307, "grad_norm": 7.815639495849609, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8708896040916443, "num_tokens": 628980490.0, "step": 16487 }, { "epoch": 2.097443073400331, "ewc_loss": 0.06761722266674042, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000322168372804299, "grad_norm": 7.897676467895508, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8709008693695068, "num_tokens": 629011924.0, "step": 16488 }, { "epoch": 2.0975702836789214, "ewc_loss": 0.06759810447692871, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003219771897420287, "grad_norm": 7.891810417175293, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8423250913619995, "num_tokens": 629049514.0, "step": 16489 }, { "epoch": 2.097697493957512, "ewc_loss": 0.06767389178276062, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003227350243832916, "grad_norm": 7.804280757904053, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8843486309051514, "num_tokens": 629085374.0, "step": 16490 }, { "epoch": 2.0978247042361025, "ewc_loss": 0.06737276911735535, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003221651422791183, "grad_norm": 7.890279769897461, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8636198043823242, "num_tokens": 629126307.0, "step": 16491 }, { "epoch": 2.097951914514693, "ewc_loss": 0.06729981303215027, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032143565476872027, "grad_norm": 7.820530891418457, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8667700290679932, "num_tokens": 629159693.0, "step": 16492 }, { "epoch": 2.0980791247932835, "ewc_loss": 0.0674472525715828, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032291002571582794, "grad_norm": 7.818329811096191, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8723047971725464, "num_tokens": 629198544.0, "step": 16493 }, { "epoch": 2.0982063350718736, "ewc_loss": 0.06729720532894135, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003214096068404615, "grad_norm": 7.838805198669434, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.858970046043396, "num_tokens": 629240956.0, "step": 16494 }, { "epoch": 2.098333545350464, "ewc_loss": 0.06749739497900009, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000323411455610767, "grad_norm": 7.860912322998047, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8677518367767334, "num_tokens": 629276196.0, "step": 16495 }, { "epoch": 2.0984607556290547, "ewc_loss": 0.0673113614320755, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003215510805603117, "grad_norm": 7.789663314819336, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8540446758270264, "num_tokens": 629311398.0, "step": 16496 }, { "epoch": 2.098587965907645, "ewc_loss": 0.06753136217594147, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000323751155519858, "grad_norm": 7.860995769500732, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8610926866531372, "num_tokens": 629349488.0, "step": 16497 }, { "epoch": 2.0987151761862357, "ewc_loss": 0.06732684373855591, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003217059711460024, "grad_norm": 7.768869876861572, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8805317878723145, "num_tokens": 629388023.0, "step": 16498 }, { "epoch": 2.0988423864648262, "ewc_loss": 0.06761863827705383, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003246238629799336, "grad_norm": 7.871714115142822, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8672486543655396, "num_tokens": 629432140.0, "step": 16499 }, { "epoch": 2.0989695967434168, "ewc_loss": 0.06728584319353104, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003212959272786975, "grad_norm": 7.758270740509033, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8529932498931885, "num_tokens": 629480690.0, "step": 16500 }, { "epoch": 2.0990968070220073, "ewc_loss": 0.06756894290447235, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003241269150748849, "grad_norm": 7.83103084564209, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8836842775344849, "num_tokens": 629522500.0, "step": 16501 }, { "epoch": 2.099224017300598, "ewc_loss": 0.06737124919891357, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003221500082872808, "grad_norm": 7.781967639923096, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.86821448802948, "num_tokens": 629563450.0, "step": 16502 }, { "epoch": 2.0993512275791884, "ewc_loss": 0.06758074462413788, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003242449020035565, "grad_norm": 7.838006019592285, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8765609264373779, "num_tokens": 629603753.0, "step": 16503 }, { "epoch": 2.099478437857779, "ewc_loss": 0.0674237310886383, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003226748085580766, "grad_norm": 7.831505298614502, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8691845536231995, "num_tokens": 629637828.0, "step": 16504 }, { "epoch": 2.0996056481363694, "ewc_loss": 0.06723161786794662, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032319509773515165, "grad_norm": 7.8281450271606445, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8732941150665283, "num_tokens": 629674636.0, "step": 16505 }, { "epoch": 2.09973285841496, "ewc_loss": 0.06726053357124329, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003234842442907393, "grad_norm": 7.842947959899902, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.857728123664856, "num_tokens": 629717046.0, "step": 16506 }, { "epoch": 2.0998600686935505, "ewc_loss": 0.0673816055059433, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003222535888198763, "grad_norm": 7.825368881225586, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8780950307846069, "num_tokens": 629750331.0, "step": 16507 }, { "epoch": 2.099987278972141, "ewc_loss": 0.06720089912414551, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032288787770085037, "grad_norm": 7.8032636642456055, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8485053777694702, "num_tokens": 629790095.0, "step": 16508 }, { "epoch": 2.1001144892507315, "ewc_loss": 0.06724508106708527, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003233296738471836, "grad_norm": 7.839186191558838, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8811469078063965, "num_tokens": 629823035.0, "step": 16509 }, { "epoch": 2.100241699529322, "ewc_loss": 0.06725924462080002, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032347135129384696, "grad_norm": 7.8726487159729, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8616052865982056, "num_tokens": 629857096.0, "step": 16510 }, { "epoch": 2.1003689098079126, "ewc_loss": 0.0672166720032692, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032304562046192586, "grad_norm": 7.869745254516602, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8656033277511597, "num_tokens": 629893448.0, "step": 16511 }, { "epoch": 2.100496120086503, "ewc_loss": 0.06721776723861694, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032305659260600805, "grad_norm": 7.867432594299316, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8651754856109619, "num_tokens": 629931301.0, "step": 16512 }, { "epoch": 2.1006233303650936, "ewc_loss": 0.06718907505273819, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003227696579415351, "grad_norm": 7.832812309265137, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8744499683380127, "num_tokens": 629963481.0, "step": 16513 }, { "epoch": 2.100750540643684, "ewc_loss": 0.06721087545156479, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003229876747354865, "grad_norm": 7.812980651855469, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8628853559494019, "num_tokens": 630002739.0, "step": 16514 }, { "epoch": 2.1008777509222747, "ewc_loss": 0.06739912927150726, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003224287647753954, "grad_norm": 7.831155776977539, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8599889278411865, "num_tokens": 630041206.0, "step": 16515 }, { "epoch": 2.101004961200865, "ewc_loss": 0.06718192994594574, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003226981789339334, "grad_norm": 7.855047702789307, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.853165328502655, "num_tokens": 630083811.0, "step": 16516 }, { "epoch": 2.1011321714794557, "ewc_loss": 0.06710463762283325, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003219253267161548, "grad_norm": 7.829342365264893, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8649610280990601, "num_tokens": 630119823.0, "step": 16517 }, { "epoch": 2.1012593817580463, "ewc_loss": 0.06726104766130447, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003234893665648997, "grad_norm": 7.829428672790527, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8669992089271545, "num_tokens": 630163807.0, "step": 16518 }, { "epoch": 2.1013865920366364, "ewc_loss": 0.0671117752790451, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003219966311007738, "grad_norm": 7.850935935974121, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8605354428291321, "num_tokens": 630203038.0, "step": 16519 }, { "epoch": 2.101513802315227, "ewc_loss": 0.06717157363891602, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003225946275051683, "grad_norm": 7.809111595153809, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.870869517326355, "num_tokens": 630238505.0, "step": 16520 }, { "epoch": 2.1016410125938174, "ewc_loss": 0.06718967109918594, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032277562422677875, "grad_norm": 7.879410266876221, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8642899990081787, "num_tokens": 630275279.0, "step": 16521 }, { "epoch": 2.101768222872408, "ewc_loss": 0.06730823963880539, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003215199103578925, "grad_norm": 7.789745330810547, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8632578253746033, "num_tokens": 630317288.0, "step": 16522 }, { "epoch": 2.1018954331509985, "ewc_loss": 0.06730841100215912, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.000323963031405583, "grad_norm": 7.909843444824219, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.866436779499054, "num_tokens": 630352786.0, "step": 16523 }, { "epoch": 2.102022643429589, "ewc_loss": 0.06701850891113281, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032106402795761824, "grad_norm": 7.806345462799072, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8674800395965576, "num_tokens": 630391138.0, "step": 16524 }, { "epoch": 2.1021498537081795, "ewc_loss": 0.06733383238315582, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003242171951569617, "grad_norm": 7.882507801055908, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.853770911693573, "num_tokens": 630426567.0, "step": 16525 }, { "epoch": 2.10227706398677, "ewc_loss": 0.06714888662099838, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003223677631467581, "grad_norm": 7.882060527801514, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8554810881614685, "num_tokens": 630467308.0, "step": 16526 }, { "epoch": 2.1024042742653606, "ewc_loss": 0.06747019290924072, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003231394221074879, "grad_norm": 7.867120265960693, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8768036365509033, "num_tokens": 630503333.0, "step": 16527 }, { "epoch": 2.102531484543951, "ewc_loss": 0.0674089789390564, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000322527252137661, "grad_norm": 7.9640889167785645, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8625423908233643, "num_tokens": 630536838.0, "step": 16528 }, { "epoch": 2.1026586948225416, "ewc_loss": 0.06724540889263153, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032089155865833163, "grad_norm": 7.893294334411621, "learning_rate": 1e-06, "loss": 0.4186, "mean_token_accuracy": 0.8785345554351807, "num_tokens": 630570833.0, "step": 16529 }, { "epoch": 2.102785905101132, "ewc_loss": 0.06729939579963684, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003214314638171345, "grad_norm": 7.857868194580078, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8754317164421082, "num_tokens": 630615043.0, "step": 16530 }, { "epoch": 2.1029131153797227, "ewc_loss": 0.0672859400510788, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000321296916808933, "grad_norm": 7.8812408447265625, "learning_rate": 1e-06, "loss": 0.3941, "mean_token_accuracy": 0.888517439365387, "num_tokens": 630651368.0, "step": 16531 }, { "epoch": 2.1030403256583132, "ewc_loss": 0.06719723343849182, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003204098320566118, "grad_norm": 7.86208963394165, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8594145774841309, "num_tokens": 630690341.0, "step": 16532 }, { "epoch": 2.1031675359369038, "ewc_loss": 0.06731383502483368, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003215758188161999, "grad_norm": 7.864249229431152, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8626199960708618, "num_tokens": 630727168.0, "step": 16533 }, { "epoch": 2.1032947462154943, "ewc_loss": 0.06716849654912949, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000320122460834682, "grad_norm": 7.856268882751465, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8716349005699158, "num_tokens": 630766737.0, "step": 16534 }, { "epoch": 2.103421956494085, "ewc_loss": 0.06734755635261536, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000321913103107363, "grad_norm": 7.87766695022583, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8637200593948364, "num_tokens": 630803721.0, "step": 16535 }, { "epoch": 2.1035491667726753, "ewc_loss": 0.06715895235538483, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032002697116695344, "grad_norm": 7.7948994636535645, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.872279167175293, "num_tokens": 630843697.0, "step": 16536 }, { "epoch": 2.103676377051266, "ewc_loss": 0.06743216514587402, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003227591223549098, "grad_norm": 7.9998321533203125, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8674907684326172, "num_tokens": 630874325.0, "step": 16537 }, { "epoch": 2.1038035873298564, "ewc_loss": 0.06710497289896011, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003194872406311333, "grad_norm": 7.8387298583984375, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8633332848548889, "num_tokens": 630911456.0, "step": 16538 }, { "epoch": 2.103930797608447, "ewc_loss": 0.06739480048418045, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003223855164833367, "grad_norm": 7.898348808288574, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8658168315887451, "num_tokens": 630954418.0, "step": 16539 }, { "epoch": 2.1040580078870375, "ewc_loss": 0.06714296340942383, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00031986713293008506, "grad_norm": 7.847598075866699, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8660415410995483, "num_tokens": 630991826.0, "step": 16540 }, { "epoch": 2.104185218165628, "ewc_loss": 0.06732699275016785, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003217073972336948, "grad_norm": 7.943585395812988, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8678771257400513, "num_tokens": 631025831.0, "step": 16541 }, { "epoch": 2.1043124284442185, "ewc_loss": 0.06723187863826752, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003207563131581992, "grad_norm": 7.889109134674072, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8529072999954224, "num_tokens": 631062896.0, "step": 16542 }, { "epoch": 2.1044396387228086, "ewc_loss": 0.06757400929927826, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032173615181818604, "grad_norm": 7.941264629364014, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8723863959312439, "num_tokens": 631099291.0, "step": 16543 }, { "epoch": 2.104566849001399, "ewc_loss": 0.06730509549379349, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003214884491171688, "grad_norm": 7.903696060180664, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8573399186134338, "num_tokens": 631141570.0, "step": 16544 }, { "epoch": 2.1046940592799896, "ewc_loss": 0.06729564070701599, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032139394897967577, "grad_norm": 7.945271968841553, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8607606291770935, "num_tokens": 631178882.0, "step": 16545 }, { "epoch": 2.10482126955858, "ewc_loss": 0.06716383993625641, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032007586560212076, "grad_norm": 7.865214824676514, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8634523153305054, "num_tokens": 631218059.0, "step": 16546 }, { "epoch": 2.1049484798371707, "ewc_loss": 0.06757725775241852, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032176871900446713, "grad_norm": 7.94631814956665, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8688263893127441, "num_tokens": 631252827.0, "step": 16547 }, { "epoch": 2.1050756901157612, "ewc_loss": 0.06739836931228638, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00031997973565012217, "grad_norm": 7.857318878173828, "learning_rate": 1e-06, "loss": 0.4258, "mean_token_accuracy": 0.873466432094574, "num_tokens": 631294370.0, "step": 16548 }, { "epoch": 2.1052029003943518, "ewc_loss": 0.06753766536712646, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003213727322872728, "grad_norm": 7.8523688316345215, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8628949522972107, "num_tokens": 631331091.0, "step": 16549 }, { "epoch": 2.1053301106729423, "ewc_loss": 0.06745891273021698, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032058521173894405, "grad_norm": 7.8997802734375, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8695887923240662, "num_tokens": 631367285.0, "step": 16550 }, { "epoch": 2.105457320951533, "ewc_loss": 0.06744213402271271, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003204174863640219, "grad_norm": 7.856842517852783, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8540288209915161, "num_tokens": 631402471.0, "step": 16551 }, { "epoch": 2.1055845312301233, "ewc_loss": 0.06763563305139542, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003223524254281074, "grad_norm": 7.915882587432861, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.865544319152832, "num_tokens": 631439706.0, "step": 16552 }, { "epoch": 2.105711741508714, "ewc_loss": 0.06742087751626968, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000320204853778705, "grad_norm": 7.917214870452881, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.871624767780304, "num_tokens": 631471437.0, "step": 16553 }, { "epoch": 2.1058389517873044, "ewc_loss": 0.06733030080795288, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003217404882889241, "grad_norm": 7.908921241760254, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8629273772239685, "num_tokens": 631515028.0, "step": 16554 }, { "epoch": 2.105966162065895, "ewc_loss": 0.0672435387969017, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003208728739991784, "grad_norm": 7.916110992431641, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8622512221336365, "num_tokens": 631552946.0, "step": 16555 }, { "epoch": 2.1060933723444855, "ewc_loss": 0.06720198690891266, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003204573877155781, "grad_norm": 7.842891693115234, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8704589009284973, "num_tokens": 631596045.0, "step": 16556 }, { "epoch": 2.106220582623076, "ewc_loss": 0.06730666756629944, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003215041942894459, "grad_norm": 7.881868362426758, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8753829598426819, "num_tokens": 631630233.0, "step": 16557 }, { "epoch": 2.1063477929016665, "ewc_loss": 0.06719068437814713, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003203443484380841, "grad_norm": 7.8577189445495605, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.872075617313385, "num_tokens": 631666665.0, "step": 16558 }, { "epoch": 2.106475003180257, "ewc_loss": 0.06703051924705505, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003211841394659132, "grad_norm": 7.89356803894043, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8630521297454834, "num_tokens": 631709536.0, "step": 16559 }, { "epoch": 2.1066022134588476, "ewc_loss": 0.06719061732292175, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003203436208423227, "grad_norm": 7.926368236541748, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8809607625007629, "num_tokens": 631744333.0, "step": 16560 }, { "epoch": 2.106729423737438, "ewc_loss": 0.0668962299823761, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00031984123052097857, "grad_norm": 7.81758451461792, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8517611026763916, "num_tokens": 631783522.0, "step": 16561 }, { "epoch": 2.1068566340160286, "ewc_loss": 0.06708680093288422, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003217468620277941, "grad_norm": 7.882133483886719, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8501529693603516, "num_tokens": 631822579.0, "step": 16562 }, { "epoch": 2.106983844294619, "ewc_loss": 0.06721989810466766, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032063646358437836, "grad_norm": 7.810719013214111, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8744291663169861, "num_tokens": 631862642.0, "step": 16563 }, { "epoch": 2.1071110545732097, "ewc_loss": 0.06749892979860306, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003234267933294177, "grad_norm": 7.923372745513916, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.876834511756897, "num_tokens": 631903161.0, "step": 16564 }, { "epoch": 2.1072382648518, "ewc_loss": 0.06721659749746323, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003206034889444709, "grad_norm": 7.915097713470459, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8693800568580627, "num_tokens": 631938149.0, "step": 16565 }, { "epoch": 2.1073654751303907, "ewc_loss": 0.06742638349533081, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003227013803552836, "grad_norm": 7.893592357635498, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8673145771026611, "num_tokens": 631974717.0, "step": 16566 }, { "epoch": 2.107492685408981, "ewc_loss": 0.06730849295854568, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003215224132873118, "grad_norm": 7.918159484863281, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8712857961654663, "num_tokens": 632011147.0, "step": 16567 }, { "epoch": 2.1076198956875714, "ewc_loss": 0.06726451218128204, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003210826253052801, "grad_norm": 7.839197158813477, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8736441731452942, "num_tokens": 632051915.0, "step": 16568 }, { "epoch": 2.107747105966162, "ewc_loss": 0.06752848625183105, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032372234272770584, "grad_norm": 7.977027893066406, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8568394184112549, "num_tokens": 632083047.0, "step": 16569 }, { "epoch": 2.1078743162447524, "ewc_loss": 0.06716085970401764, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000320046121487394, "grad_norm": 7.7969207763671875, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8789708614349365, "num_tokens": 632119250.0, "step": 16570 }, { "epoch": 2.108001526523343, "ewc_loss": 0.06768901646137238, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003253276227042079, "grad_norm": 8.01452922821045, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8728941679000854, "num_tokens": 632152696.0, "step": 16571 }, { "epoch": 2.1081287368019335, "ewc_loss": 0.06716632843017578, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032010074937716126, "grad_norm": 7.8104987144470215, "learning_rate": 1e-06, "loss": 0.3943, "mean_token_accuracy": 0.8875484466552734, "num_tokens": 632188167.0, "step": 16572 }, { "epoch": 2.108255947080524, "ewc_loss": 0.06764452159404755, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003248827706556767, "grad_norm": 7.953550338745117, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8638410568237305, "num_tokens": 632229567.0, "step": 16573 }, { "epoch": 2.1083831573591145, "ewc_loss": 0.06718279421329498, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003202654479537159, "grad_norm": 7.797212600708008, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8774305582046509, "num_tokens": 632266870.0, "step": 16574 }, { "epoch": 2.108510367637705, "ewc_loss": 0.06760933995246887, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032453093444928527, "grad_norm": 7.985046863555908, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8668437600135803, "num_tokens": 632303844.0, "step": 16575 }, { "epoch": 2.1086375779162956, "ewc_loss": 0.06722739338874817, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003207114350516349, "grad_norm": 7.842493534088135, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8688966631889343, "num_tokens": 632339343.0, "step": 16576 }, { "epoch": 2.108764788194886, "ewc_loss": 0.06751352548599243, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003235727781429887, "grad_norm": 7.9071879386901855, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.870884895324707, "num_tokens": 632385516.0, "step": 16577 }, { "epoch": 2.1088919984734766, "ewc_loss": 0.06708985567092896, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032177750836126506, "grad_norm": 7.883447647094727, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8719068765640259, "num_tokens": 632420664.0, "step": 16578 }, { "epoch": 2.109019208752067, "ewc_loss": 0.06712590157985687, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032213793019764125, "grad_norm": 7.848145484924316, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.872470498085022, "num_tokens": 632458082.0, "step": 16579 }, { "epoch": 2.1091464190306577, "ewc_loss": 0.0675472766160965, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032391026616096497, "grad_norm": 7.90212869644165, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8552489280700684, "num_tokens": 632495963.0, "step": 16580 }, { "epoch": 2.109273629309248, "ewc_loss": 0.06741195917129517, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003225571126677096, "grad_norm": 7.895832061767578, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8684790134429932, "num_tokens": 632529968.0, "step": 16581 }, { "epoch": 2.1094008395878387, "ewc_loss": 0.06753266602754593, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032376416493207216, "grad_norm": 7.883811950683594, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8745239973068237, "num_tokens": 632567527.0, "step": 16582 }, { "epoch": 2.1095280498664293, "ewc_loss": 0.06742706149816513, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003227081033401191, "grad_norm": 7.848550319671631, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.866923451423645, "num_tokens": 632606192.0, "step": 16583 }, { "epoch": 2.10965526014502, "ewc_loss": 0.06751345098018646, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003235719632357359, "grad_norm": 7.916181564331055, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8780522346496582, "num_tokens": 632645940.0, "step": 16584 }, { "epoch": 2.1097824704236103, "ewc_loss": 0.0673525482416153, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003219629288651049, "grad_norm": 7.864471912384033, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8751357197761536, "num_tokens": 632689385.0, "step": 16585 }, { "epoch": 2.109909680702201, "ewc_loss": 0.06746359169483185, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003230733855161816, "grad_norm": 7.908732891082764, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8724435567855835, "num_tokens": 632720178.0, "step": 16586 }, { "epoch": 2.1100368909807914, "ewc_loss": 0.06739653646945953, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032240283326245844, "grad_norm": 7.905672073364258, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.859713613986969, "num_tokens": 632763249.0, "step": 16587 }, { "epoch": 2.110164101259382, "ewc_loss": 0.06742522120475769, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003226897388231009, "grad_norm": 7.892200469970703, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.84819096326828, "num_tokens": 632803293.0, "step": 16588 }, { "epoch": 2.1102913115379724, "ewc_loss": 0.06747579574584961, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003231954760849476, "grad_norm": 7.894119739532471, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8595273494720459, "num_tokens": 632838731.0, "step": 16589 }, { "epoch": 2.110418521816563, "ewc_loss": 0.06748206913471222, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032325813663192093, "grad_norm": 7.932638168334961, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8568910360336304, "num_tokens": 632875044.0, "step": 16590 }, { "epoch": 2.1105457320951535, "ewc_loss": 0.0675843209028244, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032428072881884873, "grad_norm": 7.861588001251221, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8464163541793823, "num_tokens": 632911330.0, "step": 16591 }, { "epoch": 2.1106729423737436, "ewc_loss": 0.06761905550956726, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032462808303534985, "grad_norm": 7.90093469619751, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8562384247779846, "num_tokens": 632952122.0, "step": 16592 }, { "epoch": 2.110800152652334, "ewc_loss": 0.06750041246414185, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003234416071791202, "grad_norm": 7.951786518096924, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8480061292648315, "num_tokens": 632985602.0, "step": 16593 }, { "epoch": 2.1109273629309246, "ewc_loss": 0.06766858696937561, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000322681968100369, "grad_norm": 7.809707164764404, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8675481081008911, "num_tokens": 633023792.0, "step": 16594 }, { "epoch": 2.111054573209515, "ewc_loss": 0.06753836572170258, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000323821121128276, "grad_norm": 7.931191921234131, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8686988353729248, "num_tokens": 633065457.0, "step": 16595 }, { "epoch": 2.1111817834881057, "ewc_loss": 0.06734215468168259, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003218590281903744, "grad_norm": 7.848764896392822, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8743188977241516, "num_tokens": 633101402.0, "step": 16596 }, { "epoch": 2.1113089937666962, "ewc_loss": 0.06754598766565323, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032389737316407263, "grad_norm": 7.8978095054626465, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8577364683151245, "num_tokens": 633147002.0, "step": 16597 }, { "epoch": 2.1114362040452868, "ewc_loss": 0.06755423545837402, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000321538420394063, "grad_norm": 7.802774906158447, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8661362528800964, "num_tokens": 633184209.0, "step": 16598 }, { "epoch": 2.1115634143238773, "ewc_loss": 0.06778916716575623, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003238877689000219, "grad_norm": 7.924389839172363, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8713017702102661, "num_tokens": 633224525.0, "step": 16599 }, { "epoch": 2.111690624602468, "ewc_loss": 0.0676644966006279, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032264107721857727, "grad_norm": 7.90403413772583, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8797265291213989, "num_tokens": 633253171.0, "step": 16600 }, { "epoch": 2.1118178348810583, "ewc_loss": 0.06743835657835007, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032282108440995216, "grad_norm": 7.863722324371338, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8645083904266357, "num_tokens": 633294326.0, "step": 16601 }, { "epoch": 2.111945045159649, "ewc_loss": 0.06769724190235138, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000322968524415046, "grad_norm": 7.87888765335083, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8815696239471436, "num_tokens": 633327236.0, "step": 16602 }, { "epoch": 2.1120722554382394, "ewc_loss": 0.06745576858520508, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003229951544199139, "grad_norm": 7.9130988121032715, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8770342469215393, "num_tokens": 633368593.0, "step": 16603 }, { "epoch": 2.11219946571683, "ewc_loss": 0.06744372099637985, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032287469366565347, "grad_norm": 8.029955863952637, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8469011783599854, "num_tokens": 633402598.0, "step": 16604 }, { "epoch": 2.1123266759954205, "ewc_loss": 0.0675492137670517, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032148821628652513, "grad_norm": 7.868075847625732, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8618226647377014, "num_tokens": 633437069.0, "step": 16605 }, { "epoch": 2.112453886274011, "ewc_loss": 0.06772877275943756, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032328383531421423, "grad_norm": 7.861779689788818, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8686517477035522, "num_tokens": 633479752.0, "step": 16606 }, { "epoch": 2.1125810965526015, "ewc_loss": 0.06764404475688934, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003224365646019578, "grad_norm": 7.8971757888793945, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8631637692451477, "num_tokens": 633516949.0, "step": 16607 }, { "epoch": 2.112708306831192, "ewc_loss": 0.06764793395996094, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003224753891117871, "grad_norm": 7.945725917816162, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8708468675613403, "num_tokens": 633547511.0, "step": 16608 }, { "epoch": 2.1128355171097826, "ewc_loss": 0.06766627728939056, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032265885965898633, "grad_norm": 7.897818088531494, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8677199482917786, "num_tokens": 633583940.0, "step": 16609 }, { "epoch": 2.112962727388373, "ewc_loss": 0.06767401099205017, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032273618853650987, "grad_norm": 7.84006929397583, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8555856943130493, "num_tokens": 633626245.0, "step": 16610 }, { "epoch": 2.1130899376669636, "ewc_loss": 0.06767189502716064, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003227150591555983, "grad_norm": 7.927094459533691, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8690950870513916, "num_tokens": 633662889.0, "step": 16611 }, { "epoch": 2.113217147945554, "ewc_loss": 0.06740528345108032, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032249034848064184, "grad_norm": 7.895625114440918, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8631676435470581, "num_tokens": 633701461.0, "step": 16612 }, { "epoch": 2.1133443582241447, "ewc_loss": 0.06751701235771179, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003236076154280454, "grad_norm": 7.960881233215332, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8680618405342102, "num_tokens": 633741302.0, "step": 16613 }, { "epoch": 2.113471568502735, "ewc_loss": 0.0672418475151062, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032085596467368305, "grad_norm": 7.789506435394287, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.871950089931488, "num_tokens": 633780337.0, "step": 16614 }, { "epoch": 2.1135987787813257, "ewc_loss": 0.06759534031152725, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032439088681712747, "grad_norm": 7.963346481323242, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8729950189590454, "num_tokens": 633814450.0, "step": 16615 }, { "epoch": 2.1137259890599163, "ewc_loss": 0.06721509993076324, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003205884713679552, "grad_norm": 7.790926933288574, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8595162630081177, "num_tokens": 633856252.0, "step": 16616 }, { "epoch": 2.1138531993385064, "ewc_loss": 0.06770551204681396, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003254926123190671, "grad_norm": 7.9725022315979, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8554626107215881, "num_tokens": 633895235.0, "step": 16617 }, { "epoch": 2.113980409617097, "ewc_loss": 0.0671379566192627, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00031981701613403857, "grad_norm": 7.824455261230469, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8690727949142456, "num_tokens": 633938948.0, "step": 16618 }, { "epoch": 2.1141076198956874, "ewc_loss": 0.0677173063158989, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032561057014390826, "grad_norm": 7.9640374183654785, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8750494122505188, "num_tokens": 633973393.0, "step": 16619 }, { "epoch": 2.114234830174278, "ewc_loss": 0.0671411007642746, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003198485355824232, "grad_norm": 7.814241886138916, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8725035190582275, "num_tokens": 634006714.0, "step": 16620 }, { "epoch": 2.1143620404528685, "ewc_loss": 0.06783134490251541, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003243095416110009, "grad_norm": 7.9026594161987305, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8727765679359436, "num_tokens": 634046104.0, "step": 16621 }, { "epoch": 2.114489250731459, "ewc_loss": 0.06725803017616272, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003210177819710225, "grad_norm": 7.841241836547852, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8598537445068359, "num_tokens": 634086401.0, "step": 16622 }, { "epoch": 2.1146164610100495, "ewc_loss": 0.06758461892604828, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003242836974095553, "grad_norm": 7.939680576324463, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8580969572067261, "num_tokens": 634125479.0, "step": 16623 }, { "epoch": 2.11474367128864, "ewc_loss": 0.067254438996315, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003209818387404084, "grad_norm": 7.878377437591553, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8476754426956177, "num_tokens": 634163873.0, "step": 16624 }, { "epoch": 2.1148708815672306, "ewc_loss": 0.06765986979007721, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032259474392049015, "grad_norm": 7.808627128601074, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8648651838302612, "num_tokens": 634200533.0, "step": 16625 }, { "epoch": 2.114998091845821, "ewc_loss": 0.06777995824813843, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003237956843804568, "grad_norm": 7.891316890716553, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8599240183830261, "num_tokens": 634235891.0, "step": 16626 }, { "epoch": 2.1151253021244116, "ewc_loss": 0.06736913323402405, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003221288206987083, "grad_norm": 7.915616989135742, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8665759563446045, "num_tokens": 634266748.0, "step": 16627 }, { "epoch": 2.115252512403002, "ewc_loss": 0.06744959950447083, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003229334542993456, "grad_norm": 7.864615440368652, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8535856008529663, "num_tokens": 634302603.0, "step": 16628 }, { "epoch": 2.1153797226815927, "ewc_loss": 0.06745073199272156, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032294486300088465, "grad_norm": 7.834959506988525, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8683086633682251, "num_tokens": 634345471.0, "step": 16629 }, { "epoch": 2.115506932960183, "ewc_loss": 0.0674951896071434, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003233893949072808, "grad_norm": 7.870700836181641, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8654742240905762, "num_tokens": 634383902.0, "step": 16630 }, { "epoch": 2.1156341432387737, "ewc_loss": 0.06740155071020126, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032245300826616585, "grad_norm": 7.886733531951904, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8577232360839844, "num_tokens": 634421082.0, "step": 16631 }, { "epoch": 2.1157613535173643, "ewc_loss": 0.06769244372844696, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003229205613024533, "grad_norm": 7.891274452209473, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8614335656166077, "num_tokens": 634456490.0, "step": 16632 }, { "epoch": 2.115888563795955, "ewc_loss": 0.06761284917593002, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032212460064329207, "grad_norm": 7.826412677764893, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8482699394226074, "num_tokens": 634498252.0, "step": 16633 }, { "epoch": 2.1160157740745453, "ewc_loss": 0.0677475854754448, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003234719333704561, "grad_norm": 7.92600679397583, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8753504157066345, "num_tokens": 634540769.0, "step": 16634 }, { "epoch": 2.116142984353136, "ewc_loss": 0.06762474775314331, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003222435188945383, "grad_norm": 7.865153789520264, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8737828731536865, "num_tokens": 634573807.0, "step": 16635 }, { "epoch": 2.1162701946317264, "ewc_loss": 0.067755326628685, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032354937866330147, "grad_norm": 7.8913445472717285, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8616316318511963, "num_tokens": 634609542.0, "step": 16636 }, { "epoch": 2.116397404910317, "ewc_loss": 0.06765151023864746, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003225112450309098, "grad_norm": 7.853775978088379, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8740965127944946, "num_tokens": 634648641.0, "step": 16637 }, { "epoch": 2.1165246151889074, "ewc_loss": 0.06778606027364731, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003238566860090941, "grad_norm": 7.895616054534912, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.864107608795166, "num_tokens": 634685531.0, "step": 16638 }, { "epoch": 2.116651825467498, "ewc_loss": 0.06768842041492462, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032288028160110116, "grad_norm": 7.953929424285889, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8696367740631104, "num_tokens": 634717239.0, "step": 16639 }, { "epoch": 2.116779035746088, "ewc_loss": 0.06753785908222198, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032137465314008296, "grad_norm": 7.841994762420654, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8639649152755737, "num_tokens": 634750058.0, "step": 16640 }, { "epoch": 2.1169062460246786, "ewc_loss": 0.06772781163454056, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032327420194633305, "grad_norm": 7.8286919593811035, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8595287203788757, "num_tokens": 634793480.0, "step": 16641 }, { "epoch": 2.117033456303269, "ewc_loss": 0.06768839061260223, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032287996145896614, "grad_norm": 7.877285957336426, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.856838583946228, "num_tokens": 634829923.0, "step": 16642 }, { "epoch": 2.1171606665818596, "ewc_loss": 0.06770285964012146, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000323024665703997, "grad_norm": 7.887475490570068, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.861019492149353, "num_tokens": 634868499.0, "step": 16643 }, { "epoch": 2.11728787686045, "ewc_loss": 0.06773021072149277, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032329821260645986, "grad_norm": 7.872119426727295, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8748916387557983, "num_tokens": 634905922.0, "step": 16644 }, { "epoch": 2.1174150871390407, "ewc_loss": 0.06774094700813293, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032340551842935383, "grad_norm": 7.865688323974609, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8679019808769226, "num_tokens": 634946190.0, "step": 16645 }, { "epoch": 2.1175422974176312, "ewc_loss": 0.06768582761287689, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003228544082958251, "grad_norm": 7.8569464683532715, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8777929544448853, "num_tokens": 634981840.0, "step": 16646 }, { "epoch": 2.1176695076962218, "ewc_loss": 0.06789329648017883, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032492910395376384, "grad_norm": 7.913207530975342, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8642364740371704, "num_tokens": 635024755.0, "step": 16647 }, { "epoch": 2.1177967179748123, "ewc_loss": 0.06773950159549713, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003233910829294473, "grad_norm": 7.837493419647217, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8703424334526062, "num_tokens": 635063076.0, "step": 16648 }, { "epoch": 2.117923928253403, "ewc_loss": 0.06784150749444962, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003244111721869558, "grad_norm": 7.868569850921631, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8781369924545288, "num_tokens": 635103052.0, "step": 16649 }, { "epoch": 2.1180511385319933, "ewc_loss": 0.06774307787418365, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032342690974473953, "grad_norm": 7.864180088043213, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8755313158035278, "num_tokens": 635140917.0, "step": 16650 }, { "epoch": 2.118178348810584, "ewc_loss": 0.06782971322536469, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003242931852582842, "grad_norm": 7.891088008880615, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8707937002182007, "num_tokens": 635182049.0, "step": 16651 }, { "epoch": 2.1183055590891744, "ewc_loss": 0.06776328384876251, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032362889032810926, "grad_norm": 7.929699897766113, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8604981899261475, "num_tokens": 635217613.0, "step": 16652 }, { "epoch": 2.118432769367765, "ewc_loss": 0.0677468478679657, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003234645409975201, "grad_norm": 7.872809410095215, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8765839338302612, "num_tokens": 635252716.0, "step": 16653 }, { "epoch": 2.1185599796463555, "ewc_loss": 0.06787319481372833, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003247280837967992, "grad_norm": 7.938718795776367, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.869773268699646, "num_tokens": 635288919.0, "step": 16654 }, { "epoch": 2.118687189924946, "ewc_loss": 0.06753955781459808, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032139162067323923, "grad_norm": 7.854316711425781, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8612807989120483, "num_tokens": 635329781.0, "step": 16655 }, { "epoch": 2.1188144002035365, "ewc_loss": 0.06779465079307556, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003239426005166024, "grad_norm": 7.935582637786865, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8746785521507263, "num_tokens": 635363844.0, "step": 16656 }, { "epoch": 2.118941610482127, "ewc_loss": 0.06751397997140884, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003211358853150159, "grad_norm": 7.852016925811768, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.862838089466095, "num_tokens": 635397568.0, "step": 16657 }, { "epoch": 2.1190688207607176, "ewc_loss": 0.06784141808748245, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032441026996821165, "grad_norm": 7.951094150543213, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8526897430419922, "num_tokens": 635432903.0, "step": 16658 }, { "epoch": 2.119196031039308, "ewc_loss": 0.06758664548397064, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003218625788576901, "grad_norm": 7.811821937561035, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8681524395942688, "num_tokens": 635473388.0, "step": 16659 }, { "epoch": 2.1193232413178986, "ewc_loss": 0.06782548129558563, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032425092649646103, "grad_norm": 7.8841094970703125, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8694290518760681, "num_tokens": 635512966.0, "step": 16660 }, { "epoch": 2.119450451596489, "ewc_loss": 0.06758850067853928, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032188111799769104, "grad_norm": 7.8876166343688965, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8522037267684937, "num_tokens": 635547814.0, "step": 16661 }, { "epoch": 2.1195776618750797, "ewc_loss": 0.06771869957447052, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000323183077853173, "grad_norm": 7.8339033126831055, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8574224710464478, "num_tokens": 635583120.0, "step": 16662 }, { "epoch": 2.11970487215367, "ewc_loss": 0.06779410690069199, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000323937158100307, "grad_norm": 7.853191375732422, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8606983423233032, "num_tokens": 635621531.0, "step": 16663 }, { "epoch": 2.1198320824322607, "ewc_loss": 0.06770332157611847, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003230293223168701, "grad_norm": 7.865052700042725, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8544977903366089, "num_tokens": 635659641.0, "step": 16664 }, { "epoch": 2.119959292710851, "ewc_loss": 0.06787049770355225, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003247010172344744, "grad_norm": 7.898318290710449, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8608813285827637, "num_tokens": 635697953.0, "step": 16665 }, { "epoch": 2.1200865029894413, "ewc_loss": 0.06766965985298157, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032269273651763797, "grad_norm": 7.842290878295898, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8419327735900879, "num_tokens": 635742519.0, "step": 16666 }, { "epoch": 2.120213713268032, "ewc_loss": 0.06785982102155685, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003245942934881896, "grad_norm": 7.87931489944458, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8732277154922485, "num_tokens": 635777786.0, "step": 16667 }, { "epoch": 2.1203409235466224, "ewc_loss": 0.06769649684429169, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032296107383444905, "grad_norm": 7.836560249328613, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8676536083221436, "num_tokens": 635814490.0, "step": 16668 }, { "epoch": 2.120468133825213, "ewc_loss": 0.06775090098381042, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003235050826333463, "grad_norm": 7.854973793029785, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8535507321357727, "num_tokens": 635858351.0, "step": 16669 }, { "epoch": 2.1205953441038035, "ewc_loss": 0.06790097802877426, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003250058798585087, "grad_norm": 7.913845539093018, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8637140989303589, "num_tokens": 635895585.0, "step": 16670 }, { "epoch": 2.120722554382394, "ewc_loss": 0.06776577234268188, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032365386141464114, "grad_norm": 7.848451614379883, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8735709190368652, "num_tokens": 635933591.0, "step": 16671 }, { "epoch": 2.1208497646609845, "ewc_loss": 0.06801917403936386, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003261878155171871, "grad_norm": 7.979720115661621, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8637956976890564, "num_tokens": 635972040.0, "step": 16672 }, { "epoch": 2.120976974939575, "ewc_loss": 0.06769511848688126, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032294727861881256, "grad_norm": 7.831207275390625, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8691264986991882, "num_tokens": 636013287.0, "step": 16673 }, { "epoch": 2.1211041852181656, "ewc_loss": 0.06804471462965012, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003264432307332754, "grad_norm": 7.946972846984863, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8627016544342041, "num_tokens": 636046137.0, "step": 16674 }, { "epoch": 2.121231395496756, "ewc_loss": 0.06763187050819397, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032231485238298774, "grad_norm": 7.862635135650635, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8559838533401489, "num_tokens": 636088870.0, "step": 16675 }, { "epoch": 2.1213586057753466, "ewc_loss": 0.06808270514011383, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032682312303222716, "grad_norm": 7.9810590744018555, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8757050633430481, "num_tokens": 636130519.0, "step": 16676 }, { "epoch": 2.121485816053937, "ewc_loss": 0.0676918476819992, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032291459501720965, "grad_norm": 7.889856815338135, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8700014352798462, "num_tokens": 636165641.0, "step": 16677 }, { "epoch": 2.1216130263325277, "ewc_loss": 0.06782737374305725, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003242698439862579, "grad_norm": 7.919830322265625, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8523730039596558, "num_tokens": 636203811.0, "step": 16678 }, { "epoch": 2.121740236611118, "ewc_loss": 0.0676787868142128, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032278394792228937, "grad_norm": 7.815017223358154, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8780696988105774, "num_tokens": 636239503.0, "step": 16679 }, { "epoch": 2.1218674468897087, "ewc_loss": 0.06790822744369507, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000325078348396346, "grad_norm": 7.923224925994873, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8701900243759155, "num_tokens": 636278994.0, "step": 16680 }, { "epoch": 2.1219946571682993, "ewc_loss": 0.06762479245662689, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032224401365965605, "grad_norm": 7.860848426818848, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8627344369888306, "num_tokens": 636318025.0, "step": 16681 }, { "epoch": 2.12212186744689, "ewc_loss": 0.06789201498031616, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032491624006070197, "grad_norm": 7.925200462341309, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.867835283279419, "num_tokens": 636357289.0, "step": 16682 }, { "epoch": 2.1222490777254803, "ewc_loss": 0.06766872107982635, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032268333598040044, "grad_norm": 7.844665050506592, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8511835336685181, "num_tokens": 636399067.0, "step": 16683 }, { "epoch": 2.122376288004071, "ewc_loss": 0.068004310131073, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003260392404627055, "grad_norm": 7.913206577301025, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8711228370666504, "num_tokens": 636442418.0, "step": 16684 }, { "epoch": 2.1225034982826614, "ewc_loss": 0.06769344955682755, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032293060212396085, "grad_norm": 7.852591514587402, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8551923632621765, "num_tokens": 636484273.0, "step": 16685 }, { "epoch": 2.122630708561252, "ewc_loss": 0.06795049458742142, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003255010233260691, "grad_norm": 7.9277801513671875, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.873404860496521, "num_tokens": 636525675.0, "step": 16686 }, { "epoch": 2.1227579188398424, "ewc_loss": 0.06769032031297684, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003228992864023894, "grad_norm": 7.900589466094971, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8643473386764526, "num_tokens": 636559652.0, "step": 16687 }, { "epoch": 2.122885129118433, "ewc_loss": 0.06787264347076416, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032472252496518195, "grad_norm": 7.944729804992676, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8860158324241638, "num_tokens": 636597900.0, "step": 16688 }, { "epoch": 2.1230123393970235, "ewc_loss": 0.06763224303722382, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032231854856945574, "grad_norm": 7.8752241134643555, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.867560088634491, "num_tokens": 636635822.0, "step": 16689 }, { "epoch": 2.1231395496756136, "ewc_loss": 0.06781911849975586, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032418727641925216, "grad_norm": 7.956766605377197, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8567296266555786, "num_tokens": 636667275.0, "step": 16690 }, { "epoch": 2.123266759954204, "ewc_loss": 0.06762571632862091, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032225329778157175, "grad_norm": 7.94340705871582, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8676542639732361, "num_tokens": 636704723.0, "step": 16691 }, { "epoch": 2.1233939702327946, "ewc_loss": 0.06754891574382782, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003239266516175121, "grad_norm": 7.941980838775635, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8706375360488892, "num_tokens": 636748209.0, "step": 16692 }, { "epoch": 2.123521180511385, "ewc_loss": 0.067384272813797, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003222802188247442, "grad_norm": 7.960067272186279, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8718259930610657, "num_tokens": 636785052.0, "step": 16693 }, { "epoch": 2.1236483907899757, "ewc_loss": 0.0675445944070816, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032144199940375984, "grad_norm": 7.898166179656982, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8793208599090576, "num_tokens": 636816073.0, "step": 16694 }, { "epoch": 2.123775601068566, "ewc_loss": 0.06749295443296432, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032336704316549003, "grad_norm": 7.9312872886657715, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8720684051513672, "num_tokens": 636848963.0, "step": 16695 }, { "epoch": 2.1239028113471567, "ewc_loss": 0.06731946766376495, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003216321929357946, "grad_norm": 7.901288032531738, "learning_rate": 1e-06, "loss": 0.4113, "mean_token_accuracy": 0.8781452178955078, "num_tokens": 636880632.0, "step": 16696 }, { "epoch": 2.1240300216257473, "ewc_loss": 0.06745399534702301, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003229774010833353, "grad_norm": 7.868856906890869, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8533234596252441, "num_tokens": 636923597.0, "step": 16697 }, { "epoch": 2.124157231904338, "ewc_loss": 0.06775756180286407, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032357173040509224, "grad_norm": 7.930508613586426, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8604050874710083, "num_tokens": 636962250.0, "step": 16698 }, { "epoch": 2.1242844421829283, "ewc_loss": 0.06765294820070267, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032252559321932495, "grad_norm": 7.988860607147217, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8745437860488892, "num_tokens": 636997973.0, "step": 16699 }, { "epoch": 2.124411652461519, "ewc_loss": 0.06764641404151917, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003224602260161191, "grad_norm": 7.856337547302246, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8537893295288086, "num_tokens": 637037320.0, "step": 16700 }, { "epoch": 2.1245388627401094, "ewc_loss": 0.06758973747491837, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003243348910473287, "grad_norm": 7.947549343109131, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8435083627700806, "num_tokens": 637072267.0, "step": 16701 }, { "epoch": 2.1246660730187, "ewc_loss": 0.06769756972789764, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003229717549402267, "grad_norm": 7.893482208251953, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8807225227355957, "num_tokens": 637108782.0, "step": 16702 }, { "epoch": 2.1247932832972904, "ewc_loss": 0.06752028316259384, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032364032813347876, "grad_norm": 7.900933265686035, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8655937910079956, "num_tokens": 637147368.0, "step": 16703 }, { "epoch": 2.124920493575881, "ewc_loss": 0.06773290038108826, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003233250754419714, "grad_norm": 7.8851213455200195, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8754047155380249, "num_tokens": 637181909.0, "step": 16704 }, { "epoch": 2.1250477038544715, "ewc_loss": 0.06753356754779816, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003237732162233442, "grad_norm": 7.912769317626953, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8654609322547913, "num_tokens": 637218498.0, "step": 16705 }, { "epoch": 2.125174914133062, "ewc_loss": 0.0675571858882904, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032400936470367014, "grad_norm": 8.071178436279297, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8682212829589844, "num_tokens": 637254825.0, "step": 16706 }, { "epoch": 2.1253021244116526, "ewc_loss": 0.06727778911590576, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003212153387721628, "grad_norm": 7.837703704833984, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8567566275596619, "num_tokens": 637301593.0, "step": 16707 }, { "epoch": 2.125429334690243, "ewc_loss": 0.06783071160316467, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032430319697596133, "grad_norm": 7.866131782531738, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8783727884292603, "num_tokens": 637341676.0, "step": 16708 }, { "epoch": 2.1255565449688336, "ewc_loss": 0.06768141686916351, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003228102286811918, "grad_norm": 7.883550643920898, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8705499768257141, "num_tokens": 637378840.0, "step": 16709 }, { "epoch": 2.125683755247424, "ewc_loss": 0.06772289425134659, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032322504557669163, "grad_norm": 7.887236595153809, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8493740558624268, "num_tokens": 637412711.0, "step": 16710 }, { "epoch": 2.1258109655260147, "ewc_loss": 0.06778484582901001, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003238445206079632, "grad_norm": 7.923396110534668, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8647062182426453, "num_tokens": 637448287.0, "step": 16711 }, { "epoch": 2.125938175804605, "ewc_loss": 0.06769846379756927, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003229807480238378, "grad_norm": 7.869265556335449, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8642836809158325, "num_tokens": 637487899.0, "step": 16712 }, { "epoch": 2.1260653860831957, "ewc_loss": 0.06787799298763275, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032477598870173097, "grad_norm": 7.894401550292969, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8703392148017883, "num_tokens": 637528161.0, "step": 16713 }, { "epoch": 2.1261925963617863, "ewc_loss": 0.06773418188095093, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003233378811273724, "grad_norm": 7.860979080200195, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8539118766784668, "num_tokens": 637565692.0, "step": 16714 }, { "epoch": 2.1263198066403763, "ewc_loss": 0.0678696557879448, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032469266443513334, "grad_norm": 7.917914390563965, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8547343015670776, "num_tokens": 637600368.0, "step": 16715 }, { "epoch": 2.126447016918967, "ewc_loss": 0.06768588721752167, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032285493216477334, "grad_norm": 7.838948726654053, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8632172346115112, "num_tokens": 637635732.0, "step": 16716 }, { "epoch": 2.1265742271975574, "ewc_loss": 0.06787362694740295, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003247323620598763, "grad_norm": 7.893587589263916, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8618353605270386, "num_tokens": 637677127.0, "step": 16717 }, { "epoch": 2.126701437476148, "ewc_loss": 0.06782082468271255, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003242043312638998, "grad_norm": 7.864065647125244, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8657211065292358, "num_tokens": 637715278.0, "step": 16718 }, { "epoch": 2.1268286477547385, "ewc_loss": 0.06780483573675156, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032404446392320096, "grad_norm": 7.890466690063477, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8700307607650757, "num_tokens": 637742937.0, "step": 16719 }, { "epoch": 2.126955858033329, "ewc_loss": 0.06783534586429596, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032434958848170936, "grad_norm": 7.875291347503662, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8695135712623596, "num_tokens": 637776761.0, "step": 16720 }, { "epoch": 2.1270830683119195, "ewc_loss": 0.06784073263406754, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032440340146422386, "grad_norm": 7.861162185668945, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8691130876541138, "num_tokens": 637812197.0, "step": 16721 }, { "epoch": 2.12721027859051, "ewc_loss": 0.06786523014307022, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032464839750900865, "grad_norm": 7.859503746032715, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8707833290100098, "num_tokens": 637850442.0, "step": 16722 }, { "epoch": 2.1273374888691006, "ewc_loss": 0.0678434669971466, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032443078816868365, "grad_norm": 7.901627063751221, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8677219152450562, "num_tokens": 637889352.0, "step": 16723 }, { "epoch": 2.127464699147691, "ewc_loss": 0.06773853302001953, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003233813913539052, "grad_norm": 7.886830806732178, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8473877310752869, "num_tokens": 637927195.0, "step": 16724 }, { "epoch": 2.1275919094262816, "ewc_loss": 0.06760558485984802, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032205189927481115, "grad_norm": 7.821948051452637, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8545244932174683, "num_tokens": 637969431.0, "step": 16725 }, { "epoch": 2.127719119704872, "ewc_loss": 0.06784021854400635, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032439830829389393, "grad_norm": 7.8488993644714355, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8825274109840393, "num_tokens": 638013910.0, "step": 16726 }, { "epoch": 2.1278463299834627, "ewc_loss": 0.06769970059394836, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003229931462556124, "grad_norm": 7.880644798278809, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8553155660629272, "num_tokens": 638051848.0, "step": 16727 }, { "epoch": 2.127973540262053, "ewc_loss": 0.06774634122848511, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003234595060348511, "grad_norm": 7.876623630523682, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8675602078437805, "num_tokens": 638092192.0, "step": 16728 }, { "epoch": 2.1281007505406437, "ewc_loss": 0.06770293414592743, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003230254806112498, "grad_norm": 7.8724799156188965, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8766529560089111, "num_tokens": 638134306.0, "step": 16729 }, { "epoch": 2.1282279608192343, "ewc_loss": 0.06771733611822128, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032316945726051927, "grad_norm": 7.910238265991211, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.852289080619812, "num_tokens": 638177988.0, "step": 16730 }, { "epoch": 2.128355171097825, "ewc_loss": 0.06766825169324875, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032267862115986645, "grad_norm": 7.9574761390686035, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8772848844528198, "num_tokens": 638212237.0, "step": 16731 }, { "epoch": 2.1284823813764153, "ewc_loss": 0.06755828112363815, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003215789038222283, "grad_norm": 7.859508037567139, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.85922771692276, "num_tokens": 638252492.0, "step": 16732 }, { "epoch": 2.128609591655006, "ewc_loss": 0.06762595474720001, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032225565519183874, "grad_norm": 7.849802494049072, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8654621839523315, "num_tokens": 638294308.0, "step": 16733 }, { "epoch": 2.1287368019335964, "ewc_loss": 0.06770020723342896, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003229981230106205, "grad_norm": 7.899264335632324, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8707027435302734, "num_tokens": 638330178.0, "step": 16734 }, { "epoch": 2.128864012212187, "ewc_loss": 0.0676683858036995, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032267995993606746, "grad_norm": 7.913440227508545, "learning_rate": 1e-06, "loss": 0.5265, "mean_token_accuracy": 0.8471267819404602, "num_tokens": 638372731.0, "step": 16735 }, { "epoch": 2.1289912224907774, "ewc_loss": 0.06760861724615097, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000322082283673808, "grad_norm": 7.846803188323975, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8631654977798462, "num_tokens": 638407522.0, "step": 16736 }, { "epoch": 2.129118432769368, "ewc_loss": 0.06771467626094818, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032314282725565135, "grad_norm": 7.927568435668945, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8684045076370239, "num_tokens": 638443262.0, "step": 16737 }, { "epoch": 2.129245643047958, "ewc_loss": 0.06766730546951294, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000322669162414968, "grad_norm": 7.834303379058838, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8665037155151367, "num_tokens": 638482729.0, "step": 16738 }, { "epoch": 2.129372853326549, "ewc_loss": 0.06781312078237534, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032412729342468083, "grad_norm": 7.931495189666748, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8738245368003845, "num_tokens": 638520034.0, "step": 16739 }, { "epoch": 2.129500063605139, "ewc_loss": 0.06766530871391296, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003226491389796138, "grad_norm": 7.833057403564453, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8737742304801941, "num_tokens": 638558631.0, "step": 16740 }, { "epoch": 2.1296272738837296, "ewc_loss": 0.06782442331314087, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003242403327021748, "grad_norm": 7.881396770477295, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8778212070465088, "num_tokens": 638593565.0, "step": 16741 }, { "epoch": 2.12975448416232, "ewc_loss": 0.06765105575323105, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003225066466256976, "grad_norm": 8.062335014343262, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8701764345169067, "num_tokens": 638625952.0, "step": 16742 }, { "epoch": 2.1298816944409107, "ewc_loss": 0.06747917830944061, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032078789081424475, "grad_norm": 7.784576416015625, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8636268377304077, "num_tokens": 638662000.0, "step": 16743 }, { "epoch": 2.130008904719501, "ewc_loss": 0.0680524930357933, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032652102527208626, "grad_norm": 7.934268474578857, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8530932664871216, "num_tokens": 638706020.0, "step": 16744 }, { "epoch": 2.1301361149980917, "ewc_loss": 0.06749770790338516, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003209731657989323, "grad_norm": 7.806053638458252, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8726420402526855, "num_tokens": 638745354.0, "step": 16745 }, { "epoch": 2.1302633252766823, "ewc_loss": 0.06816910207271576, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032768715755082667, "grad_norm": 7.989023685455322, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8610085844993591, "num_tokens": 638785497.0, "step": 16746 }, { "epoch": 2.130390535555273, "ewc_loss": 0.06755681335926056, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032156420638784766, "grad_norm": 7.867672920227051, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.846038281917572, "num_tokens": 638821650.0, "step": 16747 }, { "epoch": 2.1305177458338633, "ewc_loss": 0.06805245578289032, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003265206469222903, "grad_norm": 7.953568458557129, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8530325889587402, "num_tokens": 638855589.0, "step": 16748 }, { "epoch": 2.130644956112454, "ewc_loss": 0.06770741939544678, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032307024230249226, "grad_norm": 7.838918209075928, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8645840883255005, "num_tokens": 638895766.0, "step": 16749 }, { "epoch": 2.1307721663910444, "ewc_loss": 0.06805001944303513, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032649628701619804, "grad_norm": 7.98397970199585, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8568031787872314, "num_tokens": 638932622.0, "step": 16750 }, { "epoch": 2.130899376669635, "ewc_loss": 0.06766360998153687, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003226322296541184, "grad_norm": 7.836058139801025, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8556568622589111, "num_tokens": 638971588.0, "step": 16751 }, { "epoch": 2.1310265869482254, "ewc_loss": 0.06809684634208679, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003269645676482469, "grad_norm": 7.957400798797607, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8704205751419067, "num_tokens": 639009921.0, "step": 16752 }, { "epoch": 2.131153797226816, "ewc_loss": 0.0678061917424202, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032405799720436335, "grad_norm": 7.8597092628479, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8663424253463745, "num_tokens": 639048160.0, "step": 16753 }, { "epoch": 2.1312810075054065, "ewc_loss": 0.0680701807141304, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032669788924977183, "grad_norm": 8.041559219360352, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8782853484153748, "num_tokens": 639080260.0, "step": 16754 }, { "epoch": 2.131408217783997, "ewc_loss": 0.067678302526474, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000322779145790264, "grad_norm": 7.893894195556641, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8580772876739502, "num_tokens": 639118758.0, "step": 16755 }, { "epoch": 2.1315354280625876, "ewc_loss": 0.06792671978473663, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003252633032388985, "grad_norm": 7.980038166046143, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8660714030265808, "num_tokens": 639159528.0, "step": 16756 }, { "epoch": 2.131662638341178, "ewc_loss": 0.0677650049328804, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003236461488995701, "grad_norm": 7.87741231918335, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8702480792999268, "num_tokens": 639200394.0, "step": 16757 }, { "epoch": 2.1317898486197686, "ewc_loss": 0.06788511574268341, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003248472639825195, "grad_norm": 7.940790176391602, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8574678301811218, "num_tokens": 639240938.0, "step": 16758 }, { "epoch": 2.131917058898359, "ewc_loss": 0.06777617335319519, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003237578785046935, "grad_norm": 7.901548862457275, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8610504865646362, "num_tokens": 639278097.0, "step": 16759 }, { "epoch": 2.1320442691769497, "ewc_loss": 0.06780020892620087, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032399818883277476, "grad_norm": 7.826355457305908, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8667250871658325, "num_tokens": 639324512.0, "step": 16760 }, { "epoch": 2.13217147945554, "ewc_loss": 0.06789682060480118, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032496429048478603, "grad_norm": 7.914386749267578, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8708895444869995, "num_tokens": 639366833.0, "step": 16761 }, { "epoch": 2.1322986897341307, "ewc_loss": 0.06773616373538971, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003233577008359134, "grad_norm": 7.87639856338501, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8506551384925842, "num_tokens": 639405765.0, "step": 16762 }, { "epoch": 2.132425900012721, "ewc_loss": 0.06789153814315796, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032491146703250706, "grad_norm": 7.9374542236328125, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8615332841873169, "num_tokens": 639439622.0, "step": 16763 }, { "epoch": 2.1325531102913113, "ewc_loss": 0.0676572322845459, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032256837585009634, "grad_norm": 7.846911430358887, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.876632809638977, "num_tokens": 639479943.0, "step": 16764 }, { "epoch": 2.132680320569902, "ewc_loss": 0.06791085004806519, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003251046291552484, "grad_norm": 7.871769905090332, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8601921796798706, "num_tokens": 639520487.0, "step": 16765 }, { "epoch": 2.1328075308484924, "ewc_loss": 0.0677274540066719, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032327065127901733, "grad_norm": 7.896833419799805, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8645166158676147, "num_tokens": 639558997.0, "step": 16766 }, { "epoch": 2.132934741127083, "ewc_loss": 0.06785440444946289, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003245401312597096, "grad_norm": 7.87256383895874, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8653527498245239, "num_tokens": 639594569.0, "step": 16767 }, { "epoch": 2.1330619514056735, "ewc_loss": 0.06783518195152283, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032434792956337333, "grad_norm": 7.877350330352783, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8792506456375122, "num_tokens": 639629827.0, "step": 16768 }, { "epoch": 2.133189161684264, "ewc_loss": 0.06786935031414032, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032468955032527447, "grad_norm": 7.905661106109619, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8584598302841187, "num_tokens": 639667634.0, "step": 16769 }, { "epoch": 2.1333163719628545, "ewc_loss": 0.06790833175182343, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003250793961342424, "grad_norm": 7.961160659790039, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8633868098258972, "num_tokens": 639700536.0, "step": 16770 }, { "epoch": 2.133443582241445, "ewc_loss": 0.06779025495052338, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032389865373261273, "grad_norm": 7.9241437911987305, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8685226440429688, "num_tokens": 639744006.0, "step": 16771 }, { "epoch": 2.1335707925200356, "ewc_loss": 0.06777074933052063, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003237035416532308, "grad_norm": 7.856542110443115, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8720571994781494, "num_tokens": 639784950.0, "step": 16772 }, { "epoch": 2.133698002798626, "ewc_loss": 0.06817424297332764, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00032529712188988924, "grad_norm": 7.91774845123291, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8804349899291992, "num_tokens": 639823567.0, "step": 16773 }, { "epoch": 2.1338252130772166, "ewc_loss": 0.06781698763370514, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032416597241535783, "grad_norm": 7.851698398590088, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8767392635345459, "num_tokens": 639863386.0, "step": 16774 }, { "epoch": 2.133952423355807, "ewc_loss": 0.06798761337995529, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032587224268354475, "grad_norm": 7.952418804168701, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8551660776138306, "num_tokens": 639901219.0, "step": 16775 }, { "epoch": 2.1340796336343977, "ewc_loss": 0.06788169592618942, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032481306698173285, "grad_norm": 7.940810680389404, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8501534461975098, "num_tokens": 639935240.0, "step": 16776 }, { "epoch": 2.134206843912988, "ewc_loss": 0.06773410737514496, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032577855745330453, "grad_norm": 7.916964054107666, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8578261137008667, "num_tokens": 639975370.0, "step": 16777 }, { "epoch": 2.1343340541915787, "ewc_loss": 0.06766770780086517, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003251145826652646, "grad_norm": 7.860122203826904, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8463877439498901, "num_tokens": 640020463.0, "step": 16778 }, { "epoch": 2.1344612644701693, "ewc_loss": 0.0677623599767685, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032606112654320896, "grad_norm": 8.015437126159668, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.860393762588501, "num_tokens": 640057142.0, "step": 16779 }, { "epoch": 2.13458847474876, "ewc_loss": 0.06745734810829163, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032301092869602144, "grad_norm": 7.831450462341309, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8629366159439087, "num_tokens": 640098879.0, "step": 16780 }, { "epoch": 2.1347156850273503, "ewc_loss": 0.06812259554862976, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003272220492362976, "grad_norm": 7.8997039794921875, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8688735961914062, "num_tokens": 640136077.0, "step": 16781 }, { "epoch": 2.134842895305941, "ewc_loss": 0.06750070303678513, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032344451756216586, "grad_norm": 7.917296886444092, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8511173129081726, "num_tokens": 640175160.0, "step": 16782 }, { "epoch": 2.1349701055845314, "ewc_loss": 0.06776286661624908, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032606616150587797, "grad_norm": 7.872403621673584, "learning_rate": 1e-06, "loss": 0.3868, "mean_token_accuracy": 0.8901945352554321, "num_tokens": 640216670.0, "step": 16783 }, { "epoch": 2.135097315863122, "ewc_loss": 0.06773298978805542, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032576743979007006, "grad_norm": 7.94395112991333, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8623623251914978, "num_tokens": 640254872.0, "step": 16784 }, { "epoch": 2.1352245261417124, "ewc_loss": 0.06787377595901489, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032473381725139916, "grad_norm": 7.9378180503845215, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8636677861213684, "num_tokens": 640289319.0, "step": 16785 }, { "epoch": 2.135351736420303, "ewc_loss": 0.06791163980960846, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003251125162933022, "grad_norm": 7.856456279754639, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.8797838687896729, "num_tokens": 640323897.0, "step": 16786 }, { "epoch": 2.1354789466988935, "ewc_loss": 0.06809872388839722, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003269833396188915, "grad_norm": 8.029637336730957, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8723382353782654, "num_tokens": 640366203.0, "step": 16787 }, { "epoch": 2.1356061569774836, "ewc_loss": 0.06749974191188812, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003234349424019456, "grad_norm": 7.81348180770874, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8759071826934814, "num_tokens": 640408089.0, "step": 16788 }, { "epoch": 2.135733367256074, "ewc_loss": 0.06801174581050873, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032855497556738555, "grad_norm": 8.030007362365723, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8751329183578491, "num_tokens": 640443532.0, "step": 16789 }, { "epoch": 2.1358605775346646, "ewc_loss": 0.06742337346076965, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032267128699459136, "grad_norm": 7.8834967613220215, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8593087792396545, "num_tokens": 640480425.0, "step": 16790 }, { "epoch": 2.135987787813255, "ewc_loss": 0.06798885762691498, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003283260448370129, "grad_norm": 7.992193698883057, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8695825338363647, "num_tokens": 640525658.0, "step": 16791 }, { "epoch": 2.1361149980918457, "ewc_loss": 0.06779617071151733, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032395776361227036, "grad_norm": 7.89707088470459, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8748660087585449, "num_tokens": 640563276.0, "step": 16792 }, { "epoch": 2.136242208370436, "ewc_loss": 0.06780297309160233, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003264672413934022, "grad_norm": 7.942819595336914, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8583036661148071, "num_tokens": 640608150.0, "step": 16793 }, { "epoch": 2.1363694186490267, "ewc_loss": 0.06777544319629669, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032619197736494243, "grad_norm": 7.993539810180664, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8677853345870972, "num_tokens": 640646127.0, "step": 16794 }, { "epoch": 2.1364966289276173, "ewc_loss": 0.06780187785625458, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003240148944314569, "grad_norm": 7.949490070343018, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8719245195388794, "num_tokens": 640683195.0, "step": 16795 }, { "epoch": 2.136623839206208, "ewc_loss": 0.06776313483715057, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032606883905828, "grad_norm": 7.930061340332031, "learning_rate": 1e-06, "loss": 0.4114, "mean_token_accuracy": 0.8804761171340942, "num_tokens": 640724102.0, "step": 16796 }, { "epoch": 2.1367510494847983, "ewc_loss": 0.06789326667785645, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032492875470779836, "grad_norm": 7.997673511505127, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8621290922164917, "num_tokens": 640762624.0, "step": 16797 }, { "epoch": 2.136878259763389, "ewc_loss": 0.06754842400550842, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032392179127782583, "grad_norm": 7.915058612823486, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8608089089393616, "num_tokens": 640799437.0, "step": 16798 }, { "epoch": 2.1370054700419794, "ewc_loss": 0.06776441633701324, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032608170295134187, "grad_norm": 7.938249111175537, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8715980052947998, "num_tokens": 640837115.0, "step": 16799 }, { "epoch": 2.13713268032057, "ewc_loss": 0.06770168244838715, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032545431167818606, "grad_norm": 7.88891077041626, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8670340776443481, "num_tokens": 640882312.0, "step": 16800 }, { "epoch": 2.1372598905991604, "ewc_loss": 0.06776605546474457, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032609800109639764, "grad_norm": 7.958174705505371, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8657337427139282, "num_tokens": 640921032.0, "step": 16801 }, { "epoch": 2.137387100877751, "ewc_loss": 0.06770244240760803, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003254619368817657, "grad_norm": 7.935479164123535, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8678282499313354, "num_tokens": 640957424.0, "step": 16802 }, { "epoch": 2.1375143111563415, "ewc_loss": 0.0677071362733841, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003255088231526315, "grad_norm": 7.986138343811035, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8585624694824219, "num_tokens": 640992985.0, "step": 16803 }, { "epoch": 2.137641521434932, "ewc_loss": 0.06793098151683807, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003253058821428567, "grad_norm": 7.898706436157227, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8760790824890137, "num_tokens": 641030379.0, "step": 16804 }, { "epoch": 2.1377687317135226, "ewc_loss": 0.06785304844379425, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032696794369257987, "grad_norm": 7.955288887023926, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8577501773834229, "num_tokens": 641068526.0, "step": 16805 }, { "epoch": 2.137895941992113, "ewc_loss": 0.06762801110744476, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032471760641783476, "grad_norm": 7.932733058929443, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8579179048538208, "num_tokens": 641103351.0, "step": 16806 }, { "epoch": 2.1380231522707036, "ewc_loss": 0.06769321858882904, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003253697359468788, "grad_norm": 7.905165195465088, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8682270050048828, "num_tokens": 641143570.0, "step": 16807 }, { "epoch": 2.138150362549294, "ewc_loss": 0.0677720382809639, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003261578967794776, "grad_norm": 7.932394504547119, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8696061968803406, "num_tokens": 641180625.0, "step": 16808 }, { "epoch": 2.1382775728278847, "ewc_loss": 0.06784103810787201, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003244064864702523, "grad_norm": 7.876733779907227, "learning_rate": 1e-06, "loss": 0.5353, "mean_token_accuracy": 0.8445333242416382, "num_tokens": 641226770.0, "step": 16809 }, { "epoch": 2.138404783106475, "ewc_loss": 0.0680771917104721, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032676797127351165, "grad_norm": 7.964044094085693, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8782116174697876, "num_tokens": 641269682.0, "step": 16810 }, { "epoch": 2.1385319933850653, "ewc_loss": 0.06792198121547699, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003252159513067454, "grad_norm": 7.9446187019348145, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8777994513511658, "num_tokens": 641303803.0, "step": 16811 }, { "epoch": 2.1386592036636562, "ewc_loss": 0.068059042096138, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032658653799444437, "grad_norm": 7.924411773681641, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8607462644577026, "num_tokens": 641343141.0, "step": 16812 }, { "epoch": 2.1387864139422463, "ewc_loss": 0.06806447356939316, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032664084574207664, "grad_norm": 7.951316833496094, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8698651790618896, "num_tokens": 641384006.0, "step": 16813 }, { "epoch": 2.138913624220837, "ewc_loss": 0.06803975254297256, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003263936087023467, "grad_norm": 7.9124274253845215, "learning_rate": 1e-06, "loss": 0.5225, "mean_token_accuracy": 0.8442625999450684, "num_tokens": 641423998.0, "step": 16814 }, { "epoch": 2.1390408344994274, "ewc_loss": 0.06806878745555878, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032668394851498306, "grad_norm": 7.9980268478393555, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8651013970375061, "num_tokens": 641457886.0, "step": 16815 }, { "epoch": 2.139168044778018, "ewc_loss": 0.06768489629030228, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003252864698879421, "grad_norm": 7.909783840179443, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8666906356811523, "num_tokens": 641503386.0, "step": 16816 }, { "epoch": 2.1392952550566084, "ewc_loss": 0.06778265535831451, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032626400934532285, "grad_norm": 7.91921854019165, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8757914304733276, "num_tokens": 641539533.0, "step": 16817 }, { "epoch": 2.139422465335199, "ewc_loss": 0.06773962825536728, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032583376741968095, "grad_norm": 7.975072860717773, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8606209754943848, "num_tokens": 641574020.0, "step": 16818 }, { "epoch": 2.1395496756137895, "ewc_loss": 0.06763682514429092, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003248057619202882, "grad_norm": 8.019811630249023, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8699362277984619, "num_tokens": 641605206.0, "step": 16819 }, { "epoch": 2.13967688589238, "ewc_loss": 0.0677473247051239, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032591077615506947, "grad_norm": 7.886051654815674, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8561368584632874, "num_tokens": 641649857.0, "step": 16820 }, { "epoch": 2.1398040961709706, "ewc_loss": 0.06788262724876404, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032726378412917256, "grad_norm": 8.040358543395996, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.859764575958252, "num_tokens": 641683360.0, "step": 16821 }, { "epoch": 2.139931306449561, "ewc_loss": 0.0676112249493599, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003245497355237603, "grad_norm": 7.943904876708984, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.86623215675354, "num_tokens": 641720105.0, "step": 16822 }, { "epoch": 2.1400585167281516, "ewc_loss": 0.06785587221384048, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003269962326157838, "grad_norm": 8.024855613708496, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8688021898269653, "num_tokens": 641750718.0, "step": 16823 }, { "epoch": 2.140185727006742, "ewc_loss": 0.06765587627887726, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032499630469828844, "grad_norm": 7.909851551055908, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8641598224639893, "num_tokens": 641791620.0, "step": 16824 }, { "epoch": 2.1403129372853327, "ewc_loss": 0.06808008253574371, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003267969295848161, "grad_norm": 8.003591537475586, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8545792102813721, "num_tokens": 641833566.0, "step": 16825 }, { "epoch": 2.140440147563923, "ewc_loss": 0.06816565990447998, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003252113237977028, "grad_norm": 7.926568984985352, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.871993899345398, "num_tokens": 641869080.0, "step": 16826 }, { "epoch": 2.1405673578425137, "ewc_loss": 0.06805320084095001, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032652815571054816, "grad_norm": 7.982956886291504, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8629860877990723, "num_tokens": 641910195.0, "step": 16827 }, { "epoch": 2.1406945681211043, "ewc_loss": 0.06786344200372696, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003246305277571082, "grad_norm": 7.911584377288818, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8531707525253296, "num_tokens": 641947270.0, "step": 16828 }, { "epoch": 2.140821778399695, "ewc_loss": 0.06824301183223724, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032842616201378405, "grad_norm": 8.08824348449707, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8678281307220459, "num_tokens": 641986472.0, "step": 16829 }, { "epoch": 2.1409489886782853, "ewc_loss": 0.06806717813014984, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003242264792788774, "grad_norm": 7.863104343414307, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.853431224822998, "num_tokens": 642028130.0, "step": 16830 }, { "epoch": 2.141076198956876, "ewc_loss": 0.06826838105916977, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003286799183115363, "grad_norm": 8.003695487976074, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8609856963157654, "num_tokens": 642067097.0, "step": 16831 }, { "epoch": 2.1412034092354664, "ewc_loss": 0.0678146481513977, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032414254383184016, "grad_norm": 7.854019641876221, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8769497871398926, "num_tokens": 642099628.0, "step": 16832 }, { "epoch": 2.141330619514057, "ewc_loss": 0.06833495199680328, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003293456102255732, "grad_norm": 8.039942741394043, "learning_rate": 1e-06, "loss": 0.3842, "mean_token_accuracy": 0.8889707326889038, "num_tokens": 642135941.0, "step": 16833 }, { "epoch": 2.1414578297926474, "ewc_loss": 0.06779864430427551, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003239825600758195, "grad_norm": 7.884392738342285, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8586536645889282, "num_tokens": 642169465.0, "step": 16834 }, { "epoch": 2.141585040071238, "ewc_loss": 0.06824260205030441, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032842211658135056, "grad_norm": 8.142223358154297, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8682857751846313, "num_tokens": 642208961.0, "step": 16835 }, { "epoch": 2.141712250349828, "ewc_loss": 0.06795553863048553, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00032311008544638753, "grad_norm": 7.765829563140869, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8642166256904602, "num_tokens": 642249691.0, "step": 16836 }, { "epoch": 2.141839460628419, "ewc_loss": 0.06877311319112778, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003337272210046649, "grad_norm": 8.336727142333984, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8688331842422485, "num_tokens": 642292425.0, "step": 16837 }, { "epoch": 2.141966670907009, "ewc_loss": 0.06759551167488098, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003219511709176004, "grad_norm": 7.706340789794922, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8762227296829224, "num_tokens": 642330982.0, "step": 16838 }, { "epoch": 2.1420938811855996, "ewc_loss": 0.0691043883562088, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003370399645064026, "grad_norm": 8.231547355651855, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8663763999938965, "num_tokens": 642365414.0, "step": 16839 }, { "epoch": 2.14222109146419, "ewc_loss": 0.06774960458278656, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000323492189636454, "grad_norm": 7.833605766296387, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.859199047088623, "num_tokens": 642398660.0, "step": 16840 }, { "epoch": 2.1423483017427807, "ewc_loss": 0.06892135739326477, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033520962460897863, "grad_norm": 8.129837989807129, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8733651638031006, "num_tokens": 642432605.0, "step": 16841 }, { "epoch": 2.142475512021371, "ewc_loss": 0.0680018737912178, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003260148223489523, "grad_norm": 8.025378227233887, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8757888078689575, "num_tokens": 642472041.0, "step": 16842 }, { "epoch": 2.1426027222999617, "ewc_loss": 0.06851327419281006, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033112880191765726, "grad_norm": 8.110084533691406, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8742260336875916, "num_tokens": 642512399.0, "step": 16843 }, { "epoch": 2.1427299325785523, "ewc_loss": 0.06810308992862701, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032702702446840703, "grad_norm": 8.1989164352417, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8591276407241821, "num_tokens": 642550829.0, "step": 16844 }, { "epoch": 2.142857142857143, "ewc_loss": 0.06776313483715057, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032362743513658643, "grad_norm": 7.925727367401123, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8661525845527649, "num_tokens": 642591255.0, "step": 16845 }, { "epoch": 2.1429843531357333, "ewc_loss": 0.06801725924015045, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003261687234044075, "grad_norm": 7.940999984741211, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8670667409896851, "num_tokens": 642625772.0, "step": 16846 }, { "epoch": 2.143111563414324, "ewc_loss": 0.06787792593240738, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003247753484174609, "grad_norm": 7.937224388122559, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.858984112739563, "num_tokens": 642667596.0, "step": 16847 }, { "epoch": 2.1432387736929144, "ewc_loss": 0.06804527342319489, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000326448876876384, "grad_norm": 7.905887603759766, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8675049543380737, "num_tokens": 642702159.0, "step": 16848 }, { "epoch": 2.143365983971505, "ewc_loss": 0.06802096962928772, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003262057725805789, "grad_norm": 7.903190612792969, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.880082368850708, "num_tokens": 642737307.0, "step": 16849 }, { "epoch": 2.1434931942500954, "ewc_loss": 0.06816369295120239, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032763302442617714, "grad_norm": 7.966671466827393, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8561987280845642, "num_tokens": 642776660.0, "step": 16850 }, { "epoch": 2.143620404528686, "ewc_loss": 0.06809871643781662, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003269832523074001, "grad_norm": 7.924402713775635, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8594785332679749, "num_tokens": 642814131.0, "step": 16851 }, { "epoch": 2.1437476148072765, "ewc_loss": 0.0681910440325737, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032790654222480953, "grad_norm": 7.943729877471924, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.8469393849372864, "num_tokens": 642854138.0, "step": 16852 }, { "epoch": 2.143874825085867, "ewc_loss": 0.06808128952980042, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003268090367782861, "grad_norm": 7.854914665222168, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8723021149635315, "num_tokens": 642893767.0, "step": 16853 }, { "epoch": 2.1440020353644575, "ewc_loss": 0.06826320290565491, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003286281134933233, "grad_norm": 7.9565582275390625, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.8813372254371643, "num_tokens": 642928286.0, "step": 16854 }, { "epoch": 2.144129245643048, "ewc_loss": 0.06802266836166382, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003262227401137352, "grad_norm": 7.906976699829102, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8737151622772217, "num_tokens": 642971220.0, "step": 16855 }, { "epoch": 2.1442564559216386, "ewc_loss": 0.06826423108577728, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003286383580416441, "grad_norm": 7.941531658172607, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8701021075248718, "num_tokens": 643013618.0, "step": 16856 }, { "epoch": 2.144383666200229, "ewc_loss": 0.06810766458511353, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000327072775689885, "grad_norm": 7.850844383239746, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8683133125305176, "num_tokens": 643058122.0, "step": 16857 }, { "epoch": 2.1445108764788197, "ewc_loss": 0.06829418241977692, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003289379528723657, "grad_norm": 7.9896745681762695, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.854325532913208, "num_tokens": 643096662.0, "step": 16858 }, { "epoch": 2.14463808675741, "ewc_loss": 0.06813737750053406, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032736986759118736, "grad_norm": 7.881997585296631, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8790314197540283, "num_tokens": 643137923.0, "step": 16859 }, { "epoch": 2.1447652970360007, "ewc_loss": 0.06832654774188995, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032926155836321414, "grad_norm": 7.967984676361084, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8721456527709961, "num_tokens": 643176918.0, "step": 16860 }, { "epoch": 2.144892507314591, "ewc_loss": 0.06821160018444061, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003281121316831559, "grad_norm": 7.88381814956665, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.884583055973053, "num_tokens": 643217525.0, "step": 16861 }, { "epoch": 2.1450197175931813, "ewc_loss": 0.06833820044994354, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003293781483080238, "grad_norm": 7.937376022338867, "learning_rate": 1e-06, "loss": 0.4016, "mean_token_accuracy": 0.883420467376709, "num_tokens": 643255131.0, "step": 16862 }, { "epoch": 2.145146927871772, "ewc_loss": 0.06819824874401093, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003279785451013595, "grad_norm": 7.9635233879089355, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8755325078964233, "num_tokens": 643292092.0, "step": 16863 }, { "epoch": 2.1452741381503624, "ewc_loss": 0.06828843057155609, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032888035639189184, "grad_norm": 7.950689315795898, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8626374006271362, "num_tokens": 643330510.0, "step": 16864 }, { "epoch": 2.145401348428953, "ewc_loss": 0.06827928125858307, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032878885394893587, "grad_norm": 7.932928085327148, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.86130690574646, "num_tokens": 643372900.0, "step": 16865 }, { "epoch": 2.1455285587075434, "ewc_loss": 0.06792937964200974, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032773130806162953, "grad_norm": 7.947514057159424, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8746311068534851, "num_tokens": 643412866.0, "step": 16866 }, { "epoch": 2.145655768986134, "ewc_loss": 0.0679420679807663, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032785822986625135, "grad_norm": 7.9068603515625, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.85035240650177, "num_tokens": 643452846.0, "step": 16867 }, { "epoch": 2.1457829792647245, "ewc_loss": 0.0682687759399414, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003286838182248175, "grad_norm": 8.002830505371094, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8725115060806274, "num_tokens": 643488264.0, "step": 16868 }, { "epoch": 2.145910189543315, "ewc_loss": 0.06772977113723755, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000325735192745924, "grad_norm": 7.933476448059082, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8541064858436584, "num_tokens": 643528131.0, "step": 16869 }, { "epoch": 2.1460373998219056, "ewc_loss": 0.06815122067928314, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003299497184343636, "grad_norm": 8.007433891296387, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8664359450340271, "num_tokens": 643564041.0, "step": 16870 }, { "epoch": 2.146164610100496, "ewc_loss": 0.0678175836801529, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032661331351846457, "grad_norm": 7.960705280303955, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8548441529273987, "num_tokens": 643601507.0, "step": 16871 }, { "epoch": 2.1462918203790866, "ewc_loss": 0.06799681484699249, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003284056729171425, "grad_norm": 7.8990159034729, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8516035079956055, "num_tokens": 643647161.0, "step": 16872 }, { "epoch": 2.146419030657677, "ewc_loss": 0.06802015006542206, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003286390274297446, "grad_norm": 7.984642028808594, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8724610209465027, "num_tokens": 643687114.0, "step": 16873 }, { "epoch": 2.1465462409362677, "ewc_loss": 0.06777950376272202, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003262325481045991, "grad_norm": 7.924602508544922, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8521928191184998, "num_tokens": 643722110.0, "step": 16874 }, { "epoch": 2.146673451214858, "ewc_loss": 0.06796059012413025, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032804341753944755, "grad_norm": 7.938805103302002, "learning_rate": 1e-06, "loss": 0.5393, "mean_token_accuracy": 0.8438549637794495, "num_tokens": 643765770.0, "step": 16875 }, { "epoch": 2.1468006614934487, "ewc_loss": 0.0679592564702034, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003280300588812679, "grad_norm": 7.953981876373291, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8573963642120361, "num_tokens": 643810097.0, "step": 16876 }, { "epoch": 2.1469278717720393, "ewc_loss": 0.06788580864667892, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003272955946158618, "grad_norm": 8.002018928527832, "learning_rate": 1e-06, "loss": 0.5338, "mean_token_accuracy": 0.8396930694580078, "num_tokens": 643847904.0, "step": 16877 }, { "epoch": 2.14705508205063, "ewc_loss": 0.06822022795677185, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032819839543662965, "grad_norm": 10.763333320617676, "learning_rate": 1e-06, "loss": 0.4279, "mean_token_accuracy": 0.8735382556915283, "num_tokens": 643889157.0, "step": 16878 }, { "epoch": 2.1471822923292203, "ewc_loss": 0.06980187445878983, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00034401484299451113, "grad_norm": 7.954680919647217, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8701677918434143, "num_tokens": 643931181.0, "step": 16879 }, { "epoch": 2.147309502607811, "ewc_loss": 0.06979265809059143, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003463640750851482, "grad_norm": 8.508257865905762, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8648957014083862, "num_tokens": 643965415.0, "step": 16880 }, { "epoch": 2.1474367128864014, "ewc_loss": 0.06773485988378525, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003257860953453928, "grad_norm": 7.852044105529785, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8517439961433411, "num_tokens": 644010647.0, "step": 16881 }, { "epoch": 2.147563923164992, "ewc_loss": 0.07021729648113251, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00035061041126027703, "grad_norm": 8.508606910705566, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8824615478515625, "num_tokens": 644044347.0, "step": 16882 }, { "epoch": 2.1476911334435824, "ewc_loss": 0.06796576082706451, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003280951059423387, "grad_norm": 7.978279113769531, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8739208579063416, "num_tokens": 644083456.0, "step": 16883 }, { "epoch": 2.147818343722173, "ewc_loss": 0.06934581696987152, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000341895705787465, "grad_norm": 8.291760444641113, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8578125238418579, "num_tokens": 644123851.0, "step": 16884 }, { "epoch": 2.1479455540007635, "ewc_loss": 0.06806056201457977, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032904313411563635, "grad_norm": 8.006800651550293, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8769856691360474, "num_tokens": 644168880.0, "step": 16885 }, { "epoch": 2.1480727642793536, "ewc_loss": 0.06886174529790878, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033705495297908783, "grad_norm": 8.23507308959961, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8677074909210205, "num_tokens": 644204532.0, "step": 16886 }, { "epoch": 2.148199974557944, "ewc_loss": 0.06812776625156403, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003297151706647128, "grad_norm": 7.957614421844482, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.8845915794372559, "num_tokens": 644246160.0, "step": 16887 }, { "epoch": 2.1483271848365346, "ewc_loss": 0.06853094696998596, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033374695340171456, "grad_norm": 8.117016792297363, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8790968060493469, "num_tokens": 644284881.0, "step": 16888 }, { "epoch": 2.148454395115125, "ewc_loss": 0.06788887828588486, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032976767397485673, "grad_norm": 8.052584648132324, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8596789240837097, "num_tokens": 644326661.0, "step": 16889 }, { "epoch": 2.1485816053937157, "ewc_loss": 0.06793861091136932, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003302650002297014, "grad_norm": 8.047304153442383, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.871246337890625, "num_tokens": 644364420.0, "step": 16890 }, { "epoch": 2.148708815672306, "ewc_loss": 0.06791386008262634, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003300175303593278, "grad_norm": 8.022927284240723, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8495677709579468, "num_tokens": 644398626.0, "step": 16891 }, { "epoch": 2.1488360259508967, "ewc_loss": 0.06783230602741241, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032920201192609966, "grad_norm": 8.040061950683594, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8662199974060059, "num_tokens": 644434772.0, "step": 16892 }, { "epoch": 2.1489632362294873, "ewc_loss": 0.06799592077732086, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032839670893736184, "grad_norm": 8.010796546936035, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8629844784736633, "num_tokens": 644474425.0, "step": 16893 }, { "epoch": 2.149090446508078, "ewc_loss": 0.06805220991373062, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003289596061222255, "grad_norm": 8.047447204589844, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8569461703300476, "num_tokens": 644507796.0, "step": 16894 }, { "epoch": 2.1492176567866683, "ewc_loss": 0.06766489148139954, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003275277849752456, "grad_norm": 8.003684997558594, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8602535128593445, "num_tokens": 644544172.0, "step": 16895 }, { "epoch": 2.149344867065259, "ewc_loss": 0.06805291771888733, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003289666783530265, "grad_norm": 7.955964088439941, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.864687442779541, "num_tokens": 644582217.0, "step": 16896 }, { "epoch": 2.1494720773438494, "ewc_loss": 0.06814561039209366, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000329893606249243, "grad_norm": 7.975370407104492, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8729426860809326, "num_tokens": 644620028.0, "step": 16897 }, { "epoch": 2.14959928762244, "ewc_loss": 0.06777764111757278, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032865532557480037, "grad_norm": 8.022027969360352, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8502250909805298, "num_tokens": 644663743.0, "step": 16898 }, { "epoch": 2.1497264979010304, "ewc_loss": 0.06777188181877136, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003285977290943265, "grad_norm": 7.924840450286865, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8683032989501953, "num_tokens": 644700914.0, "step": 16899 }, { "epoch": 2.149853708179621, "ewc_loss": 0.06787866353988647, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032966554863378406, "grad_norm": 7.962165832519531, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8619414567947388, "num_tokens": 644744232.0, "step": 16900 }, { "epoch": 2.1499809184582115, "ewc_loss": 0.06779550015926361, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.00032883393578231335, "grad_norm": 10.733373641967773, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8583697080612183, "num_tokens": 644780851.0, "step": 16901 }, { "epoch": 2.150108128736802, "ewc_loss": 0.06940706074237823, "ewc_loss_diag": 3.4809112548828125e-05, "ewc_loss_parallel": 0.0003449494834057987, "grad_norm": 7.958227157592773, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8746960163116455, "num_tokens": 644822103.0, "step": 16902 }, { "epoch": 2.1502353390153925, "ewc_loss": 0.06996888667345047, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00034812637022696435, "grad_norm": 8.467939376831055, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8636782169342041, "num_tokens": 644861209.0, "step": 16903 }, { "epoch": 2.150362549293983, "ewc_loss": 0.06787189841270447, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032715650741010904, "grad_norm": 7.886452674865723, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8528299331665039, "num_tokens": 644899860.0, "step": 16904 }, { "epoch": 2.1504897595725736, "ewc_loss": 0.07009431719779968, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003493807162158191, "grad_norm": 8.40500259399414, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8608320951461792, "num_tokens": 644943101.0, "step": 16905 }, { "epoch": 2.150616969851164, "ewc_loss": 0.06824542582035065, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033089175121858716, "grad_norm": 7.999416351318359, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8547989726066589, "num_tokens": 644981786.0, "step": 16906 }, { "epoch": 2.1507441801297547, "ewc_loss": 0.06919622421264648, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00034039971069432795, "grad_norm": 8.181686401367188, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8685951232910156, "num_tokens": 645017949.0, "step": 16907 }, { "epoch": 2.150871390408345, "ewc_loss": 0.06835853308439255, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003320228133816272, "grad_norm": 8.113715171813965, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8650102615356445, "num_tokens": 645053443.0, "step": 16908 }, { "epoch": 2.1509986006869353, "ewc_loss": 0.06861457973718643, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033458328107371926, "grad_norm": 8.104058265686035, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8674824237823486, "num_tokens": 645089916.0, "step": 16909 }, { "epoch": 2.1511258109655262, "ewc_loss": 0.06833165884017944, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033175412681885064, "grad_norm": 8.094103813171387, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.849389910697937, "num_tokens": 645134531.0, "step": 16910 }, { "epoch": 2.1512530212441163, "ewc_loss": 0.06829477846622467, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000331385264871642, "grad_norm": 8.061080932617188, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.859447717666626, "num_tokens": 645172608.0, "step": 16911 }, { "epoch": 2.151380231522707, "ewc_loss": 0.06825150549411774, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003309525491204113, "grad_norm": 8.021100997924805, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8616126775741577, "num_tokens": 645209895.0, "step": 16912 }, { "epoch": 2.1515074418012974, "ewc_loss": 0.06817573308944702, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033019480179063976, "grad_norm": 8.005661010742188, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8719775676727295, "num_tokens": 645245259.0, "step": 16913 }, { "epoch": 2.151634652079888, "ewc_loss": 0.06822380423545837, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003306755388621241, "grad_norm": 8.053411483764648, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8613995909690857, "num_tokens": 645284962.0, "step": 16914 }, { "epoch": 2.1517618623584784, "ewc_loss": 0.06818769872188568, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003303145058453083, "grad_norm": 7.9512457847595215, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8802498579025269, "num_tokens": 645322352.0, "step": 16915 }, { "epoch": 2.151889072637069, "ewc_loss": 0.06825891137123108, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033102661836892366, "grad_norm": 8.032597541809082, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8648606538772583, "num_tokens": 645358401.0, "step": 16916 }, { "epoch": 2.1520162829156595, "ewc_loss": 0.0680546686053276, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000328984169755131, "grad_norm": 7.942483901977539, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.878089964389801, "num_tokens": 645399147.0, "step": 16917 }, { "epoch": 2.15214349319425, "ewc_loss": 0.06824889779090881, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033092653029598296, "grad_norm": 8.015233039855957, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8478187918663025, "num_tokens": 645437286.0, "step": 16918 }, { "epoch": 2.1522707034728406, "ewc_loss": 0.06801068782806396, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032854441087692976, "grad_norm": 7.962700843811035, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8583858013153076, "num_tokens": 645477853.0, "step": 16919 }, { "epoch": 2.152397913751431, "ewc_loss": 0.06823104619979858, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033074794919230044, "grad_norm": 8.013312339782715, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8564514517784119, "num_tokens": 645520476.0, "step": 16920 }, { "epoch": 2.1525251240300216, "ewc_loss": 0.06805595755577087, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003289970336481929, "grad_norm": 7.975786209106445, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8503004908561707, "num_tokens": 645561539.0, "step": 16921 }, { "epoch": 2.152652334308612, "ewc_loss": 0.06844004988670349, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003303966368548572, "grad_norm": 7.939836025238037, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8613173961639404, "num_tokens": 645602950.0, "step": 16922 }, { "epoch": 2.1527795445872027, "ewc_loss": 0.0682208240032196, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033064576564356685, "grad_norm": 8.081295013427734, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8555986881256104, "num_tokens": 645639215.0, "step": 16923 }, { "epoch": 2.152906754865793, "ewc_loss": 0.06792590022087097, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003276964998804033, "grad_norm": 7.865181922912598, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8645020723342896, "num_tokens": 645677287.0, "step": 16924 }, { "epoch": 2.1530339651443837, "ewc_loss": 0.06841087341308594, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003325462166685611, "grad_norm": 8.049344062805176, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8732593059539795, "num_tokens": 645710784.0, "step": 16925 }, { "epoch": 2.1531611754229742, "ewc_loss": 0.06815791875123978, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000327575282426551, "grad_norm": 7.9403815269470215, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8618748188018799, "num_tokens": 645750568.0, "step": 16926 }, { "epoch": 2.1532883857015648, "ewc_loss": 0.06858067214488983, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003318027884233743, "grad_norm": 8.045022010803223, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8699489235877991, "num_tokens": 645783911.0, "step": 16927 }, { "epoch": 2.1534155959801553, "ewc_loss": 0.0678737536072731, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003271750174462795, "grad_norm": 7.895853042602539, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8817265033721924, "num_tokens": 645822524.0, "step": 16928 }, { "epoch": 2.153542806258746, "ewc_loss": 0.06838549673557281, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003322924312669784, "grad_norm": 8.060966491699219, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8607488870620728, "num_tokens": 645856924.0, "step": 16929 }, { "epoch": 2.1536700165373364, "ewc_loss": 0.06818057596683502, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032780185574665666, "grad_norm": 7.94279670715332, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8578227758407593, "num_tokens": 645896995.0, "step": 16930 }, { "epoch": 2.153797226815927, "ewc_loss": 0.06829476356506348, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033138514845632017, "grad_norm": 8.030319213867188, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8723971247673035, "num_tokens": 645941768.0, "step": 16931 }, { "epoch": 2.1539244370945174, "ewc_loss": 0.06808025389909744, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032924002152867615, "grad_norm": 7.95216178894043, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8630449771881104, "num_tokens": 645981909.0, "step": 16932 }, { "epoch": 2.154051647373108, "ewc_loss": 0.0681571513414383, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003300090611446649, "grad_norm": 7.990965843200684, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8557108640670776, "num_tokens": 646022034.0, "step": 16933 }, { "epoch": 2.154178857651698, "ewc_loss": 0.06822118163108826, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003306492872070521, "grad_norm": 8.02402114868164, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8641278743743896, "num_tokens": 646056714.0, "step": 16934 }, { "epoch": 2.1543060679302886, "ewc_loss": 0.06805351376533508, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003289726737421006, "grad_norm": 8.001137733459473, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8563599586486816, "num_tokens": 646095668.0, "step": 16935 }, { "epoch": 2.154433278208879, "ewc_loss": 0.06811332702636719, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000329570728354156, "grad_norm": 8.037454605102539, "learning_rate": 1e-06, "loss": 0.5168, "mean_token_accuracy": 0.8492568731307983, "num_tokens": 646136745.0, "step": 16936 }, { "epoch": 2.1545604884874696, "ewc_loss": 0.0682506114244461, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003285022103227675, "grad_norm": 7.913374900817871, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8611456155776978, "num_tokens": 646180314.0, "step": 16937 }, { "epoch": 2.15468769876606, "ewc_loss": 0.06824187189340591, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003308562154415995, "grad_norm": 8.008987426757812, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8676353693008423, "num_tokens": 646218191.0, "step": 16938 }, { "epoch": 2.1548149090446507, "ewc_loss": 0.06824225187301636, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003284185950178653, "grad_norm": 7.967529296875, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8808556795120239, "num_tokens": 646256350.0, "step": 16939 }, { "epoch": 2.154942119323241, "ewc_loss": 0.0684821754693985, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033081788569688797, "grad_norm": 7.989222526550293, "learning_rate": 1e-06, "loss": 0.3782, "mean_token_accuracy": 0.887978732585907, "num_tokens": 646290915.0, "step": 16940 }, { "epoch": 2.1550693296018317, "ewc_loss": 0.06839863210916519, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003299824020359665, "grad_norm": 8.00658893585205, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8587282299995422, "num_tokens": 646325838.0, "step": 16941 }, { "epoch": 2.1551965398804223, "ewc_loss": 0.0684196949005127, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003301930264569819, "grad_norm": 8.000692367553711, "learning_rate": 1e-06, "loss": 0.5219, "mean_token_accuracy": 0.8461270332336426, "num_tokens": 646362742.0, "step": 16942 }, { "epoch": 2.155323750159013, "ewc_loss": 0.06827132403850555, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003287093713879585, "grad_norm": 7.9390435218811035, "learning_rate": 1e-06, "loss": 0.5133, "mean_token_accuracy": 0.8524937629699707, "num_tokens": 646403958.0, "step": 16943 }, { "epoch": 2.1554509604376033, "ewc_loss": 0.06854850053787231, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003314811328891665, "grad_norm": 8.031275749206543, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8572642803192139, "num_tokens": 646442872.0, "step": 16944 }, { "epoch": 2.155578170716194, "ewc_loss": 0.06822296977043152, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003282258112449199, "grad_norm": 7.947106838226318, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8716909885406494, "num_tokens": 646484642.0, "step": 16945 }, { "epoch": 2.1557053809947844, "ewc_loss": 0.06855808198451996, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003315769135951996, "grad_norm": 8.045100212097168, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8749570846557617, "num_tokens": 646521688.0, "step": 16946 }, { "epoch": 2.155832591273375, "ewc_loss": 0.0683295726776123, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003292918554507196, "grad_norm": 7.925909042358398, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.872198224067688, "num_tokens": 646560182.0, "step": 16947 }, { "epoch": 2.1559598015519654, "ewc_loss": 0.06850621104240417, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033105816692113876, "grad_norm": 8.032867431640625, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8694235682487488, "num_tokens": 646593252.0, "step": 16948 }, { "epoch": 2.156087011830556, "ewc_loss": 0.06811625510454178, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003296000650152564, "grad_norm": 7.9245500564575195, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8640192747116089, "num_tokens": 646635514.0, "step": 16949 }, { "epoch": 2.1562142221091465, "ewc_loss": 0.06836201250553131, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033205767977051437, "grad_norm": 8.050678253173828, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8530907034873962, "num_tokens": 646674785.0, "step": 16950 }, { "epoch": 2.156341432387737, "ewc_loss": 0.06816218793392181, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003300593525636941, "grad_norm": 7.969390392303467, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8650403022766113, "num_tokens": 646713832.0, "step": 16951 }, { "epoch": 2.1564686426663275, "ewc_loss": 0.06828523427248001, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033128983341157436, "grad_norm": 8.014603614807129, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8672477006912231, "num_tokens": 646752218.0, "step": 16952 }, { "epoch": 2.156595852944918, "ewc_loss": 0.06844329833984375, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000330429058521986, "grad_norm": 8.014327049255371, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8643662929534912, "num_tokens": 646786196.0, "step": 16953 }, { "epoch": 2.1567230632235086, "ewc_loss": 0.06834490597248077, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003294452035333961, "grad_norm": 8.619646072387695, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8579871654510498, "num_tokens": 646821462.0, "step": 16954 }, { "epoch": 2.156850273502099, "ewc_loss": 0.06763821840286255, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003248196735512465, "grad_norm": 7.861702919006348, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8723852634429932, "num_tokens": 646856613.0, "step": 16955 }, { "epoch": 2.1569774837806897, "ewc_loss": 0.06854219734668732, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003338595270179212, "grad_norm": 8.11591625213623, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8771642446517944, "num_tokens": 646893013.0, "step": 16956 }, { "epoch": 2.15710469405928, "ewc_loss": 0.06781461834907532, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032414228189736605, "grad_norm": 7.8764967918396, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8691591620445251, "num_tokens": 646936180.0, "step": 16957 }, { "epoch": 2.1572319043378707, "ewc_loss": 0.06882897019386292, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033428578171879053, "grad_norm": 8.08267593383789, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8655319213867188, "num_tokens": 646979314.0, "step": 16958 }, { "epoch": 2.157359114616461, "ewc_loss": 0.06780888140201569, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003265262639615685, "grad_norm": 7.934121131896973, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8755991458892822, "num_tokens": 647020136.0, "step": 16959 }, { "epoch": 2.1574863248950513, "ewc_loss": 0.0683327466249466, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033176501165144145, "grad_norm": 8.112237930297852, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.869103729724884, "num_tokens": 647052970.0, "step": 16960 }, { "epoch": 2.157613535173642, "ewc_loss": 0.06819084286689758, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003279044758528471, "grad_norm": 7.963037967681885, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8587955236434937, "num_tokens": 647093195.0, "step": 16961 }, { "epoch": 2.1577407454522324, "ewc_loss": 0.06851141154766083, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003311102627776563, "grad_norm": 8.029664993286133, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8662859201431274, "num_tokens": 647133800.0, "step": 16962 }, { "epoch": 2.157867955730823, "ewc_loss": 0.06829630583524704, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032895916956476867, "grad_norm": 8.083940505981445, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8494815826416016, "num_tokens": 647167704.0, "step": 16963 }, { "epoch": 2.1579951660094134, "ewc_loss": 0.06802739948034286, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032871149596758187, "grad_norm": 8.02183723449707, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8522125482559204, "num_tokens": 647208474.0, "step": 16964 }, { "epoch": 2.158122376288004, "ewc_loss": 0.06804110109806061, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032884854590520263, "grad_norm": 8.644560813903809, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.87835294008255, "num_tokens": 647242131.0, "step": 16965 }, { "epoch": 2.1582495865665945, "ewc_loss": 0.06743074953556061, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003227449778933078, "grad_norm": 7.775613784790039, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8450604677200317, "num_tokens": 647280952.0, "step": 16966 }, { "epoch": 2.158376796845185, "ewc_loss": 0.06884296238422394, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003368671459611505, "grad_norm": 8.270328521728516, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8595832586288452, "num_tokens": 647318411.0, "step": 16967 }, { "epoch": 2.1585040071237755, "ewc_loss": 0.0675303041934967, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032129912870004773, "grad_norm": 7.786665916442871, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8517534136772156, "num_tokens": 647361002.0, "step": 16968 }, { "epoch": 2.158631217402366, "ewc_loss": 0.06929203867912292, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033891649218276143, "grad_norm": 8.244601249694824, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8677700757980347, "num_tokens": 647402995.0, "step": 16969 }, { "epoch": 2.1587584276809566, "ewc_loss": 0.06782714277505875, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032426751567982137, "grad_norm": 7.846273899078369, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8673460483551025, "num_tokens": 647441220.0, "step": 16970 }, { "epoch": 2.158885637959547, "ewc_loss": 0.06909272819757462, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033692337456159294, "grad_norm": 8.169633865356445, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8575568199157715, "num_tokens": 647476832.0, "step": 16971 }, { "epoch": 2.1590128482381377, "ewc_loss": 0.06824858486652374, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032848192495293915, "grad_norm": 7.9824700355529785, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.866970419883728, "num_tokens": 647516855.0, "step": 16972 }, { "epoch": 2.159140058516728, "ewc_loss": 0.06872539222240448, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033325000549666584, "grad_norm": 8.154072761535645, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8622101545333862, "num_tokens": 647556913.0, "step": 16973 }, { "epoch": 2.1592672687953187, "ewc_loss": 0.06819061934947968, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032790223485790193, "grad_norm": 7.906197547912598, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8820532560348511, "num_tokens": 647591242.0, "step": 16974 }, { "epoch": 2.1593944790739092, "ewc_loss": 0.06875590980052948, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003335552173666656, "grad_norm": 8.106473922729492, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8686712384223938, "num_tokens": 647632156.0, "step": 16975 }, { "epoch": 2.1595216893524998, "ewc_loss": 0.06808731704950333, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003268692526035011, "grad_norm": 7.932834148406982, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8708516359329224, "num_tokens": 647669857.0, "step": 16976 }, { "epoch": 2.1596488996310903, "ewc_loss": 0.06861358880996704, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033213195274583995, "grad_norm": 8.083294868469238, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8525499701499939, "num_tokens": 647715580.0, "step": 16977 }, { "epoch": 2.159776109909681, "ewc_loss": 0.06828676909208298, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032886379631236196, "grad_norm": 7.962386131286621, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8559714555740356, "num_tokens": 647755917.0, "step": 16978 }, { "epoch": 2.1599033201882714, "ewc_loss": 0.06851065158843994, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003311026084702462, "grad_norm": 8.02038288116455, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8809413909912109, "num_tokens": 647802205.0, "step": 16979 }, { "epoch": 2.160030530466862, "ewc_loss": 0.06837454438209534, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032974156783893704, "grad_norm": 8.027192115783691, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.851938009262085, "num_tokens": 647846627.0, "step": 16980 }, { "epoch": 2.1601577407454524, "ewc_loss": 0.06834767758846283, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003294728812761605, "grad_norm": 8.045642852783203, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8564028739929199, "num_tokens": 647885033.0, "step": 16981 }, { "epoch": 2.160284951024043, "ewc_loss": 0.06807602196931839, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003291977336630225, "grad_norm": 8.011427879333496, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8557974696159363, "num_tokens": 647923370.0, "step": 16982 }, { "epoch": 2.1604121613026335, "ewc_loss": 0.06829683482646942, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032896449556574225, "grad_norm": 7.960717678070068, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8711895942687988, "num_tokens": 647963431.0, "step": 16983 }, { "epoch": 2.1605393715812236, "ewc_loss": 0.06819500029087067, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003303875564597547, "grad_norm": 8.017498016357422, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8568778038024902, "num_tokens": 648002854.0, "step": 16984 }, { "epoch": 2.160666581859814, "ewc_loss": 0.06804054975509644, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032884295796975493, "grad_norm": 7.963739395141602, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8517956733703613, "num_tokens": 648041082.0, "step": 16985 }, { "epoch": 2.1607937921384046, "ewc_loss": 0.06832461059093475, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003316835791338235, "grad_norm": 8.049362182617188, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8562103509902954, "num_tokens": 648082109.0, "step": 16986 }, { "epoch": 2.160921002416995, "ewc_loss": 0.06817857921123505, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033022332354448736, "grad_norm": 7.9661688804626465, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8793801069259644, "num_tokens": 648117398.0, "step": 16987 }, { "epoch": 2.1610482126955857, "ewc_loss": 0.06831491738557816, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033158669248223305, "grad_norm": 8.026138305664062, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8725281357765198, "num_tokens": 648158997.0, "step": 16988 }, { "epoch": 2.161175422974176, "ewc_loss": 0.06812798231840134, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003297173243481666, "grad_norm": 7.9730329513549805, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8624696731567383, "num_tokens": 648194334.0, "step": 16989 }, { "epoch": 2.1613026332527667, "ewc_loss": 0.06823308020830154, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003307682927697897, "grad_norm": 8.01126766204834, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.860487699508667, "num_tokens": 648230601.0, "step": 16990 }, { "epoch": 2.1614298435313573, "ewc_loss": 0.06820448487997055, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003304823476355523, "grad_norm": 7.944175720214844, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8596937656402588, "num_tokens": 648274979.0, "step": 16991 }, { "epoch": 2.161557053809948, "ewc_loss": 0.0686088353395462, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003320844261907041, "grad_norm": 8.06393814086914, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8489818572998047, "num_tokens": 648319229.0, "step": 16992 }, { "epoch": 2.1616842640885383, "ewc_loss": 0.06831993162631989, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000329195405356586, "grad_norm": 7.911476135253906, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8774086833000183, "num_tokens": 648354550.0, "step": 16993 }, { "epoch": 2.161811474367129, "ewc_loss": 0.06848610937595367, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003332986088935286, "grad_norm": 8.23221492767334, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.877765417098999, "num_tokens": 648389820.0, "step": 16994 }, { "epoch": 2.1619386846457194, "ewc_loss": 0.06784789264202118, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003269164590165019, "grad_norm": 7.878419876098633, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8796378374099731, "num_tokens": 648423841.0, "step": 16995 }, { "epoch": 2.16206589492431, "ewc_loss": 0.06869940459728241, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033543151221238077, "grad_norm": 8.25817584991455, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8532872200012207, "num_tokens": 648459865.0, "step": 16996 }, { "epoch": 2.1621931052029004, "ewc_loss": 0.06763563305139542, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032479382934980094, "grad_norm": 7.811326503753662, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8826025724411011, "num_tokens": 648492639.0, "step": 16997 }, { "epoch": 2.162320315481491, "ewc_loss": 0.06911223381757736, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033711842843331397, "grad_norm": 8.173867225646973, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8706820011138916, "num_tokens": 648533456.0, "step": 16998 }, { "epoch": 2.1624475257600815, "ewc_loss": 0.06805744767189026, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032657053088769317, "grad_norm": 7.918792724609375, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8789713382720947, "num_tokens": 648571955.0, "step": 16999 }, { "epoch": 2.162574736038672, "ewc_loss": 0.06853355467319489, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003337730886414647, "grad_norm": 8.000168800354004, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8811563849449158, "num_tokens": 648614284.0, "step": 17000 }, { "epoch": 2.1627019463172625, "ewc_loss": 0.06820361316204071, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003304736455902457, "grad_norm": 8.025288581848145, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8737132549285889, "num_tokens": 648654399.0, "step": 17001 }, { "epoch": 2.162829156595853, "ewc_loss": 0.06859563291072845, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033195241121575236, "grad_norm": 8.080765724182129, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8690706491470337, "num_tokens": 648694490.0, "step": 17002 }, { "epoch": 2.1629563668744436, "ewc_loss": 0.06812341511249542, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032967166043817997, "grad_norm": 7.944604873657227, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8710272312164307, "num_tokens": 648730665.0, "step": 17003 }, { "epoch": 2.163083577153034, "ewc_loss": 0.06834091246128082, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003318466478958726, "grad_norm": 8.082831382751465, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8689537644386292, "num_tokens": 648765289.0, "step": 17004 }, { "epoch": 2.1632107874316246, "ewc_loss": 0.06814417243003845, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003298791707493365, "grad_norm": 7.924673080444336, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8661977052688599, "num_tokens": 648803982.0, "step": 17005 }, { "epoch": 2.163337997710215, "ewc_loss": 0.06855641305446625, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033400164102204144, "grad_norm": 8.071282386779785, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.870032787322998, "num_tokens": 648840992.0, "step": 17006 }, { "epoch": 2.1634652079888053, "ewc_loss": 0.06838595867156982, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000329855625750497, "grad_norm": 7.978776454925537, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8617305159568787, "num_tokens": 648883621.0, "step": 17007 }, { "epoch": 2.1635924182673962, "ewc_loss": 0.06845439970493317, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033298146445304155, "grad_norm": 8.039889335632324, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.851215660572052, "num_tokens": 648922258.0, "step": 17008 }, { "epoch": 2.1637196285459863, "ewc_loss": 0.06833547353744507, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032935087801888585, "grad_norm": 7.952731609344482, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.864830732345581, "num_tokens": 648965010.0, "step": 17009 }, { "epoch": 2.163846838824577, "ewc_loss": 0.06831765919923782, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003316141082905233, "grad_norm": 8.013216972351074, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8602849841117859, "num_tokens": 649008124.0, "step": 17010 }, { "epoch": 2.1639740491031674, "ewc_loss": 0.06842227280139923, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003302187833469361, "grad_norm": 7.969303607940674, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8658605813980103, "num_tokens": 649044743.0, "step": 17011 }, { "epoch": 2.164101259381758, "ewc_loss": 0.06829437613487244, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003313812776468694, "grad_norm": 7.9956159591674805, "learning_rate": 1e-06, "loss": 0.4144, "mean_token_accuracy": 0.8786383271217346, "num_tokens": 649078410.0, "step": 17012 }, { "epoch": 2.1642284696603484, "ewc_loss": 0.06822998821735382, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003307374136056751, "grad_norm": 8.00340747833252, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8753363490104675, "num_tokens": 649115415.0, "step": 17013 }, { "epoch": 2.164355679938939, "ewc_loss": 0.06818942725658417, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003303317353129387, "grad_norm": 8.049776077270508, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8629521727561951, "num_tokens": 649156266.0, "step": 17014 }, { "epoch": 2.1644828902175295, "ewc_loss": 0.06809261441230774, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003293636837042868, "grad_norm": 8.029444694519043, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8726966381072998, "num_tokens": 649192552.0, "step": 17015 }, { "epoch": 2.16461010049612, "ewc_loss": 0.06824024021625519, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033083988819271326, "grad_norm": 8.004806518554688, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8680540323257446, "num_tokens": 649224163.0, "step": 17016 }, { "epoch": 2.1647373107747105, "ewc_loss": 0.06813199073076248, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003297574003227055, "grad_norm": 7.917417526245117, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8864034414291382, "num_tokens": 649259348.0, "step": 17017 }, { "epoch": 2.164864521053301, "ewc_loss": 0.06833173334598541, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033175485441461205, "grad_norm": 7.965859413146973, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8806651830673218, "num_tokens": 649296875.0, "step": 17018 }, { "epoch": 2.1649917313318916, "ewc_loss": 0.06818535178899765, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003302910190541297, "grad_norm": 7.93855094909668, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8522396683692932, "num_tokens": 649333455.0, "step": 17019 }, { "epoch": 2.165118941610482, "ewc_loss": 0.06842277944087982, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033266530954279006, "grad_norm": 8.030728340148926, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8679525852203369, "num_tokens": 649364567.0, "step": 17020 }, { "epoch": 2.1652461518890727, "ewc_loss": 0.06817454844713211, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003301829856354743, "grad_norm": 7.9870195388793945, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.8479715585708618, "num_tokens": 649400579.0, "step": 17021 }, { "epoch": 2.165373362167663, "ewc_loss": 0.06841343641281128, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003325719153508544, "grad_norm": 7.989136695861816, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.8772327303886414, "num_tokens": 649432940.0, "step": 17022 }, { "epoch": 2.1655005724462537, "ewc_loss": 0.06834907084703445, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003319282259326428, "grad_norm": 8.018610000610352, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8706972599029541, "num_tokens": 649466695.0, "step": 17023 }, { "epoch": 2.1656277827248442, "ewc_loss": 0.06822093576192856, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003306468715891242, "grad_norm": 7.984121322631836, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8625023365020752, "num_tokens": 649504578.0, "step": 17024 }, { "epoch": 2.1657549930034348, "ewc_loss": 0.06865054368972778, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033250154228881, "grad_norm": 8.106010437011719, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8547675609588623, "num_tokens": 649541536.0, "step": 17025 }, { "epoch": 2.1658822032820253, "ewc_loss": 0.06827839463949203, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003287800354883075, "grad_norm": 7.963350772857666, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.864912211894989, "num_tokens": 649581280.0, "step": 17026 }, { "epoch": 2.166009413560616, "ewc_loss": 0.0685780867934227, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003317769442219287, "grad_norm": 8.065614700317383, "learning_rate": 1e-06, "loss": 0.3971, "mean_token_accuracy": 0.8872116804122925, "num_tokens": 649619837.0, "step": 17027 }, { "epoch": 2.1661366238392064, "ewc_loss": 0.06829015910625458, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032889764406718314, "grad_norm": 7.897639751434326, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8775397539138794, "num_tokens": 649654708.0, "step": 17028 }, { "epoch": 2.166263834117797, "ewc_loss": 0.0688154399394989, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033415050711482763, "grad_norm": 8.085660934448242, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8444169759750366, "num_tokens": 649691709.0, "step": 17029 }, { "epoch": 2.1663910443963874, "ewc_loss": 0.06820164620876312, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032801253837533295, "grad_norm": 7.942455291748047, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8617246747016907, "num_tokens": 649734876.0, "step": 17030 }, { "epoch": 2.166518254674978, "ewc_loss": 0.0687338262796402, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003333343775011599, "grad_norm": 8.119175910949707, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8743509650230408, "num_tokens": 649775477.0, "step": 17031 }, { "epoch": 2.166645464953568, "ewc_loss": 0.06810382008552551, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003270342422183603, "grad_norm": 7.9494099617004395, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8775944113731384, "num_tokens": 649816136.0, "step": 17032 }, { "epoch": 2.1667726752321586, "ewc_loss": 0.06859463453292847, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033194239949807525, "grad_norm": 8.105847358703613, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8749517202377319, "num_tokens": 649847942.0, "step": 17033 }, { "epoch": 2.166899885510749, "ewc_loss": 0.06810574233531952, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003270535380579531, "grad_norm": 7.91455078125, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8628619909286499, "num_tokens": 649887875.0, "step": 17034 }, { "epoch": 2.1670270957893396, "ewc_loss": 0.06852620840072632, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033125816844403744, "grad_norm": 8.086490631103516, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8526568412780762, "num_tokens": 649920557.0, "step": 17035 }, { "epoch": 2.16715430606793, "ewc_loss": 0.06807741522789001, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003267702995799482, "grad_norm": 7.921290874481201, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8744381666183472, "num_tokens": 649959704.0, "step": 17036 }, { "epoch": 2.1672815163465207, "ewc_loss": 0.06856302917003632, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003316263610031456, "grad_norm": 7.9778971672058105, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8708614706993103, "num_tokens": 650000433.0, "step": 17037 }, { "epoch": 2.167408726625111, "ewc_loss": 0.06829825043678284, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032897855271585286, "grad_norm": 7.977363109588623, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8784295916557312, "num_tokens": 650038793.0, "step": 17038 }, { "epoch": 2.1675359369037017, "ewc_loss": 0.06845913827419281, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033058749977499247, "grad_norm": 8.022069931030273, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8762575387954712, "num_tokens": 650072877.0, "step": 17039 }, { "epoch": 2.1676631471822922, "ewc_loss": 0.06829723715782166, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003289684245828539, "grad_norm": 7.9976911544799805, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8540125489234924, "num_tokens": 650109245.0, "step": 17040 }, { "epoch": 2.1677903574608828, "ewc_loss": 0.06849515438079834, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033094760146923363, "grad_norm": 8.039410591125488, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8800325393676758, "num_tokens": 650150254.0, "step": 17041 }, { "epoch": 2.1679175677394733, "ewc_loss": 0.06826765835285187, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032867270056158304, "grad_norm": 7.974758148193359, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8456099033355713, "num_tokens": 650189587.0, "step": 17042 }, { "epoch": 2.168044778018064, "ewc_loss": 0.06831695884466171, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032916569034568965, "grad_norm": 8.115655899047852, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8626895546913147, "num_tokens": 650225443.0, "step": 17043 }, { "epoch": 2.1681719882966544, "ewc_loss": 0.06823620200157166, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003283581172581762, "grad_norm": 7.993556022644043, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8618417382240295, "num_tokens": 650262319.0, "step": 17044 }, { "epoch": 2.168299198575245, "ewc_loss": 0.06831061840057373, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003291023022029549, "grad_norm": 8.018277168273926, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8488156795501709, "num_tokens": 650297137.0, "step": 17045 }, { "epoch": 2.1684264088538354, "ewc_loss": 0.06792020797729492, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032763960189186037, "grad_norm": 7.9680938720703125, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8638728260993958, "num_tokens": 650335521.0, "step": 17046 }, { "epoch": 2.168553619132426, "ewc_loss": 0.06838760524988174, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032987212762236595, "grad_norm": 8.108786582946777, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8603823184967041, "num_tokens": 650373050.0, "step": 17047 }, { "epoch": 2.1686808294110165, "ewc_loss": 0.06800387799739838, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000326034874888137, "grad_norm": 7.913486957550049, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8723259568214417, "num_tokens": 650407438.0, "step": 17048 }, { "epoch": 2.168808039689607, "ewc_loss": 0.06849701702594757, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033096622792072594, "grad_norm": 8.069232940673828, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8613349199295044, "num_tokens": 650447793.0, "step": 17049 }, { "epoch": 2.1689352499681975, "ewc_loss": 0.06808172166347504, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003268133441451937, "grad_norm": 7.936066150665283, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8607112765312195, "num_tokens": 650487167.0, "step": 17050 }, { "epoch": 2.169062460246788, "ewc_loss": 0.06855593621730804, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033155540586449206, "grad_norm": 8.091976165771484, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.852649986743927, "num_tokens": 650522960.0, "step": 17051 }, { "epoch": 2.1691896705253786, "ewc_loss": 0.06805257499217987, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003265218692831695, "grad_norm": 7.932689189910889, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8590921759605408, "num_tokens": 650564161.0, "step": 17052 }, { "epoch": 2.169316880803969, "ewc_loss": 0.06862995028495789, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033229554537683725, "grad_norm": 8.183064460754395, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8718911409378052, "num_tokens": 650599201.0, "step": 17053 }, { "epoch": 2.1694440910825596, "ewc_loss": 0.06819207221269608, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032791681587696075, "grad_norm": 7.948119640350342, "learning_rate": 1e-06, "loss": 0.5122, "mean_token_accuracy": 0.8470966815948486, "num_tokens": 650637385.0, "step": 17054 }, { "epoch": 2.16957130136115, "ewc_loss": 0.06892649829387665, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033281961805187166, "grad_norm": 8.134708404541016, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8663213849067688, "num_tokens": 650671954.0, "step": 17055 }, { "epoch": 2.1696985116397407, "ewc_loss": 0.06813515722751617, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032734766136854887, "grad_norm": 7.8732733726501465, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8828896880149841, "num_tokens": 650708475.0, "step": 17056 }, { "epoch": 2.169825721918331, "ewc_loss": 0.0689990222454071, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003335449146106839, "grad_norm": 8.076353073120117, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8689145445823669, "num_tokens": 650748089.0, "step": 17057 }, { "epoch": 2.1699529321969213, "ewc_loss": 0.06823929399251938, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032838902552612126, "grad_norm": 7.900324821472168, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8680351972579956, "num_tokens": 650790869.0, "step": 17058 }, { "epoch": 2.170080142475512, "ewc_loss": 0.06884776055812836, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033447364694438875, "grad_norm": 8.140482902526855, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8520747423171997, "num_tokens": 650830697.0, "step": 17059 }, { "epoch": 2.1702073527541024, "ewc_loss": 0.06825055181980133, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032850162824615836, "grad_norm": 7.963676452636719, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8753370642662048, "num_tokens": 650868755.0, "step": 17060 }, { "epoch": 2.170334563032693, "ewc_loss": 0.06876187771558762, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003336148802191019, "grad_norm": 8.146862030029297, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8764472007751465, "num_tokens": 650908019.0, "step": 17061 }, { "epoch": 2.1704617733112834, "ewc_loss": 0.0683060735464096, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003290568711236119, "grad_norm": 7.910160541534424, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8644024133682251, "num_tokens": 650954195.0, "step": 17062 }, { "epoch": 2.170588983589874, "ewc_loss": 0.06875763088464737, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003335723886266351, "grad_norm": 8.048772811889648, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8800619840621948, "num_tokens": 650992740.0, "step": 17063 }, { "epoch": 2.1707161938684645, "ewc_loss": 0.06814941763877869, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003274902992416173, "grad_norm": 7.948897361755371, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.879360020160675, "num_tokens": 651031524.0, "step": 17064 }, { "epoch": 2.170843404147055, "ewc_loss": 0.06877945363521576, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003337905800435692, "grad_norm": 8.087141990661621, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.86520916223526, "num_tokens": 651070004.0, "step": 17065 }, { "epoch": 2.1709706144256455, "ewc_loss": 0.06832434237003326, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032923955586738884, "grad_norm": 7.978672027587891, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8519977927207947, "num_tokens": 651104474.0, "step": 17066 }, { "epoch": 2.171097824704236, "ewc_loss": 0.06861282885074615, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003321243275422603, "grad_norm": 8.011321067810059, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8577793836593628, "num_tokens": 651148913.0, "step": 17067 }, { "epoch": 2.1712250349828266, "ewc_loss": 0.06858789920806885, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003318751114420593, "grad_norm": 7.973931789398193, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8741912841796875, "num_tokens": 651192072.0, "step": 17068 }, { "epoch": 2.171352245261417, "ewc_loss": 0.06863326579332352, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003323287528473884, "grad_norm": 8.109097480773926, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8578782081604004, "num_tokens": 651226132.0, "step": 17069 }, { "epoch": 2.1714794555400077, "ewc_loss": 0.06837503612041473, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032974648638628423, "grad_norm": 8.02518081665039, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8681372404098511, "num_tokens": 651263467.0, "step": 17070 }, { "epoch": 2.171606665818598, "ewc_loss": 0.0686914324760437, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003329103928990662, "grad_norm": 8.074721336364746, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8571357727050781, "num_tokens": 651309521.0, "step": 17071 }, { "epoch": 2.1717338760971887, "ewc_loss": 0.0683029443025589, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032902558450587094, "grad_norm": 7.960654258728027, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8678194880485535, "num_tokens": 651347026.0, "step": 17072 }, { "epoch": 2.1718610863757792, "ewc_loss": 0.06869994103908539, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003329955507069826, "grad_norm": 8.076363563537598, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8753254413604736, "num_tokens": 651380545.0, "step": 17073 }, { "epoch": 2.1719882966543698, "ewc_loss": 0.06838304549455643, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003298265510238707, "grad_norm": 8.031343460083008, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8709437847137451, "num_tokens": 651409428.0, "step": 17074 }, { "epoch": 2.1721155069329603, "ewc_loss": 0.0685122013092041, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003311180917080492, "grad_norm": 8.013107299804688, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8527512550354004, "num_tokens": 651447993.0, "step": 17075 }, { "epoch": 2.172242717211551, "ewc_loss": 0.0685766190290451, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003317622758913785, "grad_norm": 10.81493854522705, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8600638508796692, "num_tokens": 651488210.0, "step": 17076 }, { "epoch": 2.1723699274901414, "ewc_loss": 0.07035485655069351, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034710325417108834, "grad_norm": 7.997647285461426, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8618126511573792, "num_tokens": 651525742.0, "step": 17077 }, { "epoch": 2.172497137768732, "ewc_loss": 0.07040051370859146, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00035000123898498714, "grad_norm": 8.444476127624512, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8719599843025208, "num_tokens": 651565569.0, "step": 17078 }, { "epoch": 2.1726243480473224, "ewc_loss": 0.0683625340461731, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032962148543447256, "grad_norm": 7.986691474914551, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8603605628013611, "num_tokens": 651605291.0, "step": 17079 }, { "epoch": 2.172751558325913, "ewc_loss": 0.07030858844518661, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00034908196539618075, "grad_norm": 8.366910934448242, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8662675619125366, "num_tokens": 651641998.0, "step": 17080 }, { "epoch": 2.1728787686045035, "ewc_loss": 0.06857676059007645, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003317637019790709, "grad_norm": 8.02907657623291, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.863195538520813, "num_tokens": 651683040.0, "step": 17081 }, { "epoch": 2.1730059788830935, "ewc_loss": 0.0694829449057579, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003408255288377404, "grad_norm": 8.25113296508789, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8740403056144714, "num_tokens": 651718067.0, "step": 17082 }, { "epoch": 2.173133189161684, "ewc_loss": 0.0688263475894928, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033425961737520993, "grad_norm": 8.042682647705078, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.8810293674468994, "num_tokens": 651761995.0, "step": 17083 }, { "epoch": 2.1732603994402746, "ewc_loss": 0.06932205706834793, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003367752651683986, "grad_norm": 8.187705039978027, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8814945220947266, "num_tokens": 651797780.0, "step": 17084 }, { "epoch": 2.173387609718865, "ewc_loss": 0.0685880184173584, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000331876304699108, "grad_norm": 8.036263465881348, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8541055917739868, "num_tokens": 651832931.0, "step": 17085 }, { "epoch": 2.1735148199974557, "ewc_loss": 0.06901265680789948, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033612269908189774, "grad_norm": 8.21027660369873, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8598259687423706, "num_tokens": 651868440.0, "step": 17086 }, { "epoch": 2.173642030276046, "ewc_loss": 0.06874576210975647, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033101235749199986, "grad_norm": 8.2304048538208, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8686386346817017, "num_tokens": 651900029.0, "step": 17087 }, { "epoch": 2.1737692405546367, "ewc_loss": 0.06857822835445404, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033177834120579064, "grad_norm": 8.060013771057129, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.861456036567688, "num_tokens": 651944892.0, "step": 17088 }, { "epoch": 2.1738964508332272, "ewc_loss": 0.06882339715957642, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033178870216943324, "grad_norm": 8.080265045166016, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8730727434158325, "num_tokens": 651979330.0, "step": 17089 }, { "epoch": 2.1740236611118178, "ewc_loss": 0.06841105967760086, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003301067044958472, "grad_norm": 8.052957534790039, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8575007915496826, "num_tokens": 652017389.0, "step": 17090 }, { "epoch": 2.1741508713904083, "ewc_loss": 0.06854429841041565, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003314390778541565, "grad_norm": 8.09089469909668, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8620606660842896, "num_tokens": 652052260.0, "step": 17091 }, { "epoch": 2.174278081668999, "ewc_loss": 0.06849425286054611, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003309386083856225, "grad_norm": 8.132004737854004, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.851104736328125, "num_tokens": 652089917.0, "step": 17092 }, { "epoch": 2.1744052919475894, "ewc_loss": 0.06833630800247192, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032935914350673556, "grad_norm": 7.991201877593994, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.869371771812439, "num_tokens": 652129849.0, "step": 17093 }, { "epoch": 2.17453250222618, "ewc_loss": 0.06860242784023285, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033202042686752975, "grad_norm": 8.194344520568848, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8573918342590332, "num_tokens": 652163178.0, "step": 17094 }, { "epoch": 2.1746597125047704, "ewc_loss": 0.06837890297174454, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00032734370324760675, "grad_norm": 7.997514724731445, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8727461099624634, "num_tokens": 652201521.0, "step": 17095 }, { "epoch": 2.174786922783361, "ewc_loss": 0.0688033252954483, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033158791484311223, "grad_norm": 8.077709197998047, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.867582380771637, "num_tokens": 652234820.0, "step": 17096 }, { "epoch": 2.1749141330619515, "ewc_loss": 0.06826631724834442, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003286592254880816, "grad_norm": 8.057312965393066, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8621041178703308, "num_tokens": 652275686.0, "step": 17097 }, { "epoch": 2.175041343340542, "ewc_loss": 0.06838919222354889, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032988801831379533, "grad_norm": 8.009841918945312, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8569539785385132, "num_tokens": 652313243.0, "step": 17098 }, { "epoch": 2.1751685536191325, "ewc_loss": 0.06854912638664246, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033148733200505376, "grad_norm": 8.16092300415039, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8806403875350952, "num_tokens": 652351550.0, "step": 17099 }, { "epoch": 2.175295763897723, "ewc_loss": 0.06833543628454208, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00032690903753973544, "grad_norm": 7.914927959442139, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8627814054489136, "num_tokens": 652399834.0, "step": 17100 }, { "epoch": 2.1754229741763136, "ewc_loss": 0.0688864067196846, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003324187418911606, "grad_norm": 8.092103958129883, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8522458076477051, "num_tokens": 652435839.0, "step": 17101 }, { "epoch": 2.175550184454904, "ewc_loss": 0.06810371577739716, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003270332526881248, "grad_norm": 7.98645544052124, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8617443442344666, "num_tokens": 652475747.0, "step": 17102 }, { "epoch": 2.1756773947334946, "ewc_loss": 0.06880372762680054, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033159193117171526, "grad_norm": 7.9788384437561035, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.877821683883667, "num_tokens": 652517147.0, "step": 17103 }, { "epoch": 2.175804605012085, "ewc_loss": 0.06837056577205658, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003297017246950418, "grad_norm": 7.944451808929443, "learning_rate": 1e-06, "loss": 0.4207, "mean_token_accuracy": 0.8782011270523071, "num_tokens": 652552811.0, "step": 17104 }, { "epoch": 2.1759318152906753, "ewc_loss": 0.06843976676464081, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000330393755575642, "grad_norm": 8.008398056030273, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.872247576713562, "num_tokens": 652587709.0, "step": 17105 }, { "epoch": 2.1760590255692662, "ewc_loss": 0.06852969527244568, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033129300572909415, "grad_norm": 8.013080596923828, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8638510704040527, "num_tokens": 652626213.0, "step": 17106 }, { "epoch": 2.1761862358478563, "ewc_loss": 0.06849700212478638, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033096614060923457, "grad_norm": 7.9750566482543945, "learning_rate": 1e-06, "loss": 0.5572, "mean_token_accuracy": 0.8368580341339111, "num_tokens": 652665692.0, "step": 17107 }, { "epoch": 2.176313446126447, "ewc_loss": 0.06846334040164948, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003306294674985111, "grad_norm": 7.960044860839844, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8577748537063599, "num_tokens": 652705515.0, "step": 17108 }, { "epoch": 2.1764406564050374, "ewc_loss": 0.06853291392326355, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003313252527732402, "grad_norm": 7.961739540100098, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8770962953567505, "num_tokens": 652743343.0, "step": 17109 }, { "epoch": 2.176567866683628, "ewc_loss": 0.06861095130443573, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033210558467544615, "grad_norm": 7.960165500640869, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8596841096878052, "num_tokens": 652784587.0, "step": 17110 }, { "epoch": 2.1766950769622184, "ewc_loss": 0.06847269833087921, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033072312362492085, "grad_norm": 7.9394097328186035, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.857347846031189, "num_tokens": 652821985.0, "step": 17111 }, { "epoch": 2.176822287240809, "ewc_loss": 0.0687405914068222, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033340195659548044, "grad_norm": 8.017032623291016, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8768154978752136, "num_tokens": 652855772.0, "step": 17112 }, { "epoch": 2.1769494975193995, "ewc_loss": 0.06855636090040207, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003315596841275692, "grad_norm": 7.949711799621582, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.86026531457901, "num_tokens": 652890497.0, "step": 17113 }, { "epoch": 2.17707670779799, "ewc_loss": 0.06871284544467926, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003331245097797364, "grad_norm": 7.952505588531494, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.876713216304779, "num_tokens": 652931611.0, "step": 17114 }, { "epoch": 2.1772039180765805, "ewc_loss": 0.06871574372053146, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033315352629870176, "grad_norm": 7.95614767074585, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.865485429763794, "num_tokens": 652972129.0, "step": 17115 }, { "epoch": 2.177331128355171, "ewc_loss": 0.06877414882183075, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033373761107213795, "grad_norm": 7.995815753936768, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.859184980392456, "num_tokens": 653009848.0, "step": 17116 }, { "epoch": 2.1774583386337616, "ewc_loss": 0.06866203248500824, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033261641510762274, "grad_norm": 7.992551803588867, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8624414205551147, "num_tokens": 653044614.0, "step": 17117 }, { "epoch": 2.177585548912352, "ewc_loss": 0.06875770539045334, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033357314532622695, "grad_norm": 7.930010795593262, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8800210952758789, "num_tokens": 653087614.0, "step": 17118 }, { "epoch": 2.1777127591909426, "ewc_loss": 0.06877366453409195, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003337327507324517, "grad_norm": 8.020513534545898, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8793774247169495, "num_tokens": 653120434.0, "step": 17119 }, { "epoch": 2.177839969469533, "ewc_loss": 0.06869877129793167, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003329837927594781, "grad_norm": 7.995457172393799, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8749669194221497, "num_tokens": 653159390.0, "step": 17120 }, { "epoch": 2.1779671797481237, "ewc_loss": 0.06899825483560562, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033353723119944334, "grad_norm": 8.005586624145508, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8582471609115601, "num_tokens": 653201045.0, "step": 17121 }, { "epoch": 2.1780943900267142, "ewc_loss": 0.06854245066642761, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003314206260256469, "grad_norm": 7.963161945343018, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8680300116539001, "num_tokens": 653244968.0, "step": 17122 }, { "epoch": 2.1782216003053048, "ewc_loss": 0.06874692440032959, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033346531563438475, "grad_norm": 8.004158973693848, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8596431016921997, "num_tokens": 653281157.0, "step": 17123 }, { "epoch": 2.1783488105838953, "ewc_loss": 0.0685393437743187, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003313895140308887, "grad_norm": 7.938233852386475, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.878649890422821, "num_tokens": 653321055.0, "step": 17124 }, { "epoch": 2.178476020862486, "ewc_loss": 0.06871134042739868, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033310955041088164, "grad_norm": 8.041197776794434, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8534797430038452, "num_tokens": 653359320.0, "step": 17125 }, { "epoch": 2.1786032311410763, "ewc_loss": 0.06838379800319672, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032983411801978946, "grad_norm": 7.935346603393555, "learning_rate": 1e-06, "loss": 0.3905, "mean_token_accuracy": 0.8866010904312134, "num_tokens": 653404186.0, "step": 17126 }, { "epoch": 2.178730441419667, "ewc_loss": 0.0686742514371872, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003327385929878801, "grad_norm": 8.005234718322754, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8754501342773438, "num_tokens": 653438122.0, "step": 17127 }, { "epoch": 2.1788576516982574, "ewc_loss": 0.06847689300775528, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033076503314077854, "grad_norm": 7.954845905303955, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8655550479888916, "num_tokens": 653474727.0, "step": 17128 }, { "epoch": 2.178984861976848, "ewc_loss": 0.06859983503818512, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033199440804310143, "grad_norm": 8.018760681152344, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8719834685325623, "num_tokens": 653513531.0, "step": 17129 }, { "epoch": 2.179112072255438, "ewc_loss": 0.06856855750083923, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003316817164886743, "grad_norm": 8.00752067565918, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8782054781913757, "num_tokens": 653555871.0, "step": 17130 }, { "epoch": 2.1792392825340285, "ewc_loss": 0.06852684915065765, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033126454218290746, "grad_norm": 7.935642242431641, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8683598041534424, "num_tokens": 653598825.0, "step": 17131 }, { "epoch": 2.179366492812619, "ewc_loss": 0.06859132647514343, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003319093957543373, "grad_norm": 8.066118240356445, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8758093118667603, "num_tokens": 653639838.0, "step": 17132 }, { "epoch": 2.1794937030912096, "ewc_loss": 0.06812817603349686, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003297192743048072, "grad_norm": 7.976727485656738, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8574531078338623, "num_tokens": 653678958.0, "step": 17133 }, { "epoch": 2.1796209133698, "ewc_loss": 0.06864270567893982, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033242316567339003, "grad_norm": 8.044353485107422, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8748478889465332, "num_tokens": 653718043.0, "step": 17134 }, { "epoch": 2.1797481236483907, "ewc_loss": 0.06807348132133484, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00032917235512286425, "grad_norm": 7.960631370544434, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8809215426445007, "num_tokens": 653756308.0, "step": 17135 }, { "epoch": 2.179875333926981, "ewc_loss": 0.06841149926185608, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003325524739921093, "grad_norm": 8.035346984863281, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8745782375335693, "num_tokens": 653793140.0, "step": 17136 }, { "epoch": 2.1800025442055717, "ewc_loss": 0.06815701723098755, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003300076932646334, "grad_norm": 8.044862747192383, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8713209629058838, "num_tokens": 653823441.0, "step": 17137 }, { "epoch": 2.1801297544841622, "ewc_loss": 0.06819422543048859, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033037978573702276, "grad_norm": 8.06784439086914, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8571475744247437, "num_tokens": 653860743.0, "step": 17138 }, { "epoch": 2.1802569647627528, "ewc_loss": 0.06824839115142822, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003309214662294835, "grad_norm": 7.996645927429199, "learning_rate": 1e-06, "loss": 0.409, "mean_token_accuracy": 0.8780409097671509, "num_tokens": 653894785.0, "step": 17139 }, { "epoch": 2.1803841750413433, "ewc_loss": 0.06827858090400696, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003312232729513198, "grad_norm": 7.988253593444824, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8622595071792603, "num_tokens": 653932208.0, "step": 17140 }, { "epoch": 2.180511385319934, "ewc_loss": 0.06849504262208939, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033094652462750673, "grad_norm": 8.01406192779541, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8525540828704834, "num_tokens": 653968818.0, "step": 17141 }, { "epoch": 2.1806385955985244, "ewc_loss": 0.06841759383678436, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033017201349139214, "grad_norm": 7.938204288482666, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8641034364700317, "num_tokens": 654010696.0, "step": 17142 }, { "epoch": 2.180765805877115, "ewc_loss": 0.06862220168113708, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003322181000839919, "grad_norm": 8.084922790527344, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.869962751865387, "num_tokens": 654047632.0, "step": 17143 }, { "epoch": 2.1808930161557054, "ewc_loss": 0.06831863522529602, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003291823959443718, "grad_norm": 7.947579860687256, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8698133230209351, "num_tokens": 654088200.0, "step": 17144 }, { "epoch": 2.181020226434296, "ewc_loss": 0.0686739832162857, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033273594453930855, "grad_norm": 8.052809715270996, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8631734848022461, "num_tokens": 654124225.0, "step": 17145 }, { "epoch": 2.1811474367128865, "ewc_loss": 0.06833862513303757, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003293823392596096, "grad_norm": 7.966500282287598, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8681540489196777, "num_tokens": 654163317.0, "step": 17146 }, { "epoch": 2.181274646991477, "ewc_loss": 0.06871253252029419, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033312139566987753, "grad_norm": 8.107694625854492, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8594163656234741, "num_tokens": 654199208.0, "step": 17147 }, { "epoch": 2.1814018572700675, "ewc_loss": 0.06830554455518723, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032905154512263834, "grad_norm": 7.981200695037842, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8547744154930115, "num_tokens": 654239988.0, "step": 17148 }, { "epoch": 2.181529067548658, "ewc_loss": 0.06874854862689972, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033348158467561007, "grad_norm": 8.115158081054688, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8595056533813477, "num_tokens": 654276361.0, "step": 17149 }, { "epoch": 2.1816562778272486, "ewc_loss": 0.06825346499681473, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003285307320766151, "grad_norm": 7.957074165344238, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8679954409599304, "num_tokens": 654316690.0, "step": 17150 }, { "epoch": 2.181783488105839, "ewc_loss": 0.06866847723722458, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033268085098825395, "grad_norm": 8.010496139526367, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8761551380157471, "num_tokens": 654354112.0, "step": 17151 }, { "epoch": 2.1819106983844296, "ewc_loss": 0.06835967302322388, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003295928181614727, "grad_norm": 7.9552154541015625, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8784766793251038, "num_tokens": 654398896.0, "step": 17152 }, { "epoch": 2.18203790866302, "ewc_loss": 0.06859244406223297, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033192054252140224, "grad_norm": 8.075081825256348, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8795002102851868, "num_tokens": 654435180.0, "step": 17153 }, { "epoch": 2.1821651189416107, "ewc_loss": 0.06837958097457886, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003297918592579663, "grad_norm": 7.962347507476807, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.85475093126297, "num_tokens": 654473958.0, "step": 17154 }, { "epoch": 2.1822923292202008, "ewc_loss": 0.06861324608325958, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033212857670150697, "grad_norm": 8.050654411315918, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8525374531745911, "num_tokens": 654512918.0, "step": 17155 }, { "epoch": 2.1824195394987913, "ewc_loss": 0.06837019324302673, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003296979994047433, "grad_norm": 7.9431047439575195, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8734043836593628, "num_tokens": 654552296.0, "step": 17156 }, { "epoch": 2.182546749777382, "ewc_loss": 0.06864170730113983, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033241312485188246, "grad_norm": 8.063140869140625, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8731770515441895, "num_tokens": 654592088.0, "step": 17157 }, { "epoch": 2.1826739600559724, "ewc_loss": 0.06839362531900406, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032993234344758093, "grad_norm": 8.010024070739746, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8591132164001465, "num_tokens": 654635119.0, "step": 17158 }, { "epoch": 2.182801170334563, "ewc_loss": 0.06866040825843811, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033260020427405834, "grad_norm": 8.050322532653809, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8538728952407837, "num_tokens": 654671204.0, "step": 17159 }, { "epoch": 2.1829283806131534, "ewc_loss": 0.06832975894212723, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032929368899203837, "grad_norm": 7.9352545738220215, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8647286891937256, "num_tokens": 654702986.0, "step": 17160 }, { "epoch": 2.183055590891744, "ewc_loss": 0.06868130713701248, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003328091697767377, "grad_norm": 8.082019805908203, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.842819094657898, "num_tokens": 654745813.0, "step": 17161 }, { "epoch": 2.1831828011703345, "ewc_loss": 0.06843778491020203, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003303739067632705, "grad_norm": 7.953192234039307, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8655160665512085, "num_tokens": 654783305.0, "step": 17162 }, { "epoch": 2.183310011448925, "ewc_loss": 0.0686725527048111, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003327215963508934, "grad_norm": 8.049018859863281, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8748420476913452, "num_tokens": 654823427.0, "step": 17163 }, { "epoch": 2.1834372217275155, "ewc_loss": 0.06848928332328796, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033088892814703286, "grad_norm": 7.996535301208496, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8799270391464233, "num_tokens": 654862400.0, "step": 17164 }, { "epoch": 2.183564432006106, "ewc_loss": 0.06863682717084885, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033236437593586743, "grad_norm": 7.9973273277282715, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8702117204666138, "num_tokens": 654902484.0, "step": 17165 }, { "epoch": 2.1836916422846966, "ewc_loss": 0.06855732947587967, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003315693757031113, "grad_norm": 8.005976676940918, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8730571269989014, "num_tokens": 654946882.0, "step": 17166 }, { "epoch": 2.183818852563287, "ewc_loss": 0.06858906149864197, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033188669476658106, "grad_norm": 8.043327331542969, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8566294312477112, "num_tokens": 654977850.0, "step": 17167 }, { "epoch": 2.1839460628418776, "ewc_loss": 0.06855840981006622, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003315802023280412, "grad_norm": 8.052922248840332, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8524436950683594, "num_tokens": 655015352.0, "step": 17168 }, { "epoch": 2.184073273120468, "ewc_loss": 0.06850112974643707, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003310074389446527, "grad_norm": 8.090412139892578, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8671828508377075, "num_tokens": 655047488.0, "step": 17169 }, { "epoch": 2.1842004833990587, "ewc_loss": 0.06842535734176636, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033024963340722024, "grad_norm": 7.964517593383789, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.864081621170044, "num_tokens": 655083410.0, "step": 17170 }, { "epoch": 2.1843276936776492, "ewc_loss": 0.06839078664779663, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003299039963167161, "grad_norm": 8.016036033630371, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8752110004425049, "num_tokens": 655120594.0, "step": 17171 }, { "epoch": 2.1844549039562398, "ewc_loss": 0.06846193969249725, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033061549765989184, "grad_norm": 8.04100513458252, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8640806078910828, "num_tokens": 655157310.0, "step": 17172 }, { "epoch": 2.1845821142348303, "ewc_loss": 0.06869326531887054, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003304873825982213, "grad_norm": 7.943291664123535, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8748835921287537, "num_tokens": 655193749.0, "step": 17173 }, { "epoch": 2.184709324513421, "ewc_loss": 0.0685601532459259, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033159757731482387, "grad_norm": 8.057004928588867, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8802127838134766, "num_tokens": 655235532.0, "step": 17174 }, { "epoch": 2.1848365347920113, "ewc_loss": 0.06836549937725067, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032965114223770797, "grad_norm": 7.973360061645508, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8735169172286987, "num_tokens": 655282154.0, "step": 17175 }, { "epoch": 2.184963745070602, "ewc_loss": 0.06852292269468307, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003312253102194518, "grad_norm": 8.061357498168945, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8590028285980225, "num_tokens": 655321342.0, "step": 17176 }, { "epoch": 2.1850909553491924, "ewc_loss": 0.0682687759399414, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003286839055363089, "grad_norm": 8.012836456298828, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8835511803627014, "num_tokens": 655358107.0, "step": 17177 }, { "epoch": 2.185218165627783, "ewc_loss": 0.06847899407148361, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003307860461063683, "grad_norm": 8.022098541259766, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8604332208633423, "num_tokens": 655393462.0, "step": 17178 }, { "epoch": 2.1853453759063735, "ewc_loss": 0.06843222677707672, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003303183475509286, "grad_norm": 7.995028495788574, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8678038716316223, "num_tokens": 655436114.0, "step": 17179 }, { "epoch": 2.1854725861849635, "ewc_loss": 0.06850400567054749, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033103616442531347, "grad_norm": 8.073405265808105, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8687194585800171, "num_tokens": 655473119.0, "step": 17180 }, { "epoch": 2.185599796463554, "ewc_loss": 0.06836302578449249, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003296263166703284, "grad_norm": 8.008479118347168, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8614295721054077, "num_tokens": 655515537.0, "step": 17181 }, { "epoch": 2.1857270067421446, "ewc_loss": 0.0683913305401802, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032990940962918103, "grad_norm": 7.99025297164917, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8590349555015564, "num_tokens": 655555944.0, "step": 17182 }, { "epoch": 2.185854217020735, "ewc_loss": 0.06847422569990158, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003307383449282497, "grad_norm": 8.084494590759277, "learning_rate": 1e-06, "loss": 0.519, "mean_token_accuracy": 0.8496376872062683, "num_tokens": 655589952.0, "step": 17183 }, { "epoch": 2.1859814272993257, "ewc_loss": 0.06828393042087555, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003288353618700057, "grad_norm": 7.981057167053223, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.8780362606048584, "num_tokens": 655628269.0, "step": 17184 }, { "epoch": 2.186108637577916, "ewc_loss": 0.06853333115577698, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033132938551716506, "grad_norm": 8.152082443237305, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8706014156341553, "num_tokens": 655657315.0, "step": 17185 }, { "epoch": 2.1862358478565067, "ewc_loss": 0.06815189123153687, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032751497928984463, "grad_norm": 7.948393821716309, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.866604208946228, "num_tokens": 655696117.0, "step": 17186 }, { "epoch": 2.1863630581350972, "ewc_loss": 0.06873908638954163, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003333869681227952, "grad_norm": 8.0469388961792, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8479447364807129, "num_tokens": 655738823.0, "step": 17187 }, { "epoch": 2.1864902684136878, "ewc_loss": 0.06821223348379135, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000328118447214365, "grad_norm": 7.882505416870117, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8565716743469238, "num_tokens": 655781457.0, "step": 17188 }, { "epoch": 2.1866174786922783, "ewc_loss": 0.06885118037462234, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003345079021528363, "grad_norm": 8.171927452087402, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8615933656692505, "num_tokens": 655824704.0, "step": 17189 }, { "epoch": 2.186744688970869, "ewc_loss": 0.06814122200012207, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003274083137512207, "grad_norm": 7.917677402496338, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8756318688392639, "num_tokens": 655860013.0, "step": 17190 }, { "epoch": 2.1868718992494594, "ewc_loss": 0.0688675194978714, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033467126195318997, "grad_norm": 8.135717391967773, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8641500473022461, "num_tokens": 655897146.0, "step": 17191 }, { "epoch": 2.18699910952805, "ewc_loss": 0.06845790147781372, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00032813375582918525, "grad_norm": 7.994427680969238, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8707234859466553, "num_tokens": 655934299.0, "step": 17192 }, { "epoch": 2.1871263198066404, "ewc_loss": 0.06873620301485062, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003333581262268126, "grad_norm": 8.017837524414062, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8716630935668945, "num_tokens": 655976554.0, "step": 17193 }, { "epoch": 2.187253530085231, "ewc_loss": 0.0683659166097641, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003296553040854633, "grad_norm": 8.081441879272461, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8779889941215515, "num_tokens": 656011756.0, "step": 17194 }, { "epoch": 2.1873807403638215, "ewc_loss": 0.0683743953704834, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032974008354358375, "grad_norm": 8.062347412109375, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8626316785812378, "num_tokens": 656054405.0, "step": 17195 }, { "epoch": 2.187507950642412, "ewc_loss": 0.06833037734031677, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003292998590040952, "grad_norm": 7.960163593292236, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8709235191345215, "num_tokens": 656094668.0, "step": 17196 }, { "epoch": 2.1876351609210025, "ewc_loss": 0.06853196024894714, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033131567761301994, "grad_norm": 8.138605117797852, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8653242588043213, "num_tokens": 656137664.0, "step": 17197 }, { "epoch": 2.187762371199593, "ewc_loss": 0.06817036867141724, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032769975950941443, "grad_norm": 7.920473098754883, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8584166765213013, "num_tokens": 656180611.0, "step": 17198 }, { "epoch": 2.1878895814781836, "ewc_loss": 0.06868074834346771, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033280358184129, "grad_norm": 8.134812355041504, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8577715754508972, "num_tokens": 656217789.0, "step": 17199 }, { "epoch": 2.188016791756774, "ewc_loss": 0.06812804937362671, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000327276618918404, "grad_norm": 8.00136947631836, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8622135519981384, "num_tokens": 656261940.0, "step": 17200 }, { "epoch": 2.1881440020353646, "ewc_loss": 0.06852978467941284, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003312939661554992, "grad_norm": 8.05219554901123, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8757558465003967, "num_tokens": 656296000.0, "step": 17201 }, { "epoch": 2.188271212313955, "ewc_loss": 0.06837072968482971, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032970341271720827, "grad_norm": 7.998023509979248, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8763900995254517, "num_tokens": 656340862.0, "step": 17202 }, { "epoch": 2.1883984225925452, "ewc_loss": 0.06851289421319962, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003311250184196979, "grad_norm": 8.047811508178711, "learning_rate": 1e-06, "loss": 0.3812, "mean_token_accuracy": 0.8919488191604614, "num_tokens": 656376923.0, "step": 17203 }, { "epoch": 2.188525632871136, "ewc_loss": 0.06837736070156097, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003297697112429887, "grad_norm": 8.008262634277344, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8629939556121826, "num_tokens": 656414058.0, "step": 17204 }, { "epoch": 2.1886528431497263, "ewc_loss": 0.06847937405109406, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003307898005004972, "grad_norm": 8.017435073852539, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8556349873542786, "num_tokens": 656450077.0, "step": 17205 }, { "epoch": 2.188780053428317, "ewc_loss": 0.06849604099988937, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003309565072413534, "grad_norm": 8.00979995727539, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8567918539047241, "num_tokens": 656487258.0, "step": 17206 }, { "epoch": 2.1889072637069074, "ewc_loss": 0.06862668693065643, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033226291998289526, "grad_norm": 8.027119636535645, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8791761994361877, "num_tokens": 656525141.0, "step": 17207 }, { "epoch": 2.189034473985498, "ewc_loss": 0.06858488917350769, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003318449598737061, "grad_norm": 8.011595726013184, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8700872659683228, "num_tokens": 656561517.0, "step": 17208 }, { "epoch": 2.1891616842640884, "ewc_loss": 0.06858642399311066, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033186038490384817, "grad_norm": 7.989913463592529, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8720628023147583, "num_tokens": 656595859.0, "step": 17209 }, { "epoch": 2.189288894542679, "ewc_loss": 0.0687268078327179, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003332641499582678, "grad_norm": 8.056262016296387, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.862729549407959, "num_tokens": 656627652.0, "step": 17210 }, { "epoch": 2.1894161048212695, "ewc_loss": 0.06858488917350769, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000331845018081367, "grad_norm": 7.993528366088867, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8725048303604126, "num_tokens": 656669951.0, "step": 17211 }, { "epoch": 2.18954331509986, "ewc_loss": 0.0686262995004654, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003322591364849359, "grad_norm": 8.070549011230469, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8785676956176758, "num_tokens": 656702216.0, "step": 17212 }, { "epoch": 2.1896705253784505, "ewc_loss": 0.06849110871553421, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033090717624872923, "grad_norm": 7.952696800231934, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8554673194885254, "num_tokens": 656744816.0, "step": 17213 }, { "epoch": 2.189797735657041, "ewc_loss": 0.06878514587879181, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033384759444743395, "grad_norm": 8.017705917358398, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.861161470413208, "num_tokens": 656785533.0, "step": 17214 }, { "epoch": 2.1899249459356316, "ewc_loss": 0.06839131563901901, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003323506680317223, "grad_norm": 7.956796646118164, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8648940920829773, "num_tokens": 656825375.0, "step": 17215 }, { "epoch": 2.190052156214222, "ewc_loss": 0.06856119632720947, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003340494295116514, "grad_norm": 8.058159828186035, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8743470311164856, "num_tokens": 656857951.0, "step": 17216 }, { "epoch": 2.1901793664928126, "ewc_loss": 0.06837288290262222, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033216632436960936, "grad_norm": 7.939421653747559, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8762617111206055, "num_tokens": 656896911.0, "step": 17217 }, { "epoch": 2.190306576771403, "ewc_loss": 0.0688307136297226, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033430327312089503, "grad_norm": 7.994872570037842, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8737162947654724, "num_tokens": 656935723.0, "step": 17218 }, { "epoch": 2.1904337870499937, "ewc_loss": 0.06858581304550171, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003342956188134849, "grad_norm": 10.812301635742188, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8784440159797668, "num_tokens": 656970881.0, "step": 17219 }, { "epoch": 2.1905609973285842, "ewc_loss": 0.0701032429933548, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000349469919456169, "grad_norm": 7.974365234375, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8611816167831421, "num_tokens": 657007721.0, "step": 17220 }, { "epoch": 2.1906882076071748, "ewc_loss": 0.07064113020896912, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00035484880208969116, "grad_norm": 8.488473892211914, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8769574165344238, "num_tokens": 657042359.0, "step": 17221 }, { "epoch": 2.1908154178857653, "ewc_loss": 0.06824734807014465, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003309109597466886, "grad_norm": 7.898270606994629, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8725764155387878, "num_tokens": 657084662.0, "step": 17222 }, { "epoch": 2.190942628164356, "ewc_loss": 0.07091264426708221, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00035756395664066076, "grad_norm": 8.538969039916992, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8680553436279297, "num_tokens": 657122002.0, "step": 17223 }, { "epoch": 2.1910698384429463, "ewc_loss": 0.0685378909111023, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003338164242450148, "grad_norm": 7.99168062210083, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.867438793182373, "num_tokens": 657158768.0, "step": 17224 }, { "epoch": 2.191197048721537, "ewc_loss": 0.07003432512283325, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000348780769854784, "grad_norm": 8.384788513183594, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8560848832130432, "num_tokens": 657193445.0, "step": 17225 }, { "epoch": 2.1913242590001274, "ewc_loss": 0.06878453493118286, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003362828865647316, "grad_norm": 8.061573028564453, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8654950857162476, "num_tokens": 657234922.0, "step": 17226 }, { "epoch": 2.191451469278718, "ewc_loss": 0.06960606575012207, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003420567372813821, "grad_norm": 8.329331398010254, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8577545881271362, "num_tokens": 657270774.0, "step": 17227 }, { "epoch": 2.191578679557308, "ewc_loss": 0.06878482550382614, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003338443348184228, "grad_norm": 8.055792808532715, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8815404772758484, "num_tokens": 657307046.0, "step": 17228 }, { "epoch": 2.1917058898358985, "ewc_loss": 0.06920987367630005, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033809480373747647, "grad_norm": 8.166519165039062, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8563980460166931, "num_tokens": 657352865.0, "step": 17229 }, { "epoch": 2.191833100114489, "ewc_loss": 0.06850652396678925, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033350277226418257, "grad_norm": 8.043792724609375, "learning_rate": 1e-06, "loss": 0.5224, "mean_token_accuracy": 0.8461505174636841, "num_tokens": 657393372.0, "step": 17230 }, { "epoch": 2.1919603103930796, "ewc_loss": 0.06875060498714447, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003359435359016061, "grad_norm": 8.153207778930664, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8467192649841309, "num_tokens": 657432445.0, "step": 17231 }, { "epoch": 2.19208752067167, "ewc_loss": 0.06839321553707123, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003323696437291801, "grad_norm": 8.07943344116211, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8663921356201172, "num_tokens": 657469145.0, "step": 17232 }, { "epoch": 2.1922147309502606, "ewc_loss": 0.06856654584407806, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033410300966352224, "grad_norm": 8.035231590270996, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8805088996887207, "num_tokens": 657510369.0, "step": 17233 }, { "epoch": 2.192341941228851, "ewc_loss": 0.0684669241309166, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033310672733932734, "grad_norm": 8.078093528747559, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8718255758285522, "num_tokens": 657550257.0, "step": 17234 }, { "epoch": 2.1924691515074417, "ewc_loss": 0.06849081069231033, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003333456115797162, "grad_norm": 8.034978866577148, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8691206574440002, "num_tokens": 657587095.0, "step": 17235 }, { "epoch": 2.1925963617860322, "ewc_loss": 0.0685013011097908, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003334505017846823, "grad_norm": 8.06967830657959, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.871415913105011, "num_tokens": 657619710.0, "step": 17236 }, { "epoch": 2.1927235720646228, "ewc_loss": 0.06835292279720306, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003319667011965066, "grad_norm": 7.997440814971924, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8628573417663574, "num_tokens": 657653316.0, "step": 17237 }, { "epoch": 2.1928507823432133, "ewc_loss": 0.06857238709926605, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033416133373975754, "grad_norm": 8.072490692138672, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8617234826087952, "num_tokens": 657693570.0, "step": 17238 }, { "epoch": 2.192977992621804, "ewc_loss": 0.06841397285461426, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003325772122479975, "grad_norm": 7.944792747497559, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8681163787841797, "num_tokens": 657736781.0, "step": 17239 }, { "epoch": 2.1931052029003943, "ewc_loss": 0.06866702437400818, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033510776120238006, "grad_norm": 8.005252838134766, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8579925298690796, "num_tokens": 657775603.0, "step": 17240 }, { "epoch": 2.193232413178985, "ewc_loss": 0.0685533881187439, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003339714021421969, "grad_norm": 8.024975776672363, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8643966913223267, "num_tokens": 657809065.0, "step": 17241 }, { "epoch": 2.1933596234575754, "ewc_loss": 0.06855446100234985, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003339820832479745, "grad_norm": 7.972219467163086, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8647395968437195, "num_tokens": 657851369.0, "step": 17242 }, { "epoch": 2.193486833736166, "ewc_loss": 0.06866546720266342, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033509216154925525, "grad_norm": 8.005376815795898, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8896649479866028, "num_tokens": 657889008.0, "step": 17243 }, { "epoch": 2.1936140440147565, "ewc_loss": 0.0688597559928894, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003345936129335314, "grad_norm": 8.030396461486816, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8663967251777649, "num_tokens": 657925867.0, "step": 17244 }, { "epoch": 2.193741254293347, "ewc_loss": 0.0688977986574173, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003349740873090923, "grad_norm": 8.05010986328125, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.855495810508728, "num_tokens": 657959502.0, "step": 17245 }, { "epoch": 2.1938684645719375, "ewc_loss": 0.06878459453582764, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003338420356158167, "grad_norm": 7.96715784072876, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8690062761306763, "num_tokens": 657999063.0, "step": 17246 }, { "epoch": 2.193995674850528, "ewc_loss": 0.06893403828144073, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033533651730977, "grad_norm": 8.06518840789795, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8557540774345398, "num_tokens": 658033016.0, "step": 17247 }, { "epoch": 2.1941228851291186, "ewc_loss": 0.06881891191005707, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033418519888073206, "grad_norm": 8.01551342010498, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8730037808418274, "num_tokens": 658071259.0, "step": 17248 }, { "epoch": 2.194250095407709, "ewc_loss": 0.06887978315353394, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003347939345985651, "grad_norm": 8.045875549316406, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8691573143005371, "num_tokens": 658103190.0, "step": 17249 }, { "epoch": 2.1943773056862996, "ewc_loss": 0.06874948740005493, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033349095610901713, "grad_norm": 7.985128879547119, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8587784171104431, "num_tokens": 658143346.0, "step": 17250 }, { "epoch": 2.19450451596489, "ewc_loss": 0.06870830804109573, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033552056993357837, "grad_norm": 8.081076622009277, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8758611083030701, "num_tokens": 658181747.0, "step": 17251 }, { "epoch": 2.1946317262434807, "ewc_loss": 0.06864897161722183, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003324857971165329, "grad_norm": 7.972522735595703, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.86728435754776, "num_tokens": 658221770.0, "step": 17252 }, { "epoch": 2.1947589365220708, "ewc_loss": 0.06889723986387253, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003349684993736446, "grad_norm": 8.008467674255371, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8714632987976074, "num_tokens": 658260157.0, "step": 17253 }, { "epoch": 2.1948861468006613, "ewc_loss": 0.0686807632446289, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003328037273604423, "grad_norm": 7.962881565093994, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8565502166748047, "num_tokens": 658301815.0, "step": 17254 }, { "epoch": 2.195013357079252, "ewc_loss": 0.06892450898885727, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033524117316119373, "grad_norm": 8.04627799987793, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8686884641647339, "num_tokens": 658340732.0, "step": 17255 }, { "epoch": 2.1951405673578424, "ewc_loss": 0.06874767690896988, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033347285352647305, "grad_norm": 8.020007133483887, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8604251146316528, "num_tokens": 658380569.0, "step": 17256 }, { "epoch": 2.195267777636433, "ewc_loss": 0.0687660500407219, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003336565860081464, "grad_norm": 8.015473365783691, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8789782524108887, "num_tokens": 658415842.0, "step": 17257 }, { "epoch": 2.1953949879150234, "ewc_loss": 0.06883697211742401, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033436581725254655, "grad_norm": 8.016582489013672, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8716318607330322, "num_tokens": 658452861.0, "step": 17258 }, { "epoch": 2.195522198193614, "ewc_loss": 0.06879398226737976, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003339358663652092, "grad_norm": 8.114688873291016, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8692895174026489, "num_tokens": 658487019.0, "step": 17259 }, { "epoch": 2.1956494084722045, "ewc_loss": 0.06855519115924835, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033154795528389513, "grad_norm": 8.006948471069336, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8668630719184875, "num_tokens": 658529290.0, "step": 17260 }, { "epoch": 2.195776618750795, "ewc_loss": 0.06882071495056152, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003342032141517848, "grad_norm": 8.066920280456543, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.861003041267395, "num_tokens": 658572118.0, "step": 17261 }, { "epoch": 2.1959038290293855, "ewc_loss": 0.06853009760379791, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033129710936918855, "grad_norm": 8.009481430053711, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8638574481010437, "num_tokens": 658615972.0, "step": 17262 }, { "epoch": 2.196031039307976, "ewc_loss": 0.06870265305042267, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003330226172693074, "grad_norm": 7.969183444976807, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8739986419677734, "num_tokens": 658653848.0, "step": 17263 }, { "epoch": 2.1961582495865666, "ewc_loss": 0.06869280338287354, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033292410080321133, "grad_norm": 8.043205261230469, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8678115010261536, "num_tokens": 658686247.0, "step": 17264 }, { "epoch": 2.196285459865157, "ewc_loss": 0.06864500045776367, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003324461285956204, "grad_norm": 8.022523880004883, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8625417947769165, "num_tokens": 658725787.0, "step": 17265 }, { "epoch": 2.1964126701437476, "ewc_loss": 0.06871279329061508, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003331240441184491, "grad_norm": 7.970870494842529, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8621422052383423, "num_tokens": 658765957.0, "step": 17266 }, { "epoch": 2.196539880422338, "ewc_loss": 0.06868162006139755, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000332812312990427, "grad_norm": 8.045856475830078, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8775152564048767, "num_tokens": 658800996.0, "step": 17267 }, { "epoch": 2.1966670907009287, "ewc_loss": 0.06857582926750183, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033175438875332475, "grad_norm": 8.02512264251709, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8638919591903687, "num_tokens": 658833779.0, "step": 17268 }, { "epoch": 2.196794300979519, "ewc_loss": 0.06859297305345535, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033436721423640847, "grad_norm": 7.955597400665283, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8701248168945312, "num_tokens": 658874425.0, "step": 17269 }, { "epoch": 2.1969215112581097, "ewc_loss": 0.06879693269729614, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003339654649607837, "grad_norm": 8.026786804199219, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8739843964576721, "num_tokens": 658912007.0, "step": 17270 }, { "epoch": 2.1970487215367003, "ewc_loss": 0.06857157498598099, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033415324287489057, "grad_norm": 8.023907661437988, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8582257628440857, "num_tokens": 658956775.0, "step": 17271 }, { "epoch": 2.197175931815291, "ewc_loss": 0.06850741803646088, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033351167803630233, "grad_norm": 7.97628116607666, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8794130682945251, "num_tokens": 658997132.0, "step": 17272 }, { "epoch": 2.1973031420938813, "ewc_loss": 0.06889094412326813, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033490557689219713, "grad_norm": 8.036242485046387, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.87367182970047, "num_tokens": 659040437.0, "step": 17273 }, { "epoch": 2.197430352372472, "ewc_loss": 0.06852170079946518, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003336544905323535, "grad_norm": 8.006571769714355, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8827295303344727, "num_tokens": 659074461.0, "step": 17274 }, { "epoch": 2.1975575626510624, "ewc_loss": 0.06866689026355743, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033510642242617905, "grad_norm": 8.034685134887695, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.862989068031311, "num_tokens": 659113818.0, "step": 17275 }, { "epoch": 2.197684772929653, "ewc_loss": 0.06854279339313507, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003338654641993344, "grad_norm": 8.091761589050293, "learning_rate": 1e-06, "loss": 0.5044, "mean_token_accuracy": 0.8524671792984009, "num_tokens": 659149297.0, "step": 17276 }, { "epoch": 2.1978119832082434, "ewc_loss": 0.06847396492958069, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003331771295052022, "grad_norm": 7.970300197601318, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8689275979995728, "num_tokens": 659187500.0, "step": 17277 }, { "epoch": 2.1979391934868335, "ewc_loss": 0.0687120333313942, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033555785194039345, "grad_norm": 8.073556900024414, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8402298092842102, "num_tokens": 659228624.0, "step": 17278 }, { "epoch": 2.198066403765424, "ewc_loss": 0.06847664713859558, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003332040214445442, "grad_norm": 7.978135585784912, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8736715316772461, "num_tokens": 659270317.0, "step": 17279 }, { "epoch": 2.1981936140440146, "ewc_loss": 0.0686849057674408, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033528657513670623, "grad_norm": 8.118165969848633, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.865110456943512, "num_tokens": 659302133.0, "step": 17280 }, { "epoch": 2.198320824322605, "ewc_loss": 0.06869885325431824, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003329846658743918, "grad_norm": 8.036775588989258, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8734496235847473, "num_tokens": 659342469.0, "step": 17281 }, { "epoch": 2.1984480346011956, "ewc_loss": 0.06893571466207504, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033535322290845215, "grad_norm": 8.153661727905273, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8784916400909424, "num_tokens": 659373786.0, "step": 17282 }, { "epoch": 2.198575244879786, "ewc_loss": 0.06850813329219818, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033107740455307066, "grad_norm": 7.965534210205078, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8590328097343445, "num_tokens": 659412498.0, "step": 17283 }, { "epoch": 2.1987024551583767, "ewc_loss": 0.06872561573982239, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003356936213094741, "grad_norm": 8.156791687011719, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8504965305328369, "num_tokens": 659446415.0, "step": 17284 }, { "epoch": 2.1988296654369672, "ewc_loss": 0.06851323693990707, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033112848177552223, "grad_norm": 8.00772476196289, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8584758639335632, "num_tokens": 659488824.0, "step": 17285 }, { "epoch": 2.1989568757155578, "ewc_loss": 0.06882987916469574, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003342948330100626, "grad_norm": 8.08070182800293, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8704203963279724, "num_tokens": 659523036.0, "step": 17286 }, { "epoch": 2.1990840859941483, "ewc_loss": 0.06857579946517944, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003317540686111897, "grad_norm": 8.001910209655762, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8611927628517151, "num_tokens": 659562780.0, "step": 17287 }, { "epoch": 2.199211296272739, "ewc_loss": 0.06883125007152557, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003343085409142077, "grad_norm": 8.103436470031738, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8728163242340088, "num_tokens": 659599364.0, "step": 17288 }, { "epoch": 2.1993385065513293, "ewc_loss": 0.06851988285779953, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033119492582045496, "grad_norm": 7.966740131378174, "learning_rate": 1e-06, "loss": 0.47, "mean_token_accuracy": 0.8609632849693298, "num_tokens": 659636275.0, "step": 17289 }, { "epoch": 2.19946571682992, "ewc_loss": 0.0688314139842987, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033431025804020464, "grad_norm": 8.132648468017578, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8662546277046204, "num_tokens": 659670315.0, "step": 17290 }, { "epoch": 2.1995929271085104, "ewc_loss": 0.06849642097949982, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003309603489469737, "grad_norm": 8.004227638244629, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8626554012298584, "num_tokens": 659710968.0, "step": 17291 }, { "epoch": 2.199720137387101, "ewc_loss": 0.06879434734582901, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003339395625516772, "grad_norm": 8.081365585327148, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8755947351455688, "num_tokens": 659751334.0, "step": 17292 }, { "epoch": 2.1998473476656915, "ewc_loss": 0.06885606050491333, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033211533445864916, "grad_norm": 8.093155860900879, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8681498765945435, "num_tokens": 659793867.0, "step": 17293 }, { "epoch": 2.199974557944282, "ewc_loss": 0.06847366690635681, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000330732756992802, "grad_norm": 8.024020195007324, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8685363531112671, "num_tokens": 659838724.0, "step": 17294 }, { "epoch": 2.2001017682228725, "ewc_loss": 0.06863904744386673, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003323865821585059, "grad_norm": 8.062679290771484, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8641934394836426, "num_tokens": 659876589.0, "step": 17295 }, { "epoch": 2.200228978501463, "ewc_loss": 0.06882951408624649, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003318498202133924, "grad_norm": 8.13815975189209, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8580958843231201, "num_tokens": 659914864.0, "step": 17296 }, { "epoch": 2.2003561887800536, "ewc_loss": 0.06844344735145569, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003304306010250002, "grad_norm": 7.976140022277832, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8749401569366455, "num_tokens": 659952482.0, "step": 17297 }, { "epoch": 2.200483399058644, "ewc_loss": 0.06899929791688919, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033354765037074685, "grad_norm": 8.168868064880371, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8664531111717224, "num_tokens": 659985192.0, "step": 17298 }, { "epoch": 2.2006106093372346, "ewc_loss": 0.0682702511548996, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032869863207452, "grad_norm": 7.950855255126953, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8754720091819763, "num_tokens": 660019505.0, "step": 17299 }, { "epoch": 2.200737819615825, "ewc_loss": 0.06853676587343216, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033380516106262803, "grad_norm": 8.071737289428711, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8598024845123291, "num_tokens": 660054942.0, "step": 17300 }, { "epoch": 2.2008650298944152, "ewc_loss": 0.06850077211856842, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033100383006967604, "grad_norm": 8.007132530212402, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8648123741149902, "num_tokens": 660092836.0, "step": 17301 }, { "epoch": 2.200992240173006, "ewc_loss": 0.06854312866926193, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003314273781143129, "grad_norm": 8.005378723144531, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8618729114532471, "num_tokens": 660129419.0, "step": 17302 }, { "epoch": 2.2011194504515963, "ewc_loss": 0.0686882808804512, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000332878902554512, "grad_norm": 8.043367385864258, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8706385493278503, "num_tokens": 660168116.0, "step": 17303 }, { "epoch": 2.201246660730187, "ewc_loss": 0.06862863153219223, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003322823904454708, "grad_norm": 8.045710563659668, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.860478937625885, "num_tokens": 660208006.0, "step": 17304 }, { "epoch": 2.2013738710087773, "ewc_loss": 0.0686136856675148, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003321329422760755, "grad_norm": 8.003527641296387, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8668348789215088, "num_tokens": 660254046.0, "step": 17305 }, { "epoch": 2.201501081287368, "ewc_loss": 0.06871122121810913, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033310832805000246, "grad_norm": 8.054849624633789, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.864305317401886, "num_tokens": 660289321.0, "step": 17306 }, { "epoch": 2.2016282915659584, "ewc_loss": 0.06855320930480957, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033152822288684547, "grad_norm": 10.82823657989502, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8640370965003967, "num_tokens": 660325052.0, "step": 17307 }, { "epoch": 2.201755501844549, "ewc_loss": 0.07018870860338211, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003478831786196679, "grad_norm": 8.056923866271973, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8566317558288574, "num_tokens": 660357942.0, "step": 17308 }, { "epoch": 2.2018827121231395, "ewc_loss": 0.07032731920480728, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003492692776490003, "grad_norm": 8.46519660949707, "learning_rate": 1e-06, "loss": 0.5421, "mean_token_accuracy": 0.8415340185165405, "num_tokens": 660390654.0, "step": 17309 }, { "epoch": 2.20200992240173, "ewc_loss": 0.06836998462677002, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00032969596213661134, "grad_norm": 7.932019233703613, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8668050765991211, "num_tokens": 660432020.0, "step": 17310 }, { "epoch": 2.2021371326803205, "ewc_loss": 0.07024861872196198, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003509236848913133, "grad_norm": 8.413031578063965, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.866119384765625, "num_tokens": 660465051.0, "step": 17311 }, { "epoch": 2.202264342958911, "ewc_loss": 0.06854505836963654, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003314467321615666, "grad_norm": 7.980980396270752, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8616902828216553, "num_tokens": 660504967.0, "step": 17312 }, { "epoch": 2.2023915532375016, "ewc_loss": 0.06968550384044647, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003452925884630531, "grad_norm": 8.311029434204102, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8563326597213745, "num_tokens": 660543826.0, "step": 17313 }, { "epoch": 2.202518763516092, "ewc_loss": 0.06878145039081573, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003338105452712625, "grad_norm": 8.062461853027344, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8614509105682373, "num_tokens": 660583074.0, "step": 17314 }, { "epoch": 2.2026459737946826, "ewc_loss": 0.0694657415151596, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00034065349609591067, "grad_norm": 8.181953430175781, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8839580416679382, "num_tokens": 660619974.0, "step": 17315 }, { "epoch": 2.202773184073273, "ewc_loss": 0.06890283524990082, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033502443693578243, "grad_norm": 8.06967830657959, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8717162609100342, "num_tokens": 660654005.0, "step": 17316 }, { "epoch": 2.2029003943518637, "ewc_loss": 0.0688081607222557, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003365191223565489, "grad_norm": 8.163969993591309, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8553422689437866, "num_tokens": 660689179.0, "step": 17317 }, { "epoch": 2.203027604630454, "ewc_loss": 0.06855621933937073, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033399969106540084, "grad_norm": 8.06255054473877, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8544846177101135, "num_tokens": 660724923.0, "step": 17318 }, { "epoch": 2.2031548149090447, "ewc_loss": 0.06984728574752808, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003371447091922164, "grad_norm": 36.1822395324707, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.871030867099762, "num_tokens": 660760763.0, "step": 17319 }, { "epoch": 2.2032820251876353, "ewc_loss": 0.10522280633449554, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0006957827717997134, "grad_norm": 12.456694602966309, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8670998811721802, "num_tokens": 660795520.0, "step": 17320 }, { "epoch": 2.203409235466226, "ewc_loss": 0.0668349415063858, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003119040629826486, "grad_norm": 6.7476630210876465, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8751778602600098, "num_tokens": 660834858.0, "step": 17321 }, { "epoch": 2.2035364457448163, "ewc_loss": 0.090845987200737, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0005520145641639829, "grad_norm": 11.6655855178833, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.874667763710022, "num_tokens": 660870319.0, "step": 17322 }, { "epoch": 2.203663656023407, "ewc_loss": 0.09484151005744934, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0005919697578065097, "grad_norm": 11.269628524780273, "learning_rate": 1e-06, "loss": 0.524, "mean_token_accuracy": 0.8529574275016785, "num_tokens": 660907313.0, "step": 17323 }, { "epoch": 2.2037908663019974, "ewc_loss": 0.07585329562425613, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00040208763675764203, "grad_norm": 8.185162544250488, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8810117244720459, "num_tokens": 660948087.0, "step": 17324 }, { "epoch": 2.203918076580588, "ewc_loss": 0.07991495728492737, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00044270421494729817, "grad_norm": 10.0884428024292, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8712230920791626, "num_tokens": 660984984.0, "step": 17325 }, { "epoch": 2.204045286859178, "ewc_loss": 0.08154025673866272, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0004638400278054178, "grad_norm": 9.43764877319336, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8625822067260742, "num_tokens": 661022898.0, "step": 17326 }, { "epoch": 2.2041724971377685, "ewc_loss": 0.07409356534481049, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003893731627613306, "grad_norm": 8.682991027832031, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8747619390487671, "num_tokens": 661058645.0, "step": 17327 }, { "epoch": 2.204299707416359, "ewc_loss": 0.07528766989707947, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0004013142315670848, "grad_norm": 9.243523597717285, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8894875049591064, "num_tokens": 661096954.0, "step": 17328 }, { "epoch": 2.2044269176949496, "ewc_loss": 0.07405674457550049, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00038900497020222247, "grad_norm": 8.552369117736816, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.85472571849823, "num_tokens": 661136947.0, "step": 17329 }, { "epoch": 2.20455412797354, "ewc_loss": 0.07258632779121399, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000374300783732906, "grad_norm": 8.685272216796875, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8668885231018066, "num_tokens": 661175376.0, "step": 17330 }, { "epoch": 2.2046813382521306, "ewc_loss": 0.07221848517656326, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003706223506014794, "grad_norm": 8.554065704345703, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8593682646751404, "num_tokens": 661213238.0, "step": 17331 }, { "epoch": 2.204808548530721, "ewc_loss": 0.07164613902568817, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00036489890771918, "grad_norm": 8.461774826049805, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8832748532295227, "num_tokens": 661246512.0, "step": 17332 }, { "epoch": 2.2049357588093117, "ewc_loss": 0.07070311903953552, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00035546868457458913, "grad_norm": 8.348058700561523, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8778426051139832, "num_tokens": 661288272.0, "step": 17333 }, { "epoch": 2.2050629690879022, "ewc_loss": 0.07071597874164581, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003555973235052079, "grad_norm": 8.405601501464844, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.872774064540863, "num_tokens": 661328705.0, "step": 17334 }, { "epoch": 2.2051901793664928, "ewc_loss": 0.07003199309110641, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003487574285827577, "grad_norm": 8.25867748260498, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8701273202896118, "num_tokens": 661369877.0, "step": 17335 }, { "epoch": 2.2053173896450833, "ewc_loss": 0.06999312341213226, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003483687178231776, "grad_norm": 8.327622413635254, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8700140714645386, "num_tokens": 661405209.0, "step": 17336 }, { "epoch": 2.205444599923674, "ewc_loss": 0.06949882954359055, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003434258105698973, "grad_norm": 8.208524703979492, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8712350130081177, "num_tokens": 661441083.0, "step": 17337 }, { "epoch": 2.2055718102022643, "ewc_loss": 0.06941525638103485, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00034259003587067127, "grad_norm": 8.166820526123047, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8682941794395447, "num_tokens": 661476938.0, "step": 17338 }, { "epoch": 2.205699020480855, "ewc_loss": 0.06925615668296814, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000340999016771093, "grad_norm": 8.198246955871582, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8584948778152466, "num_tokens": 661519281.0, "step": 17339 }, { "epoch": 2.2058262307594454, "ewc_loss": 0.06916186958551407, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003400561981834471, "grad_norm": 8.124667167663574, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8824453949928284, "num_tokens": 661555880.0, "step": 17340 }, { "epoch": 2.205953441038036, "ewc_loss": 0.06901298463344574, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003385673917364329, "grad_norm": 8.140825271606445, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.859626054763794, "num_tokens": 661595937.0, "step": 17341 }, { "epoch": 2.2060806513166265, "ewc_loss": 0.06893806159496307, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033781814272515476, "grad_norm": 8.10989761352539, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8525503277778625, "num_tokens": 661636868.0, "step": 17342 }, { "epoch": 2.206207861595217, "ewc_loss": 0.06890329718589783, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003374704683665186, "grad_norm": 8.101089477539062, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8594400882720947, "num_tokens": 661681503.0, "step": 17343 }, { "epoch": 2.2063350718738075, "ewc_loss": 0.06886155158281326, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033705300302244723, "grad_norm": 8.11496639251709, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8699938058853149, "num_tokens": 661718438.0, "step": 17344 }, { "epoch": 2.206462282152398, "ewc_loss": 0.06878890097141266, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033632651320658624, "grad_norm": 8.15265941619873, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8475908041000366, "num_tokens": 661759048.0, "step": 17345 }, { "epoch": 2.2065894924309886, "ewc_loss": 0.06877166032791138, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000336154131218791, "grad_norm": 8.054198265075684, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8774917721748352, "num_tokens": 661803131.0, "step": 17346 }, { "epoch": 2.206716702709579, "ewc_loss": 0.06866106390953064, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033504815655760467, "grad_norm": 8.084718704223633, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8676965236663818, "num_tokens": 661838139.0, "step": 17347 }, { "epoch": 2.2068439129881696, "ewc_loss": 0.06865039467811584, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003349414619151503, "grad_norm": 8.067262649536133, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.859474778175354, "num_tokens": 661878568.0, "step": 17348 }, { "epoch": 2.20697112326676, "ewc_loss": 0.06880317628383636, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000336469296598807, "grad_norm": 8.100472450256348, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8604145050048828, "num_tokens": 661909110.0, "step": 17349 }, { "epoch": 2.2070983335453507, "ewc_loss": 0.06870034337043762, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003354409709572792, "grad_norm": 8.042773246765137, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8805525898933411, "num_tokens": 661944472.0, "step": 17350 }, { "epoch": 2.2072255438239408, "ewc_loss": 0.06887571513652802, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003371946804691106, "grad_norm": 8.09897518157959, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8793383836746216, "num_tokens": 661985865.0, "step": 17351 }, { "epoch": 2.2073527541025313, "ewc_loss": 0.06870321184396744, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033546960912644863, "grad_norm": 8.060422897338867, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8614728450775146, "num_tokens": 662019460.0, "step": 17352 }, { "epoch": 2.207479964381122, "ewc_loss": 0.06873936951160431, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033583116601221263, "grad_norm": 8.060587882995605, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.87380051612854, "num_tokens": 662059109.0, "step": 17353 }, { "epoch": 2.2076071746597123, "ewc_loss": 0.06880433857440948, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003364808508194983, "grad_norm": 8.041996955871582, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8597785234451294, "num_tokens": 662096040.0, "step": 17354 }, { "epoch": 2.207734384938303, "ewc_loss": 0.06872393935918808, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003356768866069615, "grad_norm": 7.983397483825684, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8723001480102539, "num_tokens": 662139351.0, "step": 17355 }, { "epoch": 2.2078615952168934, "ewc_loss": 0.06893876194953918, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003378250985406339, "grad_norm": 8.098440170288086, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8684521913528442, "num_tokens": 662177353.0, "step": 17356 }, { "epoch": 2.207988805495484, "ewc_loss": 0.06877368688583374, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033617435838095844, "grad_norm": 8.059724807739258, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8568209409713745, "num_tokens": 662209519.0, "step": 17357 }, { "epoch": 2.2081160157740745, "ewc_loss": 0.06898434460163116, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003382809809409082, "grad_norm": 8.1037015914917, "learning_rate": 1e-06, "loss": 0.5206, "mean_token_accuracy": 0.8471523523330688, "num_tokens": 662251311.0, "step": 17358 }, { "epoch": 2.208243226052665, "ewc_loss": 0.06872130930423737, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033565060584805906, "grad_norm": 8.018204689025879, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.874809205532074, "num_tokens": 662282770.0, "step": 17359 }, { "epoch": 2.2083704363312555, "ewc_loss": 0.06908044219017029, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003392418730072677, "grad_norm": 8.073464393615723, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8626067638397217, "num_tokens": 662326770.0, "step": 17360 }, { "epoch": 2.208497646609846, "ewc_loss": 0.06875258684158325, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033596332650631666, "grad_norm": 8.042243003845215, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8659507036209106, "num_tokens": 662366811.0, "step": 17361 }, { "epoch": 2.2086248568884366, "ewc_loss": 0.06896812468767166, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033811875618994236, "grad_norm": 8.044747352600098, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.863747775554657, "num_tokens": 662404517.0, "step": 17362 }, { "epoch": 2.208752067167027, "ewc_loss": 0.06886462867259979, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033708379487507045, "grad_norm": 8.034585952758789, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8724352121353149, "num_tokens": 662442091.0, "step": 17363 }, { "epoch": 2.2088792774456176, "ewc_loss": 0.06887436658143997, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003371811762917787, "grad_norm": 8.04693603515625, "learning_rate": 1e-06, "loss": 0.5337, "mean_token_accuracy": 0.841688871383667, "num_tokens": 662483343.0, "step": 17364 }, { "epoch": 2.209006487724208, "ewc_loss": 0.06893178075551987, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033775530755519867, "grad_norm": 8.04375171661377, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8602848052978516, "num_tokens": 662521994.0, "step": 17365 }, { "epoch": 2.2091336980027987, "ewc_loss": 0.06939958035945892, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003375504456926137, "grad_norm": 8.060946464538574, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8627451062202454, "num_tokens": 662562737.0, "step": 17366 }, { "epoch": 2.209260908281389, "ewc_loss": 0.06887847185134888, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003372221835888922, "grad_norm": 8.074936866760254, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8622449040412903, "num_tokens": 662596927.0, "step": 17367 }, { "epoch": 2.2093881185599797, "ewc_loss": 0.06943194568157196, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003378741967026144, "grad_norm": 8.049548149108887, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8756890892982483, "num_tokens": 662632831.0, "step": 17368 }, { "epoch": 2.2095153288385703, "ewc_loss": 0.06943245232105255, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000337879202561453, "grad_norm": 8.134721755981445, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8631248474121094, "num_tokens": 662675554.0, "step": 17369 }, { "epoch": 2.209642539117161, "ewc_loss": 0.06886592507362366, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033709677518345416, "grad_norm": 7.994699001312256, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8793531656265259, "num_tokens": 662720248.0, "step": 17370 }, { "epoch": 2.2097697493957513, "ewc_loss": 0.06905308365821838, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003389683261048049, "grad_norm": 8.096778869628906, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8656922578811646, "num_tokens": 662761388.0, "step": 17371 }, { "epoch": 2.209896959674342, "ewc_loss": 0.06878568977117538, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033629441168159246, "grad_norm": 8.090843200683594, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8658246994018555, "num_tokens": 662798007.0, "step": 17372 }, { "epoch": 2.2100241699529324, "ewc_loss": 0.06907356530427933, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033917315886355937, "grad_norm": 8.301393508911133, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8635687828063965, "num_tokens": 662836042.0, "step": 17373 }, { "epoch": 2.210151380231523, "ewc_loss": 0.06858445703983307, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000334282114636153, "grad_norm": 8.405871391296387, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8571274280548096, "num_tokens": 662869465.0, "step": 17374 }, { "epoch": 2.2102785905101134, "ewc_loss": 0.06863804906606674, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003348180034663528, "grad_norm": 8.32697868347168, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8576212525367737, "num_tokens": 662909440.0, "step": 17375 }, { "epoch": 2.2104058007887035, "ewc_loss": 0.06825326383113861, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033097018604166806, "grad_norm": 7.96879768371582, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8631123304367065, "num_tokens": 662949637.0, "step": 17376 }, { "epoch": 2.210533011067294, "ewc_loss": 0.06870070099830627, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003354445507284254, "grad_norm": 8.184427261352539, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.86275315284729, "num_tokens": 662987679.0, "step": 17377 }, { "epoch": 2.2106602213458846, "ewc_loss": 0.06827445328235626, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003311820328235626, "grad_norm": 8.128043174743652, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8468325734138489, "num_tokens": 663027651.0, "step": 17378 }, { "epoch": 2.210787431624475, "ewc_loss": 0.06847319006919861, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033316941699013114, "grad_norm": 7.99819278717041, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8805006742477417, "num_tokens": 663063708.0, "step": 17379 }, { "epoch": 2.2109146419030656, "ewc_loss": 0.06862232089042664, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033466072636656463, "grad_norm": 8.133267402648926, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8584181070327759, "num_tokens": 663097255.0, "step": 17380 }, { "epoch": 2.211041852181656, "ewc_loss": 0.06844392418861389, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003328766906633973, "grad_norm": 8.010250091552734, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8617391586303711, "num_tokens": 663134496.0, "step": 17381 }, { "epoch": 2.2111690624602467, "ewc_loss": 0.06884649395942688, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003369024198036641, "grad_norm": 8.066170692443848, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8749788403511047, "num_tokens": 663170278.0, "step": 17382 }, { "epoch": 2.211296272738837, "ewc_loss": 0.0687006264925003, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003354437940288335, "grad_norm": 8.07967758178711, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8642849922180176, "num_tokens": 663205407.0, "step": 17383 }, { "epoch": 2.2114234830174277, "ewc_loss": 0.06882108747959137, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003366483433637768, "grad_norm": 8.039552688598633, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8701010942459106, "num_tokens": 663243166.0, "step": 17384 }, { "epoch": 2.2115506932960183, "ewc_loss": 0.06883363425731659, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003367738681845367, "grad_norm": 8.041160583496094, "learning_rate": 1e-06, "loss": 0.3899, "mean_token_accuracy": 0.8883557319641113, "num_tokens": 663282267.0, "step": 17385 }, { "epoch": 2.211677903574609, "ewc_loss": 0.06937505304813385, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003373051877133548, "grad_norm": 8.03828239440918, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8577251434326172, "num_tokens": 663318620.0, "step": 17386 }, { "epoch": 2.2118051138531993, "ewc_loss": 0.06930264830589294, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003365811426192522, "grad_norm": 8.0861234664917, "learning_rate": 1e-06, "loss": 0.4019, "mean_token_accuracy": 0.8833335041999817, "num_tokens": 663353256.0, "step": 17387 }, { "epoch": 2.21193232413179, "ewc_loss": 0.06923261284828186, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003358808462508023, "grad_norm": 8.034806251525879, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8692534565925598, "num_tokens": 663386691.0, "step": 17388 }, { "epoch": 2.2120595344103804, "ewc_loss": 0.06939278542995453, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033748254645615816, "grad_norm": 8.114447593688965, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8624791502952576, "num_tokens": 663416550.0, "step": 17389 }, { "epoch": 2.212186744688971, "ewc_loss": 0.0692119151353836, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033567388891242445, "grad_norm": 8.023255348205566, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8679134845733643, "num_tokens": 663464912.0, "step": 17390 }, { "epoch": 2.2123139549675614, "ewc_loss": 0.06904913485050201, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033892886131070554, "grad_norm": 8.141172409057617, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8574706315994263, "num_tokens": 663503786.0, "step": 17391 }, { "epoch": 2.212441165246152, "ewc_loss": 0.06867408752441406, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000335178425302729, "grad_norm": 8.027009963989258, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.873877227306366, "num_tokens": 663540579.0, "step": 17392 }, { "epoch": 2.2125683755247425, "ewc_loss": 0.06948830187320709, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003384377050679177, "grad_norm": 8.107851028442383, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8680093288421631, "num_tokens": 663577284.0, "step": 17393 }, { "epoch": 2.212695585803333, "ewc_loss": 0.0686875730752945, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033531326334923506, "grad_norm": 8.045344352722168, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8606173396110535, "num_tokens": 663613118.0, "step": 17394 }, { "epoch": 2.2128227960819236, "ewc_loss": 0.06888395547866821, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033727704430930316, "grad_norm": 8.10852336883545, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8719035387039185, "num_tokens": 663651548.0, "step": 17395 }, { "epoch": 2.212950006360514, "ewc_loss": 0.06872186064720154, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033565613557584584, "grad_norm": 8.044599533081055, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8479412794113159, "num_tokens": 663694381.0, "step": 17396 }, { "epoch": 2.2130772166391046, "ewc_loss": 0.06939926743507385, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033754738979041576, "grad_norm": 8.080554008483887, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8695200681686401, "num_tokens": 663734686.0, "step": 17397 }, { "epoch": 2.213204426917695, "ewc_loss": 0.06906232237815857, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000336619297740981, "grad_norm": 8.030752182006836, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8581204414367676, "num_tokens": 663768764.0, "step": 17398 }, { "epoch": 2.2133316371962852, "ewc_loss": 0.06907700002193451, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033676609746180475, "grad_norm": 8.134106636047363, "learning_rate": 1e-06, "loss": 0.3967, "mean_token_accuracy": 0.8861459493637085, "num_tokens": 663807650.0, "step": 17399 }, { "epoch": 2.213458847474876, "ewc_loss": 0.06872040033340454, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033564152545295656, "grad_norm": 7.987029075622559, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8646016716957092, "num_tokens": 663849308.0, "step": 17400 }, { "epoch": 2.2135860577534663, "ewc_loss": 0.06900385022163391, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033847600570879877, "grad_norm": 8.165191650390625, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8517529368400574, "num_tokens": 663888917.0, "step": 17401 }, { "epoch": 2.213713268032057, "ewc_loss": 0.06875083595514297, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033350446028634906, "grad_norm": 8.034490585327148, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8665955066680908, "num_tokens": 663921048.0, "step": 17402 }, { "epoch": 2.2138404783106473, "ewc_loss": 0.06905046850442886, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003389421908650547, "grad_norm": 8.139261245727539, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8705251812934875, "num_tokens": 663963433.0, "step": 17403 }, { "epoch": 2.213967688589238, "ewc_loss": 0.06871834397315979, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033317948691546917, "grad_norm": 8.03764820098877, "learning_rate": 1e-06, "loss": 0.52, "mean_token_accuracy": 0.8483593463897705, "num_tokens": 664003294.0, "step": 17404 }, { "epoch": 2.2140948988678284, "ewc_loss": 0.06891484558582306, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003375859232619405, "grad_norm": 8.097627639770508, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8542261123657227, "num_tokens": 664043161.0, "step": 17405 }, { "epoch": 2.214222109146419, "ewc_loss": 0.06913115829229355, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003348662576172501, "grad_norm": 8.04017448425293, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8769755959510803, "num_tokens": 664082457.0, "step": 17406 }, { "epoch": 2.2143493194250095, "ewc_loss": 0.06929364800453186, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033649118267931044, "grad_norm": 8.041385650634766, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8769471645355225, "num_tokens": 664125999.0, "step": 17407 }, { "epoch": 2.2144765297036, "ewc_loss": 0.06928360462188721, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003363907162565738, "grad_norm": 8.104113578796387, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8725246787071228, "num_tokens": 664167256.0, "step": 17408 }, { "epoch": 2.2146037399821905, "ewc_loss": 0.06866244971752167, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033506195177324116, "grad_norm": 8.03268051147461, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8636705875396729, "num_tokens": 664201887.0, "step": 17409 }, { "epoch": 2.214730950260781, "ewc_loss": 0.06883090734481812, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003367465396877378, "grad_norm": 8.109064102172852, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8713075518608093, "num_tokens": 664238986.0, "step": 17410 }, { "epoch": 2.2148581605393716, "ewc_loss": 0.06853725016117096, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033381005050614476, "grad_norm": 7.9957427978515625, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8626375198364258, "num_tokens": 664283436.0, "step": 17411 }, { "epoch": 2.214985370817962, "ewc_loss": 0.06919486820697784, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033794474438764155, "grad_norm": 8.05257797241211, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.859886109828949, "num_tokens": 664325196.0, "step": 17412 }, { "epoch": 2.2151125810965526, "ewc_loss": 0.06892023980617523, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033519850694574416, "grad_norm": 8.056685447692871, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.857890784740448, "num_tokens": 664365722.0, "step": 17413 }, { "epoch": 2.215239791375143, "ewc_loss": 0.06928236782550812, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033637831802479923, "grad_norm": 8.043051719665527, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.856119692325592, "num_tokens": 664411570.0, "step": 17414 }, { "epoch": 2.2153670016537337, "ewc_loss": 0.0693315789103508, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003368704637978226, "grad_norm": 8.091026306152344, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8686800599098206, "num_tokens": 664453545.0, "step": 17415 }, { "epoch": 2.215494211932324, "ewc_loss": 0.06924678385257721, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033602252369746566, "grad_norm": 8.05940055847168, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8655927777290344, "num_tokens": 664489882.0, "step": 17416 }, { "epoch": 2.2156214222109147, "ewc_loss": 0.06934529542922974, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003370076010469347, "grad_norm": 8.090498924255371, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8646382093429565, "num_tokens": 664528720.0, "step": 17417 }, { "epoch": 2.2157486324895053, "ewc_loss": 0.06930413097143173, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033659598557278514, "grad_norm": 8.13121223449707, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8570913076400757, "num_tokens": 664569645.0, "step": 17418 }, { "epoch": 2.215875842768096, "ewc_loss": 0.06919744610786438, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003355290973559022, "grad_norm": 8.095715522766113, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8616492748260498, "num_tokens": 664612033.0, "step": 17419 }, { "epoch": 2.2160030530466863, "ewc_loss": 0.06868596374988556, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003352971689309925, "grad_norm": 8.048087120056152, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8725440502166748, "num_tokens": 664651408.0, "step": 17420 }, { "epoch": 2.216130263325277, "ewc_loss": 0.06867310404777527, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033516850089654326, "grad_norm": 8.105071067810059, "learning_rate": 1e-06, "loss": 0.5147, "mean_token_accuracy": 0.8518611192703247, "num_tokens": 664695764.0, "step": 17421 }, { "epoch": 2.2162574736038674, "ewc_loss": 0.06873001158237457, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003357375680934638, "grad_norm": 8.090682029724121, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8682715892791748, "num_tokens": 664735528.0, "step": 17422 }, { "epoch": 2.216384683882458, "ewc_loss": 0.06874667853116989, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003359043039381504, "grad_norm": 8.12545394897461, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8576579093933105, "num_tokens": 664775612.0, "step": 17423 }, { "epoch": 2.216511894161048, "ewc_loss": 0.06863237917423248, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033476128010079265, "grad_norm": 8.050030708312988, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8625946044921875, "num_tokens": 664812485.0, "step": 17424 }, { "epoch": 2.2166391044396385, "ewc_loss": 0.06884987652301788, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003369362384546548, "grad_norm": 8.138758659362793, "learning_rate": 1e-06, "loss": 0.5081, "mean_token_accuracy": 0.8508450388908386, "num_tokens": 664853166.0, "step": 17425 }, { "epoch": 2.216766314718229, "ewc_loss": 0.06860001385211945, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003344376164022833, "grad_norm": 8.089288711547852, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8548949956893921, "num_tokens": 664888778.0, "step": 17426 }, { "epoch": 2.2168935249968196, "ewc_loss": 0.06885992735624313, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033703676308505237, "grad_norm": 8.106683731079102, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8640013933181763, "num_tokens": 664933783.0, "step": 17427 }, { "epoch": 2.21702073527541, "ewc_loss": 0.06867675483226776, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033520502620376647, "grad_norm": 8.164368629455566, "learning_rate": 1e-06, "loss": 0.5325, "mean_token_accuracy": 0.847508430480957, "num_tokens": 664965228.0, "step": 17428 }, { "epoch": 2.2171479455540006, "ewc_loss": 0.06918640434741974, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033541873563081026, "grad_norm": 8.143351554870605, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8641963601112366, "num_tokens": 664996691.0, "step": 17429 }, { "epoch": 2.217275155832591, "ewc_loss": 0.0686054527759552, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033449206966906786, "grad_norm": 8.06395435333252, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8767650127410889, "num_tokens": 665032006.0, "step": 17430 }, { "epoch": 2.2174023661111817, "ewc_loss": 0.06872359663248062, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003356734523549676, "grad_norm": 8.067830085754395, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8547255992889404, "num_tokens": 665071819.0, "step": 17431 }, { "epoch": 2.217529576389772, "ewc_loss": 0.06920646876096725, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000335619377437979, "grad_norm": 8.137930870056152, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.852868378162384, "num_tokens": 665118883.0, "step": 17432 }, { "epoch": 2.2176567866683627, "ewc_loss": 0.06902167946100235, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003337714879307896, "grad_norm": 8.026363372802734, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8475840091705322, "num_tokens": 665154926.0, "step": 17433 }, { "epoch": 2.2177839969469533, "ewc_loss": 0.06931702792644501, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003367249737493694, "grad_norm": 8.08604907989502, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.873053252696991, "num_tokens": 665189910.0, "step": 17434 }, { "epoch": 2.217911207225544, "ewc_loss": 0.06923538446426392, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003359084948897362, "grad_norm": 8.050084114074707, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8723167181015015, "num_tokens": 665235849.0, "step": 17435 }, { "epoch": 2.2180384175041343, "ewc_loss": 0.06907755136489868, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.000336771656293422, "grad_norm": 8.08059024810791, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8568682670593262, "num_tokens": 665274231.0, "step": 17436 }, { "epoch": 2.218165627782725, "ewc_loss": 0.06922054290771484, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003357601526658982, "grad_norm": 8.034852027893066, "learning_rate": 1e-06, "loss": 0.403, "mean_token_accuracy": 0.883367657661438, "num_tokens": 665309057.0, "step": 17437 }, { "epoch": 2.2182928380613154, "ewc_loss": 0.06936682015657425, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003372228820808232, "grad_norm": 8.17830753326416, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8615805506706238, "num_tokens": 665337031.0, "step": 17438 }, { "epoch": 2.218420048339906, "ewc_loss": 0.06891649216413498, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003351610212121159, "grad_norm": 8.027266502380371, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8654245138168335, "num_tokens": 665376557.0, "step": 17439 }, { "epoch": 2.2185472586184964, "ewc_loss": 0.06946719437837601, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033822664408944547, "grad_norm": 8.078666687011719, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.863389253616333, "num_tokens": 665419928.0, "step": 17440 }, { "epoch": 2.218674468897087, "ewc_loss": 0.06919436156749725, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003354982763994485, "grad_norm": 8.072759628295898, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8718733787536621, "num_tokens": 665458827.0, "step": 17441 }, { "epoch": 2.2188016791756775, "ewc_loss": 0.06942328810691833, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003377875254955143, "grad_norm": 8.124189376831055, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8672807812690735, "num_tokens": 665498623.0, "step": 17442 }, { "epoch": 2.218928889454268, "ewc_loss": 0.06923995912075043, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033595424611121416, "grad_norm": 8.115764617919922, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.859486997127533, "num_tokens": 665536103.0, "step": 17443 }, { "epoch": 2.2190560997328586, "ewc_loss": 0.06927383691072464, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033629307290539145, "grad_norm": 8.08553695678711, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.862934410572052, "num_tokens": 665572089.0, "step": 17444 }, { "epoch": 2.219183310011449, "ewc_loss": 0.06928011775016785, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033635590807534754, "grad_norm": 8.091409683227539, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.856595516204834, "num_tokens": 665608088.0, "step": 17445 }, { "epoch": 2.2193105202900396, "ewc_loss": 0.06926257908344269, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003361804410815239, "grad_norm": 8.128308296203613, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8739478588104248, "num_tokens": 665641883.0, "step": 17446 }, { "epoch": 2.21943773056863, "ewc_loss": 0.06923829019069672, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000335937540512532, "grad_norm": 8.07697582244873, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8689303398132324, "num_tokens": 665682211.0, "step": 17447 }, { "epoch": 2.2195649408472207, "ewc_loss": 0.06918273121118546, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033538200659677386, "grad_norm": 8.081188201904297, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8697720766067505, "num_tokens": 665726182.0, "step": 17448 }, { "epoch": 2.2196921511258108, "ewc_loss": 0.06916779279708862, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003352326457388699, "grad_norm": 8.114049911499023, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8695197105407715, "num_tokens": 665760404.0, "step": 17449 }, { "epoch": 2.2198193614044013, "ewc_loss": 0.06922182440757751, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033577295835129917, "grad_norm": 8.149981498718262, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8639571666717529, "num_tokens": 665798235.0, "step": 17450 }, { "epoch": 2.219946571682992, "ewc_loss": 0.06911444664001465, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003346991434227675, "grad_norm": 8.058244705200195, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8659766316413879, "num_tokens": 665840118.0, "step": 17451 }, { "epoch": 2.2200737819615823, "ewc_loss": 0.0691566914319992, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033512155641801655, "grad_norm": 8.129998207092285, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8640807867050171, "num_tokens": 665878498.0, "step": 17452 }, { "epoch": 2.220200992240173, "ewc_loss": 0.06910233199596405, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033457804238423705, "grad_norm": 8.051520347595215, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8608365058898926, "num_tokens": 665924179.0, "step": 17453 }, { "epoch": 2.2203282025187634, "ewc_loss": 0.0693286806344986, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003368414763826877, "grad_norm": 8.182889938354492, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8802456259727478, "num_tokens": 665963160.0, "step": 17454 }, { "epoch": 2.220455412797354, "ewc_loss": 0.06888881325721741, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033244286896660924, "grad_norm": 8.063665390014648, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8733789324760437, "num_tokens": 665998854.0, "step": 17455 }, { "epoch": 2.2205826230759445, "ewc_loss": 0.0693143829703331, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003366985183674842, "grad_norm": 8.178740501403809, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8737940192222595, "num_tokens": 666036420.0, "step": 17456 }, { "epoch": 2.220709833354535, "ewc_loss": 0.06869856268167496, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033298172638751566, "grad_norm": 8.090576171875, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8459917902946472, "num_tokens": 666076692.0, "step": 17457 }, { "epoch": 2.2208370436331255, "ewc_loss": 0.06915541738271713, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003351088671479374, "grad_norm": 8.116344451904297, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8513575792312622, "num_tokens": 666118009.0, "step": 17458 }, { "epoch": 2.220964253911716, "ewc_loss": 0.06907740980386734, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003343287680763751, "grad_norm": 8.031353950500488, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8557709455490112, "num_tokens": 666158132.0, "step": 17459 }, { "epoch": 2.2210914641903066, "ewc_loss": 0.06926538050174713, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033620852627791464, "grad_norm": 8.1734619140625, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8605585098266602, "num_tokens": 666187195.0, "step": 17460 }, { "epoch": 2.221218674468897, "ewc_loss": 0.06891896575689316, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033274435554631054, "grad_norm": 7.998741626739502, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8692905902862549, "num_tokens": 666227790.0, "step": 17461 }, { "epoch": 2.2213458847474876, "ewc_loss": 0.06935381889343262, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033709287527017295, "grad_norm": 8.162890434265137, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8580427765846252, "num_tokens": 666268517.0, "step": 17462 }, { "epoch": 2.221473095026078, "ewc_loss": 0.06849633902311325, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033340087975375354, "grad_norm": 8.002602577209473, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8818860054016113, "num_tokens": 666313727.0, "step": 17463 }, { "epoch": 2.2216003053046687, "ewc_loss": 0.06936575472354889, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033721228828653693, "grad_norm": 8.169565200805664, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8679288625717163, "num_tokens": 666352713.0, "step": 17464 }, { "epoch": 2.221727515583259, "ewc_loss": 0.06845398992300034, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003329773899167776, "grad_norm": 7.978296279907227, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8731536865234375, "num_tokens": 666395065.0, "step": 17465 }, { "epoch": 2.2218547258618497, "ewc_loss": 0.06898541748523712, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033829172025434673, "grad_norm": 8.21908950805664, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8672416806221008, "num_tokens": 666428019.0, "step": 17466 }, { "epoch": 2.2219819361404403, "ewc_loss": 0.06867122650146484, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003327084123156965, "grad_norm": 7.991562843322754, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8643001317977905, "num_tokens": 666466830.0, "step": 17467 }, { "epoch": 2.222109146419031, "ewc_loss": 0.0691419392824173, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033985692425630987, "grad_norm": 8.192463874816895, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8830362558364868, "num_tokens": 666501028.0, "step": 17468 }, { "epoch": 2.2222363566976213, "ewc_loss": 0.06859143078327179, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003343518474139273, "grad_norm": 8.056953430175781, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8788665533065796, "num_tokens": 666538554.0, "step": 17469 }, { "epoch": 2.222363566976212, "ewc_loss": 0.06954368203878403, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003389915218576789, "grad_norm": 8.179272651672363, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8653227090835571, "num_tokens": 666577019.0, "step": 17470 }, { "epoch": 2.2224907772548024, "ewc_loss": 0.06909579038619995, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033451264607720077, "grad_norm": 8.034622192382812, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8791121244430542, "num_tokens": 666616597.0, "step": 17471 }, { "epoch": 2.222617987533393, "ewc_loss": 0.06898626685142517, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003383001603651792, "grad_norm": 8.218560218811035, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8685193061828613, "num_tokens": 666651523.0, "step": 17472 }, { "epoch": 2.2227451978119834, "ewc_loss": 0.06866450607776642, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.000335082586389035, "grad_norm": 8.106841087341309, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.854701578617096, "num_tokens": 666689270.0, "step": 17473 }, { "epoch": 2.2228724080905735, "ewc_loss": 0.06883801519870758, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003368176694493741, "grad_norm": 8.131326675415039, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.863145112991333, "num_tokens": 666726836.0, "step": 17474 }, { "epoch": 2.222999618369164, "ewc_loss": 0.06871318072080612, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003355693188495934, "grad_norm": 8.141891479492188, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8625092506408691, "num_tokens": 666761207.0, "step": 17475 }, { "epoch": 2.2231268286477546, "ewc_loss": 0.06870293617248535, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033546684426255524, "grad_norm": 8.068867683410645, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8743734955787659, "num_tokens": 666800293.0, "step": 17476 }, { "epoch": 2.223254038926345, "ewc_loss": 0.06933711469173431, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003369258774910122, "grad_norm": 8.088601112365723, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8754359483718872, "num_tokens": 666843046.0, "step": 17477 }, { "epoch": 2.2233812492049356, "ewc_loss": 0.06925776600837708, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033613239065743983, "grad_norm": 8.070273399353027, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8803529739379883, "num_tokens": 666885840.0, "step": 17478 }, { "epoch": 2.223508459483526, "ewc_loss": 0.06884372234344482, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003368747129570693, "grad_norm": 8.112798690795898, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8663526773452759, "num_tokens": 666925654.0, "step": 17479 }, { "epoch": 2.2236356697621167, "ewc_loss": 0.06921684741973877, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033572313259355724, "grad_norm": 8.021127700805664, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8672677278518677, "num_tokens": 666968098.0, "step": 17480 }, { "epoch": 2.223762880040707, "ewc_loss": 0.06947384774684906, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033829311723820865, "grad_norm": 8.106130599975586, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.877291202545166, "num_tokens": 667010349.0, "step": 17481 }, { "epoch": 2.2238900903192977, "ewc_loss": 0.06870447844266891, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003354822692926973, "grad_norm": 8.051462173461914, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8577331304550171, "num_tokens": 667047086.0, "step": 17482 }, { "epoch": 2.2240173005978883, "ewc_loss": 0.06893257796764374, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033776325290091336, "grad_norm": 8.184024810791016, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8495746850967407, "num_tokens": 667083955.0, "step": 17483 }, { "epoch": 2.224144510876479, "ewc_loss": 0.06872116029262543, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.00033564906334504485, "grad_norm": 8.105195999145508, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8707383871078491, "num_tokens": 667119751.0, "step": 17484 }, { "epoch": 2.2242717211550693, "ewc_loss": 0.06932562589645386, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033681097556836903, "grad_norm": 8.562981605529785, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8598421812057495, "num_tokens": 667150980.0, "step": 17485 }, { "epoch": 2.22439893143366, "ewc_loss": 0.06873101741075516, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033086485927924514, "grad_norm": 7.982753753662109, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8613491058349609, "num_tokens": 667185198.0, "step": 17486 }, { "epoch": 2.2245261417122504, "ewc_loss": 0.06976790726184845, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000341233768267557, "grad_norm": 8.225135803222656, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8684293627738953, "num_tokens": 667225286.0, "step": 17487 }, { "epoch": 2.224653351990841, "ewc_loss": 0.06845615804195404, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.00033055763924494386, "grad_norm": 7.9422407150268555, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8604081869125366, "num_tokens": 667266294.0, "step": 17488 }, { "epoch": 2.2247805622694314, "ewc_loss": 0.06976205855607986, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000341175269568339, "grad_norm": 8.278812408447266, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8649700284004211, "num_tokens": 667302430.0, "step": 17489 }, { "epoch": 2.224907772548022, "ewc_loss": 0.06882096081972122, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003317643131595105, "grad_norm": 8.008820533752441, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8592334985733032, "num_tokens": 667342881.0, "step": 17490 }, { "epoch": 2.2250349828266125, "ewc_loss": 0.06962884962558746, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003398431290406734, "grad_norm": 8.251663208007812, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8684126138687134, "num_tokens": 667376122.0, "step": 17491 }, { "epoch": 2.225162193105203, "ewc_loss": 0.06895920634269714, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033314674510620534, "grad_norm": 8.046211242675781, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8577085137367249, "num_tokens": 667415120.0, "step": 17492 }, { "epoch": 2.2252894033837936, "ewc_loss": 0.06952416151762009, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003387962933629751, "grad_norm": 8.190143585205078, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8639224767684937, "num_tokens": 667451014.0, "step": 17493 }, { "epoch": 2.225416613662384, "ewc_loss": 0.06903707981109619, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003339255345053971, "grad_norm": 8.097392082214355, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8569422960281372, "num_tokens": 667485600.0, "step": 17494 }, { "epoch": 2.2255438239409746, "ewc_loss": 0.06922714412212372, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003358261601533741, "grad_norm": 8.102688789367676, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.860532283782959, "num_tokens": 667527561.0, "step": 17495 }, { "epoch": 2.225671034219565, "ewc_loss": 0.069156713783741, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033512181835249066, "grad_norm": 8.034568786621094, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8728829026222229, "num_tokens": 667569677.0, "step": 17496 }, { "epoch": 2.225798244498155, "ewc_loss": 0.06929566711187363, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033651135163381696, "grad_norm": 8.14843463897705, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8656548261642456, "num_tokens": 667605954.0, "step": 17497 }, { "epoch": 2.225925454776746, "ewc_loss": 0.06913749873638153, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033492970396764576, "grad_norm": 8.05300521850586, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.868894100189209, "num_tokens": 667645082.0, "step": 17498 }, { "epoch": 2.2260526650553363, "ewc_loss": 0.06938670575618744, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033742169034667313, "grad_norm": 8.149027824401855, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8629546165466309, "num_tokens": 667681390.0, "step": 17499 }, { "epoch": 2.226179875333927, "ewc_loss": 0.06909610331058502, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033451570197939873, "grad_norm": 8.079217910766602, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8555758595466614, "num_tokens": 667722772.0, "step": 17500 }, { "epoch": 2.2263070856125173, "ewc_loss": 0.06936144828796387, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033716915640980005, "grad_norm": 8.129339218139648, "learning_rate": 1e-06, "loss": 0.5407, "mean_token_accuracy": 0.8420664072036743, "num_tokens": 667758647.0, "step": 17501 }, { "epoch": 2.226434295891108, "ewc_loss": 0.06920871138572693, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033564178738743067, "grad_norm": 8.16274356842041, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8663801550865173, "num_tokens": 667790625.0, "step": 17502 }, { "epoch": 2.2265615061696984, "ewc_loss": 0.06906828284263611, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033423752756789327, "grad_norm": 8.09942626953125, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8542801141738892, "num_tokens": 667823724.0, "step": 17503 }, { "epoch": 2.226688716448289, "ewc_loss": 0.06933598965406418, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000336914585204795, "grad_norm": 8.06632137298584, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8783130645751953, "num_tokens": 667862548.0, "step": 17504 }, { "epoch": 2.2268159267268794, "ewc_loss": 0.06919950246810913, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000335549731971696, "grad_norm": 8.107117652893066, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8581324219703674, "num_tokens": 667905994.0, "step": 17505 }, { "epoch": 2.22694313700547, "ewc_loss": 0.06920886039733887, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003356433298904449, "grad_norm": 8.0759859085083, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8588536977767944, "num_tokens": 667945087.0, "step": 17506 }, { "epoch": 2.2270703472840605, "ewc_loss": 0.06927389651536942, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003362936549820006, "grad_norm": 8.072784423828125, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8743268251419067, "num_tokens": 667984736.0, "step": 17507 }, { "epoch": 2.227197557562651, "ewc_loss": 0.06915955245494843, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003351501654833555, "grad_norm": 8.0712251663208, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8591549396514893, "num_tokens": 668024020.0, "step": 17508 }, { "epoch": 2.2273247678412416, "ewc_loss": 0.0692836195230484, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003363908326718956, "grad_norm": 8.100090026855469, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.860185980796814, "num_tokens": 668059567.0, "step": 17509 }, { "epoch": 2.227451978119832, "ewc_loss": 0.06922800838947296, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003358348039910197, "grad_norm": 8.057613372802734, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8719621896743774, "num_tokens": 668099901.0, "step": 17510 }, { "epoch": 2.2275791883984226, "ewc_loss": 0.06935504078865051, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033710512798279524, "grad_norm": 8.113862991333008, "learning_rate": 1e-06, "loss": 0.411, "mean_token_accuracy": 0.8792104125022888, "num_tokens": 668135974.0, "step": 17511 }, { "epoch": 2.227706398677013, "ewc_loss": 0.0691145658493042, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003347003075759858, "grad_norm": 8.038320541381836, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8697231411933899, "num_tokens": 668179200.0, "step": 17512 }, { "epoch": 2.2278336089556037, "ewc_loss": 0.06938474625349045, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033740216167643666, "grad_norm": 8.114795684814453, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8434513211250305, "num_tokens": 668219683.0, "step": 17513 }, { "epoch": 2.227960819234194, "ewc_loss": 0.06915147602558136, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033506948966532946, "grad_norm": 8.066632270812988, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8672267198562622, "num_tokens": 668253899.0, "step": 17514 }, { "epoch": 2.2280880295127847, "ewc_loss": 0.06933487951755524, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033690352574922144, "grad_norm": 8.090856552124023, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8585450053215027, "num_tokens": 668294140.0, "step": 17515 }, { "epoch": 2.2282152397913753, "ewc_loss": 0.06926582753658295, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033621295006014407, "grad_norm": 8.083841323852539, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8699167966842651, "num_tokens": 668328442.0, "step": 17516 }, { "epoch": 2.228342450069966, "ewc_loss": 0.06933523714542389, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033690707641653717, "grad_norm": 8.109465599060059, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8673208951950073, "num_tokens": 668367903.0, "step": 17517 }, { "epoch": 2.2284696603485563, "ewc_loss": 0.06923216581344604, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003358763933647424, "grad_norm": 8.033098220825195, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8770265579223633, "num_tokens": 668409889.0, "step": 17518 }, { "epoch": 2.228596870627147, "ewc_loss": 0.0694858506321907, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033841319964267313, "grad_norm": 8.176438331604004, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8698310852050781, "num_tokens": 668451061.0, "step": 17519 }, { "epoch": 2.2287240809057374, "ewc_loss": 0.06907665729522705, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033432123018428683, "grad_norm": 8.054988861083984, "learning_rate": 1e-06, "loss": 0.5227, "mean_token_accuracy": 0.8441648483276367, "num_tokens": 668493038.0, "step": 17520 }, { "epoch": 2.228851291184328, "ewc_loss": 0.06946384906768799, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033819323289208114, "grad_norm": 8.104093551635742, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8723541498184204, "num_tokens": 668528087.0, "step": 17521 }, { "epoch": 2.228978501462918, "ewc_loss": 0.06919234991073608, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003354782238602638, "grad_norm": 8.04008960723877, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8667240142822266, "num_tokens": 668562325.0, "step": 17522 }, { "epoch": 2.2291057117415085, "ewc_loss": 0.06949705630540848, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033852524938993156, "grad_norm": 8.102237701416016, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8671137094497681, "num_tokens": 668601336.0, "step": 17523 }, { "epoch": 2.229232922020099, "ewc_loss": 0.06919702887535095, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033552502281963825, "grad_norm": 8.033649444580078, "learning_rate": 1e-06, "loss": 0.3934, "mean_token_accuracy": 0.8843883275985718, "num_tokens": 668637113.0, "step": 17524 }, { "epoch": 2.2293601322986896, "ewc_loss": 0.06945430487394333, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003380977432243526, "grad_norm": 8.167654991149902, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8608040809631348, "num_tokens": 668675645.0, "step": 17525 }, { "epoch": 2.22948734257728, "ewc_loss": 0.06916804611682892, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033523511956445873, "grad_norm": 8.042094230651855, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8672739863395691, "num_tokens": 668706480.0, "step": 17526 }, { "epoch": 2.2296145528558706, "ewc_loss": 0.06945176422595978, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033807233558036387, "grad_norm": 8.128068923950195, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8596442937850952, "num_tokens": 668743956.0, "step": 17527 }, { "epoch": 2.229741763134461, "ewc_loss": 0.0691610798239708, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033516547409817576, "grad_norm": 8.050314903259277, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8743240833282471, "num_tokens": 668779134.0, "step": 17528 }, { "epoch": 2.2298689734130517, "ewc_loss": 0.0694195032119751, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033774974872358143, "grad_norm": 8.099380493164062, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8671125173568726, "num_tokens": 668814978.0, "step": 17529 }, { "epoch": 2.229996183691642, "ewc_loss": 0.06918807327747345, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033543541212566197, "grad_norm": 8.09781265258789, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8482930660247803, "num_tokens": 668854194.0, "step": 17530 }, { "epoch": 2.2301233939702327, "ewc_loss": 0.06919202208518982, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003354749351274222, "grad_norm": 8.04748821258545, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8573462963104248, "num_tokens": 668893095.0, "step": 17531 }, { "epoch": 2.2302506042488233, "ewc_loss": 0.0693216323852539, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003367710451129824, "grad_norm": 8.098337173461914, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8736490607261658, "num_tokens": 668930626.0, "step": 17532 }, { "epoch": 2.230377814527414, "ewc_loss": 0.06910917162895203, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003346464072819799, "grad_norm": 8.110743522644043, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8739510178565979, "num_tokens": 668960941.0, "step": 17533 }, { "epoch": 2.2305050248060043, "ewc_loss": 0.06918013840913773, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003353560750838369, "grad_norm": 8.098123550415039, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8680394291877747, "num_tokens": 668997771.0, "step": 17534 }, { "epoch": 2.230632235084595, "ewc_loss": 0.06912494450807571, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033480412093922496, "grad_norm": 8.06024169921875, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8796526193618774, "num_tokens": 669038714.0, "step": 17535 }, { "epoch": 2.2307594453631854, "ewc_loss": 0.069136843085289, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000334923155605793, "grad_norm": 8.057333946228027, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8745094537734985, "num_tokens": 669072336.0, "step": 17536 }, { "epoch": 2.230886655641776, "ewc_loss": 0.06917361170053482, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033529079519212246, "grad_norm": 8.064608573913574, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8745881915092468, "num_tokens": 669110303.0, "step": 17537 }, { "epoch": 2.2310138659203664, "ewc_loss": 0.06919778883457184, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000335532589815557, "grad_norm": 8.098784446716309, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8578574657440186, "num_tokens": 669146874.0, "step": 17538 }, { "epoch": 2.231141076198957, "ewc_loss": 0.06909603625535965, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003345150616951287, "grad_norm": 8.093371391296387, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8777486681938171, "num_tokens": 669181682.0, "step": 17539 }, { "epoch": 2.2312682864775475, "ewc_loss": 0.06928513944149017, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033640608307905495, "grad_norm": 8.07580280303955, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8670752048492432, "num_tokens": 669227201.0, "step": 17540 }, { "epoch": 2.231395496756138, "ewc_loss": 0.06915414333343506, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003350961487740278, "grad_norm": 8.009004592895508, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8668980002403259, "num_tokens": 669265915.0, "step": 17541 }, { "epoch": 2.2315227070347285, "ewc_loss": 0.06935296207666397, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003370843187440187, "grad_norm": 8.103899002075195, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8698371648788452, "num_tokens": 669304130.0, "step": 17542 }, { "epoch": 2.231649917313319, "ewc_loss": 0.06908299028873444, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003343846183270216, "grad_norm": 8.107317924499512, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.863807201385498, "num_tokens": 669344225.0, "step": 17543 }, { "epoch": 2.2317771275919096, "ewc_loss": 0.06920795142650604, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000335634162183851, "grad_norm": 8.077012062072754, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8472042679786682, "num_tokens": 669385126.0, "step": 17544 }, { "epoch": 2.2319043378705, "ewc_loss": 0.06916274130344391, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033518209238536656, "grad_norm": 8.080792427062988, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8545820713043213, "num_tokens": 669425774.0, "step": 17545 }, { "epoch": 2.2320315481490907, "ewc_loss": 0.06917484104633331, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033530304790474474, "grad_norm": 8.077086448669434, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8556823134422302, "num_tokens": 669459789.0, "step": 17546 }, { "epoch": 2.2321587584276807, "ewc_loss": 0.06927735358476639, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003363282303325832, "grad_norm": 8.07126522064209, "learning_rate": 1e-06, "loss": 0.4117, "mean_token_accuracy": 0.880687415599823, "num_tokens": 669498709.0, "step": 17547 }, { "epoch": 2.2322859687062713, "ewc_loss": 0.0693061426281929, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003366161254234612, "grad_norm": 8.086846351623535, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8704215288162231, "num_tokens": 669536021.0, "step": 17548 }, { "epoch": 2.232413178984862, "ewc_loss": 0.06920361518859863, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033559079747647047, "grad_norm": 8.08460807800293, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8530642986297607, "num_tokens": 669574316.0, "step": 17549 }, { "epoch": 2.2325403892634523, "ewc_loss": 0.06924561411142349, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033601082395762205, "grad_norm": 8.073234558105469, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8563058376312256, "num_tokens": 669611826.0, "step": 17550 }, { "epoch": 2.232667599542043, "ewc_loss": 0.06925797462463379, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003361344861332327, "grad_norm": 8.051435470581055, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.862197756767273, "num_tokens": 669655277.0, "step": 17551 }, { "epoch": 2.2327948098206334, "ewc_loss": 0.06937122344970703, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033726697438396513, "grad_norm": 8.12665843963623, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8590971231460571, "num_tokens": 669693481.0, "step": 17552 }, { "epoch": 2.232922020099224, "ewc_loss": 0.06926929950714111, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003362477000337094, "grad_norm": 8.081642150878906, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8646504282951355, "num_tokens": 669724104.0, "step": 17553 }, { "epoch": 2.2330492303778144, "ewc_loss": 0.06926661729812622, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033622089540585876, "grad_norm": 8.099564552307129, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8485583066940308, "num_tokens": 669768725.0, "step": 17554 }, { "epoch": 2.233176440656405, "ewc_loss": 0.06920883804559708, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033564306795597076, "grad_norm": 7.997177600860596, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8751193284988403, "num_tokens": 669809508.0, "step": 17555 }, { "epoch": 2.2333036509349955, "ewc_loss": 0.06943219900131226, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033787672873586416, "grad_norm": 8.111730575561523, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.867030143737793, "num_tokens": 669848047.0, "step": 17556 }, { "epoch": 2.233430861213586, "ewc_loss": 0.0691429153084755, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003349838370922953, "grad_norm": 8.025209426879883, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8824461102485657, "num_tokens": 669881274.0, "step": 17557 }, { "epoch": 2.2335580714921766, "ewc_loss": 0.0694703757762909, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033825848367996514, "grad_norm": 8.126684188842773, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8602344393730164, "num_tokens": 669918517.0, "step": 17558 }, { "epoch": 2.233685281770767, "ewc_loss": 0.06922715902328491, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033582630567252636, "grad_norm": 8.060234069824219, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8582046031951904, "num_tokens": 669960990.0, "step": 17559 }, { "epoch": 2.2338124920493576, "ewc_loss": 0.06937714666128159, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033732617157511413, "grad_norm": 8.143119812011719, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8685702085494995, "num_tokens": 669995542.0, "step": 17560 }, { "epoch": 2.233939702327948, "ewc_loss": 0.06918786466121674, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003354333166498691, "grad_norm": 7.99652099609375, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8661636114120483, "num_tokens": 670036798.0, "step": 17561 }, { "epoch": 2.2340669126065387, "ewc_loss": 0.06950096786022186, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003385643649380654, "grad_norm": 8.132824897766113, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8643460273742676, "num_tokens": 670071763.0, "step": 17562 }, { "epoch": 2.234194122885129, "ewc_loss": 0.06909850239753723, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000334539741743356, "grad_norm": 8.08763313293457, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8635249137878418, "num_tokens": 670100864.0, "step": 17563 }, { "epoch": 2.2343213331637197, "ewc_loss": 0.06944873929023743, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003380421258043498, "grad_norm": 8.056008338928223, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8541237115859985, "num_tokens": 670144114.0, "step": 17564 }, { "epoch": 2.2344485434423103, "ewc_loss": 0.06930273771286011, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003365821030456573, "grad_norm": 8.057058334350586, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8700517416000366, "num_tokens": 670182719.0, "step": 17565 }, { "epoch": 2.234575753720901, "ewc_loss": 0.0694139152765274, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003376938111614436, "grad_norm": 8.121345520019531, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8681865334510803, "num_tokens": 670218059.0, "step": 17566 }, { "epoch": 2.2347029639994913, "ewc_loss": 0.06931813061237335, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033673597499728203, "grad_norm": 8.033926963806152, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8617184162139893, "num_tokens": 670259589.0, "step": 17567 }, { "epoch": 2.234830174278082, "ewc_loss": 0.06936843693256378, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003372390929143876, "grad_norm": 8.048077583312988, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8566672801971436, "num_tokens": 670301755.0, "step": 17568 }, { "epoch": 2.2349573845566724, "ewc_loss": 0.06930595636367798, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033661426277831197, "grad_norm": 8.010344505310059, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8691742420196533, "num_tokens": 670339205.0, "step": 17569 }, { "epoch": 2.235084594835263, "ewc_loss": 0.06954973191022873, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000339051999617368, "grad_norm": 8.081188201904297, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8683804273605347, "num_tokens": 670382961.0, "step": 17570 }, { "epoch": 2.2352118051138534, "ewc_loss": 0.0693804994225502, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003373596991878003, "grad_norm": 8.11461067199707, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8550810813903809, "num_tokens": 670420747.0, "step": 17571 }, { "epoch": 2.2353390153924435, "ewc_loss": 0.06942636519670486, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033781834645196795, "grad_norm": 8.023051261901855, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.881902277469635, "num_tokens": 670457906.0, "step": 17572 }, { "epoch": 2.235466225671034, "ewc_loss": 0.06958234310150146, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033937813714146614, "grad_norm": 8.11915111541748, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.855299711227417, "num_tokens": 670493187.0, "step": 17573 }, { "epoch": 2.2355934359496246, "ewc_loss": 0.06930218636989594, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033657660242170095, "grad_norm": 8.077713012695312, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8788934946060181, "num_tokens": 670529750.0, "step": 17574 }, { "epoch": 2.235720646228215, "ewc_loss": 0.06947679817676544, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003383226285222918, "grad_norm": 8.092767715454102, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8623887300491333, "num_tokens": 670570147.0, "step": 17575 }, { "epoch": 2.2358478565068056, "ewc_loss": 0.06933265179395676, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033688120311126113, "grad_norm": 8.155512809753418, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8683421015739441, "num_tokens": 670604113.0, "step": 17576 }, { "epoch": 2.235975066785396, "ewc_loss": 0.06925547122955322, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033610942773520947, "grad_norm": 8.016131401062012, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8659255504608154, "num_tokens": 670645451.0, "step": 17577 }, { "epoch": 2.2361022770639867, "ewc_loss": 0.06952771544456482, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033883185824379325, "grad_norm": 8.135104179382324, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.870759904384613, "num_tokens": 670684055.0, "step": 17578 }, { "epoch": 2.236229487342577, "ewc_loss": 0.06922497600317001, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033580444869585335, "grad_norm": 8.116286277770996, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8574866056442261, "num_tokens": 670724397.0, "step": 17579 }, { "epoch": 2.2363566976211677, "ewc_loss": 0.06951025873422623, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033865729346871376, "grad_norm": 8.111186981201172, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8767642974853516, "num_tokens": 670761282.0, "step": 17580 }, { "epoch": 2.2364839078997583, "ewc_loss": 0.06933874636888504, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033694214653223753, "grad_norm": 8.091984748840332, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8704769015312195, "num_tokens": 670800007.0, "step": 17581 }, { "epoch": 2.236611118178349, "ewc_loss": 0.06935438513755798, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000337098550517112, "grad_norm": 8.098109245300293, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8631290793418884, "num_tokens": 670840040.0, "step": 17582 }, { "epoch": 2.2367383284569393, "ewc_loss": 0.06930065155029297, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033656114828772843, "grad_norm": 8.060521125793457, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8587177991867065, "num_tokens": 670881193.0, "step": 17583 }, { "epoch": 2.23686553873553, "ewc_loss": 0.06937955319881439, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033735024044290185, "grad_norm": 8.109971046447754, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8695539236068726, "num_tokens": 670927275.0, "step": 17584 }, { "epoch": 2.2369927490141204, "ewc_loss": 0.06919248402118683, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033547947532497346, "grad_norm": 8.048946380615234, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8697085976600647, "num_tokens": 670967327.0, "step": 17585 }, { "epoch": 2.237119959292711, "ewc_loss": 0.06942085921764374, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003377632820047438, "grad_norm": 8.118818283081055, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8701949715614319, "num_tokens": 671009250.0, "step": 17586 }, { "epoch": 2.2372471695713014, "ewc_loss": 0.06932235509157181, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003367782337591052, "grad_norm": 8.059674263000488, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8684942126274109, "num_tokens": 671045068.0, "step": 17587 }, { "epoch": 2.237374379849892, "ewc_loss": 0.06941261887550354, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033768085995689034, "grad_norm": 8.137112617492676, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8528550267219543, "num_tokens": 671084786.0, "step": 17588 }, { "epoch": 2.2375015901284825, "ewc_loss": 0.06938937306404114, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003374484076630324, "grad_norm": 8.114216804504395, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8588731288909912, "num_tokens": 671123235.0, "step": 17589 }, { "epoch": 2.237628800407073, "ewc_loss": 0.06941089034080505, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003376635431777686, "grad_norm": 8.14037799835205, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8810163736343384, "num_tokens": 671161702.0, "step": 17590 }, { "epoch": 2.2377560106856635, "ewc_loss": 0.06933468580245972, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033690157579258084, "grad_norm": 8.108403205871582, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8775409460067749, "num_tokens": 671195487.0, "step": 17591 }, { "epoch": 2.237883220964254, "ewc_loss": 0.06939874589443207, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033754209289327264, "grad_norm": 8.133048057556152, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8644319772720337, "num_tokens": 671232288.0, "step": 17592 }, { "epoch": 2.2380104312428446, "ewc_loss": 0.06887727975845337, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003372103092260659, "grad_norm": 8.089444160461426, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8498716950416565, "num_tokens": 671275154.0, "step": 17593 }, { "epoch": 2.238137641521435, "ewc_loss": 0.06906845420598984, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003366806486155838, "grad_norm": 8.09812068939209, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8550302386283875, "num_tokens": 671312266.0, "step": 17594 }, { "epoch": 2.238264851800025, "ewc_loss": 0.06934778392314911, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033703248482197523, "grad_norm": 8.06868839263916, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8654758334159851, "num_tokens": 671349690.0, "step": 17595 }, { "epoch": 2.238392062078616, "ewc_loss": 0.06937123835086823, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033726703259162605, "grad_norm": 8.108675003051758, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.868887186050415, "num_tokens": 671387898.0, "step": 17596 }, { "epoch": 2.2385192723572063, "ewc_loss": 0.06939177215099335, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003374724474269897, "grad_norm": 8.052515983581543, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.8434104919433594, "num_tokens": 671428054.0, "step": 17597 }, { "epoch": 2.238646482635797, "ewc_loss": 0.06939239799976349, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003374786756467074, "grad_norm": 8.058183670043945, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8714059591293335, "num_tokens": 671467096.0, "step": 17598 }, { "epoch": 2.2387736929143873, "ewc_loss": 0.069491446018219, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003384691954124719, "grad_norm": 8.112578392028809, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8652366399765015, "num_tokens": 671506752.0, "step": 17599 }, { "epoch": 2.238900903192978, "ewc_loss": 0.06938369572162628, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033739159698598087, "grad_norm": 8.094548225402832, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8573416471481323, "num_tokens": 671542403.0, "step": 17600 }, { "epoch": 2.2390281134715684, "ewc_loss": 0.06930868327617645, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000336641474859789, "grad_norm": 8.043067932128906, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.876944899559021, "num_tokens": 671578115.0, "step": 17601 }, { "epoch": 2.239155323750159, "ewc_loss": 0.06953933089971542, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003389479825273156, "grad_norm": 8.067131996154785, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8776254653930664, "num_tokens": 671619854.0, "step": 17602 }, { "epoch": 2.2392825340287494, "ewc_loss": 0.06945547461509705, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003381094429641962, "grad_norm": 8.070372581481934, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8791197538375854, "num_tokens": 671661617.0, "step": 17603 }, { "epoch": 2.23940974430734, "ewc_loss": 0.06950804591178894, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003386351454537362, "grad_norm": 8.062753677368164, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8540521860122681, "num_tokens": 671702231.0, "step": 17604 }, { "epoch": 2.2395369545859305, "ewc_loss": 0.06955601274967194, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033911483478732407, "grad_norm": 8.093194007873535, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8657698631286621, "num_tokens": 671741483.0, "step": 17605 }, { "epoch": 2.239664164864521, "ewc_loss": 0.0695239007472992, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000338793674018234, "grad_norm": 8.146651268005371, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8604992032051086, "num_tokens": 671780168.0, "step": 17606 }, { "epoch": 2.2397913751431116, "ewc_loss": 0.06929454952478409, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033650017576292157, "grad_norm": 8.0259370803833, "learning_rate": 1e-06, "loss": 0.5304, "mean_token_accuracy": 0.8425681591033936, "num_tokens": 671818980.0, "step": 17607 }, { "epoch": 2.239918585421702, "ewc_loss": 0.06954236328601837, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003389782796148211, "grad_norm": 8.146364212036133, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8639042377471924, "num_tokens": 671850010.0, "step": 17608 }, { "epoch": 2.2400457957002926, "ewc_loss": 0.0692681074142456, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003362357383593917, "grad_norm": 8.051698684692383, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8581086993217468, "num_tokens": 671888506.0, "step": 17609 }, { "epoch": 2.240173005978883, "ewc_loss": 0.06953698396682739, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003389245248399675, "grad_norm": 8.098628997802734, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8737854957580566, "num_tokens": 671930965.0, "step": 17610 }, { "epoch": 2.2403002162574737, "ewc_loss": 0.0693439394235611, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033699412597343326, "grad_norm": 8.112869262695312, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8740983009338379, "num_tokens": 671967472.0, "step": 17611 }, { "epoch": 2.240427426536064, "ewc_loss": 0.06944590061903, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033801369136199355, "grad_norm": 8.0995512008667, "learning_rate": 1e-06, "loss": 0.3854, "mean_token_accuracy": 0.8884074687957764, "num_tokens": 672005986.0, "step": 17612 }, { "epoch": 2.2405546368146547, "ewc_loss": 0.06936946511268616, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003372493665665388, "grad_norm": 8.060009956359863, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8653191328048706, "num_tokens": 672044729.0, "step": 17613 }, { "epoch": 2.2406818470932452, "ewc_loss": 0.06950416415929794, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003385963209439069, "grad_norm": 8.130918502807617, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8658530116081238, "num_tokens": 672080141.0, "step": 17614 }, { "epoch": 2.2408090573718358, "ewc_loss": 0.06924018263816833, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003359565162099898, "grad_norm": 8.059823989868164, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8737438917160034, "num_tokens": 672119277.0, "step": 17615 }, { "epoch": 2.2409362676504263, "ewc_loss": 0.06951175630092621, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000338672281941399, "grad_norm": 8.171552658081055, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8639832735061646, "num_tokens": 672154205.0, "step": 17616 }, { "epoch": 2.241063477929017, "ewc_loss": 0.06912658363580704, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003348205063957721, "grad_norm": 8.076311111450195, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.864946722984314, "num_tokens": 672193667.0, "step": 17617 }, { "epoch": 2.2411906882076074, "ewc_loss": 0.06943616271018982, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033791628084145486, "grad_norm": 8.126635551452637, "learning_rate": 1e-06, "loss": 0.5614, "mean_token_accuracy": 0.8360998630523682, "num_tokens": 672232360.0, "step": 17618 }, { "epoch": 2.241317898486198, "ewc_loss": 0.06924687325954437, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003360233677085489, "grad_norm": 8.129215240478516, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8524556159973145, "num_tokens": 672264127.0, "step": 17619 }, { "epoch": 2.241445108764788, "ewc_loss": 0.06932909786701202, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000336845638230443, "grad_norm": 8.111721992492676, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.873053789138794, "num_tokens": 672300719.0, "step": 17620 }, { "epoch": 2.2415723190433785, "ewc_loss": 0.06938044726848602, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003373592044226825, "grad_norm": 8.054699897766113, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.876441478729248, "num_tokens": 672342796.0, "step": 17621 }, { "epoch": 2.241699529321969, "ewc_loss": 0.06940117478370667, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033756642369553447, "grad_norm": 8.114790916442871, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8678881525993347, "num_tokens": 672382597.0, "step": 17622 }, { "epoch": 2.2418267396005596, "ewc_loss": 0.06930431723594666, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033659784821793437, "grad_norm": 8.09144115447998, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8535705208778381, "num_tokens": 672427350.0, "step": 17623 }, { "epoch": 2.24195394987915, "ewc_loss": 0.0694287121295929, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000337841862346977, "grad_norm": 8.140349388122559, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8583359122276306, "num_tokens": 672468997.0, "step": 17624 }, { "epoch": 2.2420811601577406, "ewc_loss": 0.06924069672822952, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033596163848415017, "grad_norm": 8.080937385559082, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8710341453552246, "num_tokens": 672505061.0, "step": 17625 }, { "epoch": 2.242208370436331, "ewc_loss": 0.06945249438285828, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003380796406418085, "grad_norm": 8.136736869812012, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8647890090942383, "num_tokens": 672544241.0, "step": 17626 }, { "epoch": 2.2423355807149217, "ewc_loss": 0.06927457451820374, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000336300436174497, "grad_norm": 8.101175308227539, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8654202222824097, "num_tokens": 672586334.0, "step": 17627 }, { "epoch": 2.242462790993512, "ewc_loss": 0.06938423961400986, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003373970976099372, "grad_norm": 8.084477424621582, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8611494898796082, "num_tokens": 672628627.0, "step": 17628 }, { "epoch": 2.2425900012721027, "ewc_loss": 0.06937985122203827, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000337353179929778, "grad_norm": 8.131027221679688, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8563076257705688, "num_tokens": 672663046.0, "step": 17629 }, { "epoch": 2.2427172115506933, "ewc_loss": 0.06924387067556381, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033599339076317847, "grad_norm": 8.011173248291016, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8716753125190735, "num_tokens": 672703394.0, "step": 17630 }, { "epoch": 2.242844421829284, "ewc_loss": 0.06946130096912384, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003381677088327706, "grad_norm": 8.136908531188965, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8560264706611633, "num_tokens": 672743262.0, "step": 17631 }, { "epoch": 2.2429716321078743, "ewc_loss": 0.06931748986244202, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003367295430507511, "grad_norm": 8.112570762634277, "learning_rate": 1e-06, "loss": 0.5641, "mean_token_accuracy": 0.8307417631149292, "num_tokens": 672782662.0, "step": 17632 }, { "epoch": 2.243098842386465, "ewc_loss": 0.06948712468147278, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033842591801658273, "grad_norm": 8.106842994689941, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8634666204452515, "num_tokens": 672818400.0, "step": 17633 }, { "epoch": 2.2432260526650554, "ewc_loss": 0.06938165426254272, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033737128251232207, "grad_norm": 8.091842651367188, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8773379325866699, "num_tokens": 672858843.0, "step": 17634 }, { "epoch": 2.243353262943646, "ewc_loss": 0.06931758671998978, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003367305616848171, "grad_norm": 8.09209156036377, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8671472072601318, "num_tokens": 672900397.0, "step": 17635 }, { "epoch": 2.2434804732222364, "ewc_loss": 0.06937523931264877, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003373070794623345, "grad_norm": 8.158212661743164, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8642823696136475, "num_tokens": 672936025.0, "step": 17636 }, { "epoch": 2.243607683500827, "ewc_loss": 0.06924132257699966, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033596789580769837, "grad_norm": 8.086069107055664, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8456506133079529, "num_tokens": 672979180.0, "step": 17637 }, { "epoch": 2.2437348937794175, "ewc_loss": 0.0694938525557518, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003384932060725987, "grad_norm": 8.163082122802734, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8752518892288208, "num_tokens": 673018053.0, "step": 17638 }, { "epoch": 2.243862104058008, "ewc_loss": 0.06916211545467377, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003351758641656488, "grad_norm": 8.084451675415039, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8451449871063232, "num_tokens": 673057596.0, "step": 17639 }, { "epoch": 2.2439893143365985, "ewc_loss": 0.06940598785877228, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003376146196387708, "grad_norm": 8.125797271728516, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8768352270126343, "num_tokens": 673097740.0, "step": 17640 }, { "epoch": 2.244116524615189, "ewc_loss": 0.06925848126411438, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033613952109590173, "grad_norm": 8.047663688659668, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8705966472625732, "num_tokens": 673141821.0, "step": 17641 }, { "epoch": 2.2442437348937796, "ewc_loss": 0.06942424178123474, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033779710065573454, "grad_norm": 8.14638614654541, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8738853931427002, "num_tokens": 673184279.0, "step": 17642 }, { "epoch": 2.24437094517237, "ewc_loss": 0.06919535249471664, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033550820080563426, "grad_norm": 8.026320457458496, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.875089704990387, "num_tokens": 673224328.0, "step": 17643 }, { "epoch": 2.2444981554509607, "ewc_loss": 0.06962864100933075, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003398411499802023, "grad_norm": 8.223248481750488, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8511635661125183, "num_tokens": 673263328.0, "step": 17644 }, { "epoch": 2.2446253657295507, "ewc_loss": 0.06909313797950745, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033448610338382423, "grad_norm": 8.004491806030273, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8698036670684814, "num_tokens": 673301663.0, "step": 17645 }, { "epoch": 2.2447525760081413, "ewc_loss": 0.06967169046401978, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003402715374249965, "grad_norm": 8.265344619750977, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.863253116607666, "num_tokens": 673340606.0, "step": 17646 }, { "epoch": 2.244879786286732, "ewc_loss": 0.06900976598262787, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003336523659527302, "grad_norm": 7.984185695648193, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.861587405204773, "num_tokens": 673382201.0, "step": 17647 }, { "epoch": 2.2450069965653223, "ewc_loss": 0.06981326639652252, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003416873514652252, "grad_norm": 8.24251651763916, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.874659538269043, "num_tokens": 673423330.0, "step": 17648 }, { "epoch": 2.245134206843913, "ewc_loss": 0.06906192004680634, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003341738774906844, "grad_norm": 8.004890441894531, "learning_rate": 1e-06, "loss": 0.3996, "mean_token_accuracy": 0.8815796375274658, "num_tokens": 673459690.0, "step": 17649 }, { "epoch": 2.2452614171225034, "ewc_loss": 0.06982976198196411, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034185225376859307, "grad_norm": 8.207001686096191, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8619699478149414, "num_tokens": 673494827.0, "step": 17650 }, { "epoch": 2.245388627401094, "ewc_loss": 0.06923258304595947, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003358805552124977, "grad_norm": 8.079401969909668, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8535877466201782, "num_tokens": 673540568.0, "step": 17651 }, { "epoch": 2.2455158376796844, "ewc_loss": 0.06949752569198608, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033852996421046555, "grad_norm": 8.16989803314209, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8812358975410461, "num_tokens": 673574022.0, "step": 17652 }, { "epoch": 2.245643047958275, "ewc_loss": 0.06935089081525803, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003370635968167335, "grad_norm": 8.108749389648438, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8609666228294373, "num_tokens": 673612493.0, "step": 17653 }, { "epoch": 2.2457702582368655, "ewc_loss": 0.06952888518571854, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003388435288798064, "grad_norm": 8.172926902770996, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8616394400596619, "num_tokens": 673653729.0, "step": 17654 }, { "epoch": 2.245897468515456, "ewc_loss": 0.06926357746124268, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003361905110068619, "grad_norm": 7.998032093048096, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8774614334106445, "num_tokens": 673695626.0, "step": 17655 }, { "epoch": 2.2460246787940465, "ewc_loss": 0.06972905993461609, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034084523213095963, "grad_norm": 8.195642471313477, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8718386888504028, "num_tokens": 673732188.0, "step": 17656 }, { "epoch": 2.246151889072637, "ewc_loss": 0.06924563646316528, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003360109985806048, "grad_norm": 8.038834571838379, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8580713272094727, "num_tokens": 673770259.0, "step": 17657 }, { "epoch": 2.2462790993512276, "ewc_loss": 0.06976917386054993, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034124645753763616, "grad_norm": 8.227075576782227, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8614668250083923, "num_tokens": 673805018.0, "step": 17658 }, { "epoch": 2.246406309629818, "ewc_loss": 0.06935208290815353, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033707552938722074, "grad_norm": 8.096177101135254, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8581812381744385, "num_tokens": 673846058.0, "step": 17659 }, { "epoch": 2.2465335199084087, "ewc_loss": 0.06974956393241882, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034105032682418823, "grad_norm": 8.22872257232666, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8526487350463867, "num_tokens": 673886437.0, "step": 17660 }, { "epoch": 2.246660730186999, "ewc_loss": 0.06929543614387512, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033650908153504133, "grad_norm": 8.10886287689209, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8686690330505371, "num_tokens": 673925701.0, "step": 17661 }, { "epoch": 2.2467879404655897, "ewc_loss": 0.06956110894680023, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033916576649062335, "grad_norm": 8.128024101257324, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8691200017929077, "num_tokens": 673969854.0, "step": 17662 }, { "epoch": 2.2469151507441802, "ewc_loss": 0.06932879984378815, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033684269874356687, "grad_norm": 8.082256317138672, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8661962747573853, "num_tokens": 674005048.0, "step": 17663 }, { "epoch": 2.2470423610227708, "ewc_loss": 0.06939548254013062, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003375094966031611, "grad_norm": 8.125086784362793, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8608114719390869, "num_tokens": 674046548.0, "step": 17664 }, { "epoch": 2.2471695713013613, "ewc_loss": 0.0694233775138855, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003377884568180889, "grad_norm": 8.07822036743164, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8595885634422302, "num_tokens": 674086905.0, "step": 17665 }, { "epoch": 2.247296781579952, "ewc_loss": 0.06948147714138031, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033836945658549666, "grad_norm": 8.118976593017578, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8752202391624451, "num_tokens": 674124589.0, "step": 17666 }, { "epoch": 2.2474239918585424, "ewc_loss": 0.06941968202590942, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033775149495340884, "grad_norm": 8.240337371826172, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8778543472290039, "num_tokens": 674165135.0, "step": 17667 }, { "epoch": 2.247551202137133, "ewc_loss": 0.06932845711708069, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033683920628391206, "grad_norm": 8.099018096923828, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8611828088760376, "num_tokens": 674207269.0, "step": 17668 }, { "epoch": 2.2476784124157234, "ewc_loss": 0.06952275335788727, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003387821780052036, "grad_norm": 8.082418441772461, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8824071884155273, "num_tokens": 674240806.0, "step": 17669 }, { "epoch": 2.2478056226943135, "ewc_loss": 0.06945806741714478, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033813531626947224, "grad_norm": 8.12850284576416, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8567106127738953, "num_tokens": 674274697.0, "step": 17670 }, { "epoch": 2.247932832972904, "ewc_loss": 0.06946379691362381, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000338192650815472, "grad_norm": 8.077130317687988, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8571518659591675, "num_tokens": 674311362.0, "step": 17671 }, { "epoch": 2.2480600432514946, "ewc_loss": 0.0695410966873169, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003389656776562333, "grad_norm": 8.10818099975586, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8717942237854004, "num_tokens": 674352505.0, "step": 17672 }, { "epoch": 2.248187253530085, "ewc_loss": 0.06953711062669754, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003389258054085076, "grad_norm": 8.12020206451416, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8696791529655457, "num_tokens": 674383938.0, "step": 17673 }, { "epoch": 2.2483144638086756, "ewc_loss": 0.06962195038795471, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033977421117015183, "grad_norm": 8.093830108642578, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8641981482505798, "num_tokens": 674421665.0, "step": 17674 }, { "epoch": 2.248441674087266, "ewc_loss": 0.06954576075077057, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033901227288879454, "grad_norm": 8.095555305480957, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8678780794143677, "num_tokens": 674466065.0, "step": 17675 }, { "epoch": 2.2485688843658567, "ewc_loss": 0.06959011405706406, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003394558443687856, "grad_norm": 8.158961296081543, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8667885065078735, "num_tokens": 674506868.0, "step": 17676 }, { "epoch": 2.248696094644447, "ewc_loss": 0.06948436796665192, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003383983566891402, "grad_norm": 8.146546363830566, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8604460954666138, "num_tokens": 674545756.0, "step": 17677 }, { "epoch": 2.2488233049230377, "ewc_loss": 0.06951723247766495, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033872699714265764, "grad_norm": 8.150955200195312, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8614233136177063, "num_tokens": 674590680.0, "step": 17678 }, { "epoch": 2.2489505152016283, "ewc_loss": 0.06953118741512299, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003388665500096977, "grad_norm": 8.11699104309082, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.871258556842804, "num_tokens": 674631624.0, "step": 17679 }, { "epoch": 2.249077725480219, "ewc_loss": 0.06953823566436768, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033893706859089434, "grad_norm": 8.129796981811523, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8605325222015381, "num_tokens": 674667365.0, "step": 17680 }, { "epoch": 2.2492049357588093, "ewc_loss": 0.06955350935459137, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033908983459696174, "grad_norm": 8.11981201171875, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8647143840789795, "num_tokens": 674706587.0, "step": 17681 }, { "epoch": 2.2493321460374, "ewc_loss": 0.06956824660301208, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003392370999790728, "grad_norm": 8.149927139282227, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8640222549438477, "num_tokens": 674744480.0, "step": 17682 }, { "epoch": 2.2494593563159904, "ewc_loss": 0.06949550658464432, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003385097661521286, "grad_norm": 8.124960899353027, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8654534816741943, "num_tokens": 674783104.0, "step": 17683 }, { "epoch": 2.249586566594581, "ewc_loss": 0.06965507566928864, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003401054418645799, "grad_norm": 8.241708755493164, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8580325841903687, "num_tokens": 674823488.0, "step": 17684 }, { "epoch": 2.2497137768731714, "ewc_loss": 0.069437175989151, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003379264962859452, "grad_norm": 8.115506172180176, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8705757856369019, "num_tokens": 674857322.0, "step": 17685 }, { "epoch": 2.249840987151762, "ewc_loss": 0.06970901787281036, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034064482315443456, "grad_norm": 8.176885604858398, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8560320734977722, "num_tokens": 674901566.0, "step": 17686 }, { "epoch": 2.2499681974303525, "ewc_loss": 0.06936860084533691, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033724066452123225, "grad_norm": 8.040943145751953, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8703234195709229, "num_tokens": 674942803.0, "step": 17687 }, { "epoch": 2.250095407708943, "ewc_loss": 0.0698046088218689, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034160082577727735, "grad_norm": 8.200263977050781, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8563927412033081, "num_tokens": 674983160.0, "step": 17688 }, { "epoch": 2.2502226179875335, "ewc_loss": 0.06939129531383514, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003374675870873034, "grad_norm": 8.101710319519043, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8661490678787231, "num_tokens": 675021243.0, "step": 17689 }, { "epoch": 2.250349828266124, "ewc_loss": 0.06980668753385544, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034162154770456254, "grad_norm": 8.238693237304688, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8614726066589355, "num_tokens": 675059435.0, "step": 17690 }, { "epoch": 2.2504770385447146, "ewc_loss": 0.06941988319158554, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033775350311771035, "grad_norm": 8.153441429138184, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8693854212760925, "num_tokens": 675091689.0, "step": 17691 }, { "epoch": 2.250604248823305, "ewc_loss": 0.06963782012462616, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003399328561499715, "grad_norm": 8.169386863708496, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8596053719520569, "num_tokens": 675130368.0, "step": 17692 }, { "epoch": 2.250731459101895, "ewc_loss": 0.06955591589212418, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033911384525708854, "grad_norm": 8.118502616882324, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8686061501502991, "num_tokens": 675171745.0, "step": 17693 }, { "epoch": 2.250858669380486, "ewc_loss": 0.06957696378231049, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003393242950551212, "grad_norm": 8.168736457824707, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8633196353912354, "num_tokens": 675206052.0, "step": 17694 }, { "epoch": 2.2509858796590763, "ewc_loss": 0.0694146603345871, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003377012617420405, "grad_norm": 8.103150367736816, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8601109981536865, "num_tokens": 675243231.0, "step": 17695 }, { "epoch": 2.251113089937667, "ewc_loss": 0.06967069208621979, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003402616421226412, "grad_norm": 8.272047996520996, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8580600619316101, "num_tokens": 675273582.0, "step": 17696 }, { "epoch": 2.2512403002162573, "ewc_loss": 0.06938682496547699, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033742297091521323, "grad_norm": 8.133959770202637, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8628259897232056, "num_tokens": 675314247.0, "step": 17697 }, { "epoch": 2.251367510494848, "ewc_loss": 0.06956200301647186, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033917473047040403, "grad_norm": 8.188063621520996, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8666070699691772, "num_tokens": 675355675.0, "step": 17698 }, { "epoch": 2.2514947207734384, "ewc_loss": 0.06931738555431366, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033672849531285465, "grad_norm": 8.124137878417969, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8625432252883911, "num_tokens": 675394290.0, "step": 17699 }, { "epoch": 2.251621931052029, "ewc_loss": 0.06957051903009415, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033925988827832043, "grad_norm": 8.186190605163574, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8680528998374939, "num_tokens": 675434251.0, "step": 17700 }, { "epoch": 2.2517491413306194, "ewc_loss": 0.06933678686618805, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033692250144667923, "grad_norm": 8.083130836486816, "learning_rate": 1e-06, "loss": 0.439, "mean_token_accuracy": 0.8713931441307068, "num_tokens": 675474731.0, "step": 17701 }, { "epoch": 2.25187635160921, "ewc_loss": 0.06949439644813538, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033849861938506365, "grad_norm": 8.108572006225586, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8603509664535522, "num_tokens": 675515180.0, "step": 17702 }, { "epoch": 2.2520035618878005, "ewc_loss": 0.06940551847219467, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033760987571440637, "grad_norm": 8.110382080078125, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.853726327419281, "num_tokens": 675553429.0, "step": 17703 }, { "epoch": 2.252130772166391, "ewc_loss": 0.06942813098430634, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033783604158088565, "grad_norm": 8.149468421936035, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8678317070007324, "num_tokens": 675592919.0, "step": 17704 }, { "epoch": 2.2522579824449815, "ewc_loss": 0.06939533352851868, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003375080705154687, "grad_norm": 8.142708778381348, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8723686933517456, "num_tokens": 675627528.0, "step": 17705 }, { "epoch": 2.252385192723572, "ewc_loss": 0.06948032230138779, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033835790236480534, "grad_norm": 8.17373275756836, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8743657469749451, "num_tokens": 675663721.0, "step": 17706 }, { "epoch": 2.2525124030021626, "ewc_loss": 0.06935662031173706, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033712093136273324, "grad_norm": 8.085679054260254, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8594780564308167, "num_tokens": 675705187.0, "step": 17707 }, { "epoch": 2.252639613280753, "ewc_loss": 0.06962469965219498, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000339801685186103, "grad_norm": 8.380041122436523, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8757123947143555, "num_tokens": 675752713.0, "step": 17708 }, { "epoch": 2.2527668235593437, "ewc_loss": 0.06906184554100037, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033417314989492297, "grad_norm": 8.085978507995605, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8651955723762512, "num_tokens": 675791009.0, "step": 17709 }, { "epoch": 2.252894033837934, "ewc_loss": 0.06968306005001068, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034038530429825187, "grad_norm": 8.311881065368652, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.86864173412323, "num_tokens": 675828890.0, "step": 17710 }, { "epoch": 2.2530212441165247, "ewc_loss": 0.06901240348815918, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033367876312695444, "grad_norm": 10.527284622192383, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8663037419319153, "num_tokens": 675862855.0, "step": 17711 }, { "epoch": 2.2531484543951152, "ewc_loss": 0.06975598633289337, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003411145298741758, "grad_norm": 8.045083999633789, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8630399703979492, "num_tokens": 675903032.0, "step": 17712 }, { "epoch": 2.2532756646737058, "ewc_loss": 0.07093571126461029, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003529118257574737, "grad_norm": 8.434504508972168, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8608349561691284, "num_tokens": 675943035.0, "step": 17713 }, { "epoch": 2.2534028749522963, "ewc_loss": 0.06856471300125122, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000329201779095456, "grad_norm": 7.913533687591553, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8677652478218079, "num_tokens": 675983073.0, "step": 17714 }, { "epoch": 2.253530085230887, "ewc_loss": 0.07121802121400833, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003557348973117769, "grad_norm": 8.483094215393066, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8709073066711426, "num_tokens": 676022341.0, "step": 17715 }, { "epoch": 2.2536572955094774, "ewc_loss": 0.06909479945898056, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033450269256718457, "grad_norm": 8.014269828796387, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8624659776687622, "num_tokens": 676060907.0, "step": 17716 }, { "epoch": 2.253784505788068, "ewc_loss": 0.07053551077842712, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034890981623902917, "grad_norm": 8.368996620178223, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.862794041633606, "num_tokens": 676100693.0, "step": 17717 }, { "epoch": 2.253911716066658, "ewc_loss": 0.06950153410434723, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033856998197734356, "grad_norm": 8.06202507019043, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8797721862792969, "num_tokens": 676137526.0, "step": 17718 }, { "epoch": 2.254038926345249, "ewc_loss": 0.07035897672176361, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003471444360911846, "grad_norm": 8.310705184936523, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8681163787841797, "num_tokens": 676177245.0, "step": 17719 }, { "epoch": 2.254166136623839, "ewc_loss": 0.06942623853683472, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003378170949872583, "grad_norm": 8.106943130493164, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.872250497341156, "num_tokens": 676214568.0, "step": 17720 }, { "epoch": 2.2542933469024296, "ewc_loss": 0.07009232044219971, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003444779140409082, "grad_norm": 8.250614166259766, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.866197407245636, "num_tokens": 676252552.0, "step": 17721 }, { "epoch": 2.25442055718102, "ewc_loss": 0.06956541538238525, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033920881105586886, "grad_norm": 8.15678596496582, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8468320965766907, "num_tokens": 676290480.0, "step": 17722 }, { "epoch": 2.2545477674596106, "ewc_loss": 0.06980015337467194, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034155623870901763, "grad_norm": 8.288301467895508, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8768342137336731, "num_tokens": 676324950.0, "step": 17723 }, { "epoch": 2.254674977738201, "ewc_loss": 0.06958712637424469, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000339425983838737, "grad_norm": 8.151966094970703, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8579568862915039, "num_tokens": 676363851.0, "step": 17724 }, { "epoch": 2.2548021880167917, "ewc_loss": 0.06970047950744629, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003405595198273659, "grad_norm": 8.225293159484863, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8745747804641724, "num_tokens": 676399920.0, "step": 17725 }, { "epoch": 2.254929398295382, "ewc_loss": 0.06948287039995193, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033838339732028544, "grad_norm": 8.073400497436523, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8629293441772461, "num_tokens": 676443243.0, "step": 17726 }, { "epoch": 2.2550566085739727, "ewc_loss": 0.06984603404998779, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003420150314923376, "grad_norm": 8.2987699508667, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8520209789276123, "num_tokens": 676480272.0, "step": 17727 }, { "epoch": 2.2551838188525632, "ewc_loss": 0.06944119930267334, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000337966630468145, "grad_norm": 8.098042488098145, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.861321210861206, "num_tokens": 676516630.0, "step": 17728 }, { "epoch": 2.2553110291311538, "ewc_loss": 0.0699751079082489, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034330575726926327, "grad_norm": 8.255165100097656, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8698372840881348, "num_tokens": 676557842.0, "step": 17729 }, { "epoch": 2.2554382394097443, "ewc_loss": 0.06944853067398071, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003380399430170655, "grad_norm": 8.079717636108398, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8749818801879883, "num_tokens": 676599413.0, "step": 17730 }, { "epoch": 2.255565449688335, "ewc_loss": 0.06996437162160873, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003431983932387084, "grad_norm": 8.265114784240723, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8562974333763123, "num_tokens": 676637791.0, "step": 17731 }, { "epoch": 2.2556926599669254, "ewc_loss": 0.06940455734729767, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033760021324269474, "grad_norm": 8.092594146728516, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8608080148696899, "num_tokens": 676677108.0, "step": 17732 }, { "epoch": 2.255819870245516, "ewc_loss": 0.07005737721920013, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034412843524478376, "grad_norm": 8.241789817810059, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.868765115737915, "num_tokens": 676720698.0, "step": 17733 }, { "epoch": 2.2559470805241064, "ewc_loss": 0.06945348531007767, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003380895359441638, "grad_norm": 8.08703899383545, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8530285954475403, "num_tokens": 676763719.0, "step": 17734 }, { "epoch": 2.256074290802697, "ewc_loss": 0.06989502906799316, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034250496537424624, "grad_norm": 8.27668285369873, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8664569854736328, "num_tokens": 676801083.0, "step": 17735 }, { "epoch": 2.2562015010812875, "ewc_loss": 0.06958671659231186, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033942185109481215, "grad_norm": 8.13022518157959, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8579242825508118, "num_tokens": 676840861.0, "step": 17736 }, { "epoch": 2.256328711359878, "ewc_loss": 0.06999081373214722, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003434628597460687, "grad_norm": 8.280349731445312, "learning_rate": 1e-06, "loss": 0.4227, "mean_token_accuracy": 0.8784627318382263, "num_tokens": 676876864.0, "step": 17737 }, { "epoch": 2.2564559216384685, "ewc_loss": 0.06948871910572052, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033844183781184256, "grad_norm": 8.075347900390625, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8848682641983032, "num_tokens": 676920983.0, "step": 17738 }, { "epoch": 2.256583131917059, "ewc_loss": 0.06994070112705231, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034296169178560376, "grad_norm": 8.481264114379883, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8750945329666138, "num_tokens": 676961294.0, "step": 17739 }, { "epoch": 2.2567103421956496, "ewc_loss": 0.0691482201218605, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003350368933752179, "grad_norm": 8.083277702331543, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8580745458602905, "num_tokens": 676996318.0, "step": 17740 }, { "epoch": 2.2568375524742397, "ewc_loss": 0.07016854733228683, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003452401433605701, "grad_norm": 8.334870338439941, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8667148351669312, "num_tokens": 677034083.0, "step": 17741 }, { "epoch": 2.2569647627528306, "ewc_loss": 0.06916894018650055, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033524411264806986, "grad_norm": 8.092843055725098, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8667466044425964, "num_tokens": 677071206.0, "step": 17742 }, { "epoch": 2.2570919730314207, "ewc_loss": 0.06994006782770157, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003429553471505642, "grad_norm": 8.333016395568848, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8697166442871094, "num_tokens": 677103573.0, "step": 17743 }, { "epoch": 2.2572191833100113, "ewc_loss": 0.06926706433296204, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003362253191880882, "grad_norm": 8.127182960510254, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8745050430297852, "num_tokens": 677141406.0, "step": 17744 }, { "epoch": 2.257346393588602, "ewc_loss": 0.0697924941778183, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003414796374272555, "grad_norm": 8.202022552490234, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8554383516311646, "num_tokens": 677185968.0, "step": 17745 }, { "epoch": 2.2574736038671923, "ewc_loss": 0.06936943531036377, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003372489882167429, "grad_norm": 8.077925682067871, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8571444153785706, "num_tokens": 677229004.0, "step": 17746 }, { "epoch": 2.257600814145783, "ewc_loss": 0.06982383131980896, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003417930274736136, "grad_norm": 8.262025833129883, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8762664198875427, "num_tokens": 677266317.0, "step": 17747 }, { "epoch": 2.2577280244243734, "ewc_loss": 0.06933537125587463, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033690844429656863, "grad_norm": 8.130757331848145, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8745089769363403, "num_tokens": 677302344.0, "step": 17748 }, { "epoch": 2.257855234702964, "ewc_loss": 0.06983029842376709, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034185772528871894, "grad_norm": 8.222732543945312, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8570804595947266, "num_tokens": 677342103.0, "step": 17749 }, { "epoch": 2.2579824449815544, "ewc_loss": 0.06940002739429474, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003375549567863345, "grad_norm": 8.151158332824707, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.852527379989624, "num_tokens": 677378602.0, "step": 17750 }, { "epoch": 2.258109655260145, "ewc_loss": 0.0697895735502243, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034145047538913786, "grad_norm": 8.29703140258789, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8645743131637573, "num_tokens": 677415005.0, "step": 17751 }, { "epoch": 2.2582368655387355, "ewc_loss": 0.06946607679128647, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033821543911471963, "grad_norm": 8.108014106750488, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8467282056808472, "num_tokens": 677457908.0, "step": 17752 }, { "epoch": 2.258364075817326, "ewc_loss": 0.06972566246986389, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034081132616847754, "grad_norm": 8.180028915405273, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8551430702209473, "num_tokens": 677492759.0, "step": 17753 }, { "epoch": 2.2584912860959165, "ewc_loss": 0.06958543509244919, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003394090454094112, "grad_norm": 8.133244514465332, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.862969696521759, "num_tokens": 677535771.0, "step": 17754 }, { "epoch": 2.258618496374507, "ewc_loss": 0.06963811814785004, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000339935882948339, "grad_norm": 8.114322662353516, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8785492181777954, "num_tokens": 677575325.0, "step": 17755 }, { "epoch": 2.2587457066530976, "ewc_loss": 0.069704070687294, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003405953466426581, "grad_norm": 8.176010131835938, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8664432764053345, "num_tokens": 677614132.0, "step": 17756 }, { "epoch": 2.258872916931688, "ewc_loss": 0.06958608329296112, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033941553556360304, "grad_norm": 8.221954345703125, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8593978881835938, "num_tokens": 677645657.0, "step": 17757 }, { "epoch": 2.2590001272102787, "ewc_loss": 0.06952756643295288, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003388303448446095, "grad_norm": 8.116262435913086, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8736512064933777, "num_tokens": 677686238.0, "step": 17758 }, { "epoch": 2.259127337488869, "ewc_loss": 0.06970742344856262, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034062890335917473, "grad_norm": 8.174269676208496, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8695403337478638, "num_tokens": 677724080.0, "step": 17759 }, { "epoch": 2.2592545477674597, "ewc_loss": 0.06949660181999207, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003385206509847194, "grad_norm": 8.11280632019043, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8652133941650391, "num_tokens": 677765768.0, "step": 17760 }, { "epoch": 2.2593817580460502, "ewc_loss": 0.06965374201536179, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034009211231023073, "grad_norm": 8.148322105407715, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8663288354873657, "num_tokens": 677803937.0, "step": 17761 }, { "epoch": 2.2595089683246408, "ewc_loss": 0.06946499645709991, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033820467069745064, "grad_norm": 8.141082763671875, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8767685890197754, "num_tokens": 677839685.0, "step": 17762 }, { "epoch": 2.2596361786032313, "ewc_loss": 0.06954961270093918, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033905080636031926, "grad_norm": 8.126791954040527, "learning_rate": 1e-06, "loss": 0.5231, "mean_token_accuracy": 0.8513041734695435, "num_tokens": 677880411.0, "step": 17763 }, { "epoch": 2.259763388881822, "ewc_loss": 0.0696619302034378, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034017395228147507, "grad_norm": 8.138655662536621, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8743841648101807, "num_tokens": 677921177.0, "step": 17764 }, { "epoch": 2.2598905991604123, "ewc_loss": 0.06958886235952377, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033944330061785877, "grad_norm": 8.095877647399902, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.874556303024292, "num_tokens": 677956393.0, "step": 17765 }, { "epoch": 2.2600178094390024, "ewc_loss": 0.0697912946343422, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034146764664910734, "grad_norm": 8.214717864990234, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8644294142723083, "num_tokens": 677988364.0, "step": 17766 }, { "epoch": 2.2601450197175934, "ewc_loss": 0.0694776177406311, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003383308940101415, "grad_norm": 8.064105987548828, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8731293082237244, "num_tokens": 678028093.0, "step": 17767 }, { "epoch": 2.2602722299961835, "ewc_loss": 0.0699077844619751, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034263249835930765, "grad_norm": 8.159002304077148, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8669874668121338, "num_tokens": 678068174.0, "step": 17768 }, { "epoch": 2.260399440274774, "ewc_loss": 0.06952695548534393, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033882420393638313, "grad_norm": 8.11495590209961, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8680640459060669, "num_tokens": 678105784.0, "step": 17769 }, { "epoch": 2.2605266505533645, "ewc_loss": 0.06974974274635315, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034105213126167655, "grad_norm": 8.143540382385254, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8766433000564575, "num_tokens": 678142873.0, "step": 17770 }, { "epoch": 2.260653860831955, "ewc_loss": 0.06956471502780914, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033920182613655925, "grad_norm": 8.166099548339844, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.863029956817627, "num_tokens": 678181527.0, "step": 17771 }, { "epoch": 2.2607810711105456, "ewc_loss": 0.06969820708036423, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003405367606319487, "grad_norm": 8.137933731079102, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8672336339950562, "num_tokens": 678221225.0, "step": 17772 }, { "epoch": 2.260908281389136, "ewc_loss": 0.06965893507003784, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034014409175142646, "grad_norm": 8.131610870361328, "learning_rate": 1e-06, "loss": 0.4241, "mean_token_accuracy": 0.8738628625869751, "num_tokens": 678254767.0, "step": 17773 }, { "epoch": 2.2610354916677267, "ewc_loss": 0.06959933042526245, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033954798709601164, "grad_norm": 8.148188591003418, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8674555420875549, "num_tokens": 678290830.0, "step": 17774 }, { "epoch": 2.261162701946317, "ewc_loss": 0.06962482631206512, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003398029657546431, "grad_norm": 8.147505760192871, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8753970265388489, "num_tokens": 678327102.0, "step": 17775 }, { "epoch": 2.2612899122249077, "ewc_loss": 0.06968935579061508, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003404482267796993, "grad_norm": 8.149563789367676, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8790885210037231, "num_tokens": 678364872.0, "step": 17776 }, { "epoch": 2.2614171225034982, "ewc_loss": 0.06946849822998047, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003382396243978292, "grad_norm": 8.111759185791016, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8589465618133545, "num_tokens": 678399233.0, "step": 17777 }, { "epoch": 2.2615443327820888, "ewc_loss": 0.06971635669469833, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003407182521186769, "grad_norm": 8.20205307006836, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8742897510528564, "num_tokens": 678434577.0, "step": 17778 }, { "epoch": 2.2616715430606793, "ewc_loss": 0.06952238082885742, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033877845271490514, "grad_norm": 8.137645721435547, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8718405961990356, "num_tokens": 678466835.0, "step": 17779 }, { "epoch": 2.26179875333927, "ewc_loss": 0.06954769790172577, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003390316851437092, "grad_norm": 8.087069511413574, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8810365200042725, "num_tokens": 678506817.0, "step": 17780 }, { "epoch": 2.2619259636178604, "ewc_loss": 0.06964897364377975, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034004441113211215, "grad_norm": 8.1254243850708, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8669904470443726, "num_tokens": 678543734.0, "step": 17781 }, { "epoch": 2.262053173896451, "ewc_loss": 0.06955781579017639, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033913287916220725, "grad_norm": 8.125171661376953, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8614311218261719, "num_tokens": 678585066.0, "step": 17782 }, { "epoch": 2.2621803841750414, "ewc_loss": 0.0695643201470375, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003391978971194476, "grad_norm": 8.160989761352539, "learning_rate": 1e-06, "loss": 0.4043, "mean_token_accuracy": 0.8804620504379272, "num_tokens": 678614951.0, "step": 17783 }, { "epoch": 2.262307594453632, "ewc_loss": 0.06956224143505096, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033917714608833194, "grad_norm": 8.102570533752441, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8689353466033936, "num_tokens": 678649671.0, "step": 17784 }, { "epoch": 2.2624348047322225, "ewc_loss": 0.0697290450334549, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034084508661180735, "grad_norm": 8.123701095581055, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8831239938735962, "num_tokens": 678693418.0, "step": 17785 }, { "epoch": 2.262562015010813, "ewc_loss": 0.06958523392677307, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033940697903744876, "grad_norm": 8.13975715637207, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8765126466751099, "num_tokens": 678730891.0, "step": 17786 }, { "epoch": 2.2626892252894035, "ewc_loss": 0.06962434947490692, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003397981636226177, "grad_norm": 8.14949893951416, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8745145797729492, "num_tokens": 678766369.0, "step": 17787 }, { "epoch": 2.262816435567994, "ewc_loss": 0.06955289840698242, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033908369368873537, "grad_norm": 8.104158401489258, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.856067955493927, "num_tokens": 678809118.0, "step": 17788 }, { "epoch": 2.2629436458465846, "ewc_loss": 0.06970459967851639, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003406006726436317, "grad_norm": 8.159205436706543, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8648998141288757, "num_tokens": 678850057.0, "step": 17789 }, { "epoch": 2.263070856125175, "ewc_loss": 0.06954096257686615, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003389643388800323, "grad_norm": 8.147610664367676, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8658222556114197, "num_tokens": 678890253.0, "step": 17790 }, { "epoch": 2.263198066403765, "ewc_loss": 0.0696474239230156, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034002892789430916, "grad_norm": 8.173016548156738, "learning_rate": 1e-06, "loss": 0.412, "mean_token_accuracy": 0.8800033330917358, "num_tokens": 678922535.0, "step": 17791 }, { "epoch": 2.263325276682356, "ewc_loss": 0.06959182024002075, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033947284100577235, "grad_norm": 8.181004524230957, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8650398254394531, "num_tokens": 678956136.0, "step": 17792 }, { "epoch": 2.2634524869609463, "ewc_loss": 0.06957897543907166, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003393444640096277, "grad_norm": 9.013653755187988, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8686120510101318, "num_tokens": 678990864.0, "step": 17793 }, { "epoch": 2.263579697239537, "ewc_loss": 0.06886353343725204, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033219001488760114, "grad_norm": 7.887258529663086, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8781822919845581, "num_tokens": 679028345.0, "step": 17794 }, { "epoch": 2.2637069075181273, "ewc_loss": 0.07063818722963333, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003499365411698818, "grad_norm": 8.384268760681152, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8531184196472168, "num_tokens": 679073679.0, "step": 17795 }, { "epoch": 2.263834117796718, "ewc_loss": 0.0687950849533081, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033150558010675013, "grad_norm": 7.9241862297058105, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8652300834655762, "num_tokens": 679109274.0, "step": 17796 }, { "epoch": 2.2639613280753084, "ewc_loss": 0.07061140239238739, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034966872772201896, "grad_norm": 8.380227088928223, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8674871921539307, "num_tokens": 679141644.0, "step": 17797 }, { "epoch": 2.264088538353899, "ewc_loss": 0.06911216676235199, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033467632601968944, "grad_norm": 7.9748992919921875, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8508206605911255, "num_tokens": 679184351.0, "step": 17798 }, { "epoch": 2.2642157486324894, "ewc_loss": 0.07037615776062012, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034731629421003163, "grad_norm": 8.337529182434082, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8636483550071716, "num_tokens": 679222509.0, "step": 17799 }, { "epoch": 2.26434295891108, "ewc_loss": 0.069452203810215, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033807673025876284, "grad_norm": 8.06460952758789, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8502360582351685, "num_tokens": 679263362.0, "step": 17800 }, { "epoch": 2.2644701691896705, "ewc_loss": 0.07009458541870117, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000344500585924834, "grad_norm": 8.241593360900879, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8532415628433228, "num_tokens": 679307859.0, "step": 17801 }, { "epoch": 2.264597379468261, "ewc_loss": 0.0696268230676651, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003398228727746755, "grad_norm": 8.141379356384277, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8756936192512512, "num_tokens": 679344801.0, "step": 17802 }, { "epoch": 2.2647245897468515, "ewc_loss": 0.06987936794757843, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003423483285587281, "grad_norm": 8.21159553527832, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8585011959075928, "num_tokens": 679382795.0, "step": 17803 }, { "epoch": 2.264851800025442, "ewc_loss": 0.06965799629688263, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034013460390269756, "grad_norm": 8.134521484375, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8638089299201965, "num_tokens": 679425282.0, "step": 17804 }, { "epoch": 2.2649790103040326, "ewc_loss": 0.06972567737102509, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003408114134799689, "grad_norm": 8.18950080871582, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8544390797615051, "num_tokens": 679466881.0, "step": 17805 }, { "epoch": 2.265106220582623, "ewc_loss": 0.06966300308704376, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003401847498025745, "grad_norm": 8.102832794189453, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8787283301353455, "num_tokens": 679509268.0, "step": 17806 }, { "epoch": 2.2652334308612136, "ewc_loss": 0.06974548101425171, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000341009465046227, "grad_norm": 8.177997589111328, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8638230562210083, "num_tokens": 679545082.0, "step": 17807 }, { "epoch": 2.265360641139804, "ewc_loss": 0.06958959996700287, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033945063478313386, "grad_norm": 8.1628999710083, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8672072291374207, "num_tokens": 679581795.0, "step": 17808 }, { "epoch": 2.2654878514183947, "ewc_loss": 0.06956516951322556, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033920639543794096, "grad_norm": 8.067733764648438, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8635808825492859, "num_tokens": 679618978.0, "step": 17809 }, { "epoch": 2.2656150616969852, "ewc_loss": 0.06971365213394165, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003406911564525217, "grad_norm": 8.160894393920898, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8713378310203552, "num_tokens": 679655002.0, "step": 17810 }, { "epoch": 2.2657422719755758, "ewc_loss": 0.06960636377334595, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033961833105422556, "grad_norm": 8.082701683044434, "learning_rate": 1e-06, "loss": 0.4022, "mean_token_accuracy": 0.8814502954483032, "num_tokens": 679691143.0, "step": 17811 }, { "epoch": 2.2658694822541663, "ewc_loss": 0.06977108120918274, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003412655496504158, "grad_norm": 8.119804382324219, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8687082529067993, "num_tokens": 679729168.0, "step": 17812 }, { "epoch": 2.265996692532757, "ewc_loss": 0.06949298083782196, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033848450402729213, "grad_norm": 8.067190170288086, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8665212392807007, "num_tokens": 679773385.0, "step": 17813 }, { "epoch": 2.2661239028113473, "ewc_loss": 0.06984160840511322, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003419708227738738, "grad_norm": 8.15963363647461, "learning_rate": 1e-06, "loss": 0.4166, "mean_token_accuracy": 0.8775111436843872, "num_tokens": 679813884.0, "step": 17814 }, { "epoch": 2.266251113089938, "ewc_loss": 0.06959065794944763, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033946128678508103, "grad_norm": 8.12697982788086, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.867511510848999, "num_tokens": 679850834.0, "step": 17815 }, { "epoch": 2.266378323368528, "ewc_loss": 0.06984270364046097, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003419817076064646, "grad_norm": 8.165386199951172, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8717660903930664, "num_tokens": 679890210.0, "step": 17816 }, { "epoch": 2.266505533647119, "ewc_loss": 0.06964659690856934, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034002066240645945, "grad_norm": 8.173080444335938, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8758811354637146, "num_tokens": 679926252.0, "step": 17817 }, { "epoch": 2.266632743925709, "ewc_loss": 0.06959644705057144, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033951917430385947, "grad_norm": 8.11790943145752, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8642435073852539, "num_tokens": 679959626.0, "step": 17818 }, { "epoch": 2.2667599542042995, "ewc_loss": 0.06980490684509277, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034160379436798394, "grad_norm": 8.131282806396484, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8622197508811951, "num_tokens": 679999973.0, "step": 17819 }, { "epoch": 2.26688716448289, "ewc_loss": 0.06965391337871552, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003400938294362277, "grad_norm": 8.125165939331055, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8610235452651978, "num_tokens": 680041081.0, "step": 17820 }, { "epoch": 2.2670143747614806, "ewc_loss": 0.06975001841783524, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003410548670217395, "grad_norm": 8.150691986083984, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8704034686088562, "num_tokens": 680078953.0, "step": 17821 }, { "epoch": 2.267141585040071, "ewc_loss": 0.0695842057466507, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033939676359295845, "grad_norm": 8.108230590820312, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8744440078735352, "num_tokens": 680112309.0, "step": 17822 }, { "epoch": 2.2672687953186617, "ewc_loss": 0.06977378576993942, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003412925580050796, "grad_norm": 8.18065357208252, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8618509769439697, "num_tokens": 680148607.0, "step": 17823 }, { "epoch": 2.267396005597252, "ewc_loss": 0.06969954073429108, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003405501483939588, "grad_norm": 8.075815200805664, "learning_rate": 1e-06, "loss": 0.4032, "mean_token_accuracy": 0.8845617771148682, "num_tokens": 680189617.0, "step": 17824 }, { "epoch": 2.2675232158758427, "ewc_loss": 0.06984199583530426, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003419746062718332, "grad_norm": 8.24018383026123, "learning_rate": 1e-06, "loss": 0.4029, "mean_token_accuracy": 0.8837643265724182, "num_tokens": 680227940.0, "step": 17825 }, { "epoch": 2.2676504261544332, "ewc_loss": 0.06956011056900024, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003391558420844376, "grad_norm": 8.109989166259766, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8799473643302917, "num_tokens": 680263473.0, "step": 17826 }, { "epoch": 2.2677776364330238, "ewc_loss": 0.06977187097072601, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034127337858080864, "grad_norm": 8.23713207244873, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8585707545280457, "num_tokens": 680297895.0, "step": 17827 }, { "epoch": 2.2679048467116143, "ewc_loss": 0.06945283710956573, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003380830166861415, "grad_norm": 8.922253608703613, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8682606220245361, "num_tokens": 680337658.0, "step": 17828 }, { "epoch": 2.268032056990205, "ewc_loss": 0.0689137876033783, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033269260893575847, "grad_norm": 7.9488420486450195, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8624891042709351, "num_tokens": 680375912.0, "step": 17829 }, { "epoch": 2.2681592672687954, "ewc_loss": 0.07033215463161469, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034687621518969536, "grad_norm": 8.346444129943848, "learning_rate": 1e-06, "loss": 0.427, "mean_token_accuracy": 0.8777207136154175, "num_tokens": 680415231.0, "step": 17830 }, { "epoch": 2.268286477547386, "ewc_loss": 0.06878729164600372, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000331427640048787, "grad_norm": 7.90847110748291, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8625717163085938, "num_tokens": 680460037.0, "step": 17831 }, { "epoch": 2.2684136878259764, "ewc_loss": 0.07064437121152878, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034999841591343284, "grad_norm": 8.361653327941895, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8445230722427368, "num_tokens": 680505521.0, "step": 17832 }, { "epoch": 2.268540898104567, "ewc_loss": 0.06912046670913696, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003347593010403216, "grad_norm": 7.9992265701293945, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8809289932250977, "num_tokens": 680547682.0, "step": 17833 }, { "epoch": 2.2686681083831575, "ewc_loss": 0.07042054831981659, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003477602149359882, "grad_norm": 8.34821605682373, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8719705939292908, "num_tokens": 680587510.0, "step": 17834 }, { "epoch": 2.268795318661748, "ewc_loss": 0.0693562775850296, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033711743890307844, "grad_norm": 8.084830284118652, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8694602847099304, "num_tokens": 680624233.0, "step": 17835 }, { "epoch": 2.2689225289403385, "ewc_loss": 0.07011319696903229, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034468661760911345, "grad_norm": 8.22529125213623, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8622825145721436, "num_tokens": 680666091.0, "step": 17836 }, { "epoch": 2.269049739218929, "ewc_loss": 0.06953781843185425, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033893290674313903, "grad_norm": 8.087101936340332, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8748224973678589, "num_tokens": 680703062.0, "step": 17837 }, { "epoch": 2.2691769494975196, "ewc_loss": 0.0700259655714035, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003438142884988338, "grad_norm": 8.213765144348145, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8796157240867615, "num_tokens": 680735966.0, "step": 17838 }, { "epoch": 2.2693041597761097, "ewc_loss": 0.06964747607707977, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034002942265942693, "grad_norm": 8.095398902893066, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8678911924362183, "num_tokens": 680780010.0, "step": 17839 }, { "epoch": 2.2694313700547006, "ewc_loss": 0.06988126784563065, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003423673624638468, "grad_norm": 8.18721866607666, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.866121232509613, "num_tokens": 680819516.0, "step": 17840 }, { "epoch": 2.2695585803332907, "ewc_loss": 0.06973831355571747, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034093784051947296, "grad_norm": 8.150278091430664, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8651480674743652, "num_tokens": 680861374.0, "step": 17841 }, { "epoch": 2.2696857906118812, "ewc_loss": 0.0698002353310585, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003415570536162704, "grad_norm": 8.149981498718262, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.858791708946228, "num_tokens": 680902447.0, "step": 17842 }, { "epoch": 2.2698130008904718, "ewc_loss": 0.0697379857301712, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034093455178663135, "grad_norm": 8.097362518310547, "learning_rate": 1e-06, "loss": 0.3984, "mean_token_accuracy": 0.8843443989753723, "num_tokens": 680942376.0, "step": 17843 }, { "epoch": 2.2699402111690623, "ewc_loss": 0.06986874341964722, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034224215778522193, "grad_norm": 8.147199630737305, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8678387403488159, "num_tokens": 680977060.0, "step": 17844 }, { "epoch": 2.270067421447653, "ewc_loss": 0.06990829110145569, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003426375915296376, "grad_norm": 8.197330474853516, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8502350449562073, "num_tokens": 681014504.0, "step": 17845 }, { "epoch": 2.2701946317262434, "ewc_loss": 0.06979865580797195, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003415412502363324, "grad_norm": 8.175992965698242, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8506472706794739, "num_tokens": 681053919.0, "step": 17846 }, { "epoch": 2.270321842004834, "ewc_loss": 0.069891557097435, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003424702154006809, "grad_norm": 8.148468971252441, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8673126101493835, "num_tokens": 681090883.0, "step": 17847 }, { "epoch": 2.2704490522834244, "ewc_loss": 0.06983836740255356, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034193837200291455, "grad_norm": 8.167013168334961, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8817527294158936, "num_tokens": 681128819.0, "step": 17848 }, { "epoch": 2.270576262562015, "ewc_loss": 0.06983321905136108, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034188682911917567, "grad_norm": 8.157764434814453, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8561872243881226, "num_tokens": 681171434.0, "step": 17849 }, { "epoch": 2.2707034728406055, "ewc_loss": 0.06947380304336548, "ewc_loss_diag": 3.504753112792969e-05, "ewc_loss_parallel": 0.0003431755176279694, "grad_norm": 8.20197868347168, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8767078518867493, "num_tokens": 681206297.0, "step": 17850 }, { "epoch": 2.270830683119196, "ewc_loss": 0.06977640092372894, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034131872234866023, "grad_norm": 8.178308486938477, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8575636744499207, "num_tokens": 681242521.0, "step": 17851 }, { "epoch": 2.2709578933977865, "ewc_loss": 0.069811150431633, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034166622208431363, "grad_norm": 8.152294158935547, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8770841956138611, "num_tokens": 681282474.0, "step": 17852 }, { "epoch": 2.271085103676377, "ewc_loss": 0.069983571767807, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000343390362104401, "grad_norm": 8.20898151397705, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8632596731185913, "num_tokens": 681317280.0, "step": 17853 }, { "epoch": 2.2712123139549676, "ewc_loss": 0.06966139376163483, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003401686262805015, "grad_norm": 8.15284538269043, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8644986152648926, "num_tokens": 681349155.0, "step": 17854 }, { "epoch": 2.271339524233558, "ewc_loss": 0.06980657577514648, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034162041265517473, "grad_norm": 8.197797775268555, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8661196231842041, "num_tokens": 681380863.0, "step": 17855 }, { "epoch": 2.2714667345121486, "ewc_loss": 0.06981194019317627, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003416741092223674, "grad_norm": 8.137828826904297, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8705408573150635, "num_tokens": 681421970.0, "step": 17856 }, { "epoch": 2.271593944790739, "ewc_loss": 0.0698380172252655, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003419349086470902, "grad_norm": 8.140382766723633, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8808126449584961, "num_tokens": 681453511.0, "step": 17857 }, { "epoch": 2.2717211550693297, "ewc_loss": 0.0697617456316948, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034117212635464966, "grad_norm": 8.090412139892578, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8544487953186035, "num_tokens": 681496293.0, "step": 17858 }, { "epoch": 2.2718483653479202, "ewc_loss": 0.0698891133069992, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003424458554945886, "grad_norm": 8.187460899353027, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8593431711196899, "num_tokens": 681537074.0, "step": 17859 }, { "epoch": 2.2719755756265108, "ewc_loss": 0.069871686398983, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034227155265398324, "grad_norm": 8.142667770385742, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.860747218132019, "num_tokens": 681578911.0, "step": 17860 }, { "epoch": 2.2721027859051013, "ewc_loss": 0.0698404312133789, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034195894841104746, "grad_norm": 8.136131286621094, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8649986982345581, "num_tokens": 681622134.0, "step": 17861 }, { "epoch": 2.272229996183692, "ewc_loss": 0.06982384622097015, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003417931729927659, "grad_norm": 8.15881633758545, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.868133008480072, "num_tokens": 681655919.0, "step": 17862 }, { "epoch": 2.2723572064622823, "ewc_loss": 0.06985816359519958, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003421363653615117, "grad_norm": 8.182341575622559, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8581717014312744, "num_tokens": 681696539.0, "step": 17863 }, { "epoch": 2.2724844167408724, "ewc_loss": 0.06981700658798218, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034172472078353167, "grad_norm": 8.143004417419434, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8644351959228516, "num_tokens": 681734630.0, "step": 17864 }, { "epoch": 2.2726116270194634, "ewc_loss": 0.06994462758302689, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034300098195672035, "grad_norm": 8.12857437133789, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8551151752471924, "num_tokens": 681779669.0, "step": 17865 }, { "epoch": 2.2727388372980535, "ewc_loss": 0.06992071866989136, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034276192309334874, "grad_norm": 8.12844467163086, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8644204139709473, "num_tokens": 681821920.0, "step": 17866 }, { "epoch": 2.272866047576644, "ewc_loss": 0.06997653841972351, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034332001814618707, "grad_norm": 8.167366981506348, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8641633987426758, "num_tokens": 681859239.0, "step": 17867 }, { "epoch": 2.2729932578552345, "ewc_loss": 0.06988964974880219, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003424512397032231, "grad_norm": 8.136492729187012, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8532382845878601, "num_tokens": 681896569.0, "step": 17868 }, { "epoch": 2.273120468133825, "ewc_loss": 0.0700652152299881, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034420687006786466, "grad_norm": 8.21843433380127, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8811540603637695, "num_tokens": 681932549.0, "step": 17869 }, { "epoch": 2.2732476784124156, "ewc_loss": 0.06977704167366028, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034132509608753026, "grad_norm": 8.116109848022461, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8629139065742493, "num_tokens": 681973981.0, "step": 17870 }, { "epoch": 2.273374888691006, "ewc_loss": 0.06998606026172638, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003434153040871024, "grad_norm": 8.163420677185059, "learning_rate": 1e-06, "loss": 0.5484, "mean_token_accuracy": 0.8428951501846313, "num_tokens": 682009512.0, "step": 17871 }, { "epoch": 2.2735020989695967, "ewc_loss": 0.06985561549663544, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034211084130220115, "grad_norm": 8.190196990966797, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8628604412078857, "num_tokens": 682048058.0, "step": 17872 }, { "epoch": 2.273629309248187, "ewc_loss": 0.06980657577514648, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034162041265517473, "grad_norm": 8.157172203063965, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8592698574066162, "num_tokens": 682084181.0, "step": 17873 }, { "epoch": 2.2737565195267777, "ewc_loss": 0.06996671110391617, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003432217927183956, "grad_norm": 8.15198040008545, "learning_rate": 1e-06, "loss": 0.3981, "mean_token_accuracy": 0.8868675231933594, "num_tokens": 682119613.0, "step": 17874 }, { "epoch": 2.2738837298053682, "ewc_loss": 0.06971865892410278, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003407412732485682, "grad_norm": 8.108739852905273, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8693205118179321, "num_tokens": 682159501.0, "step": 17875 }, { "epoch": 2.2740109400839588, "ewc_loss": 0.06987661123275757, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034232085454277694, "grad_norm": 8.102906227111816, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8750274181365967, "num_tokens": 682197323.0, "step": 17876 }, { "epoch": 2.2741381503625493, "ewc_loss": 0.06982891261577606, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034184378455393016, "grad_norm": 8.184293746948242, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8676947355270386, "num_tokens": 682231150.0, "step": 17877 }, { "epoch": 2.27426536064114, "ewc_loss": 0.06980684399604797, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034162314841523767, "grad_norm": 8.14208698272705, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8677840232849121, "num_tokens": 682269351.0, "step": 17878 }, { "epoch": 2.2743925709197303, "ewc_loss": 0.0698729157447815, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003422838053666055, "grad_norm": 8.171314239501953, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8763595819473267, "num_tokens": 682303633.0, "step": 17879 }, { "epoch": 2.274519781198321, "ewc_loss": 0.06982249021530151, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003417795815039426, "grad_norm": 8.14153003692627, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8651108145713806, "num_tokens": 682340890.0, "step": 17880 }, { "epoch": 2.2746469914769114, "ewc_loss": 0.0697818249464035, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003413729427848011, "grad_norm": 8.13106632232666, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8815015554428101, "num_tokens": 682374253.0, "step": 17881 }, { "epoch": 2.274774201755502, "ewc_loss": 0.06973770260810852, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034093172871507704, "grad_norm": 8.09683609008789, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8674231767654419, "num_tokens": 682412675.0, "step": 17882 }, { "epoch": 2.2749014120340925, "ewc_loss": 0.0699276551604271, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003428312484174967, "grad_norm": 8.138333320617676, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8776116371154785, "num_tokens": 682447864.0, "step": 17883 }, { "epoch": 2.275028622312683, "ewc_loss": 0.06984144449234009, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034196916385553777, "grad_norm": 8.122230529785156, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.868201494216919, "num_tokens": 682486445.0, "step": 17884 }, { "epoch": 2.2751558325912735, "ewc_loss": 0.06992622464895248, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034281692933291197, "grad_norm": 8.169655799865723, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8658351898193359, "num_tokens": 682518173.0, "step": 17885 }, { "epoch": 2.275283042869864, "ewc_loss": 0.0698174238204956, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000341728882631287, "grad_norm": 8.097319602966309, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8664551973342896, "num_tokens": 682558627.0, "step": 17886 }, { "epoch": 2.2754102531484546, "ewc_loss": 0.06996579468250275, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003432126250118017, "grad_norm": 8.151268005371094, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8723950386047363, "num_tokens": 682600821.0, "step": 17887 }, { "epoch": 2.275537463427045, "ewc_loss": 0.06982719153165817, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003418266132939607, "grad_norm": 8.114187240600586, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.855431079864502, "num_tokens": 682643947.0, "step": 17888 }, { "epoch": 2.275664673705635, "ewc_loss": 0.07001898437738419, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000343744526617229, "grad_norm": 8.174731254577637, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8619120717048645, "num_tokens": 682683045.0, "step": 17889 }, { "epoch": 2.275791883984226, "ewc_loss": 0.06971141695976257, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003406688629183918, "grad_norm": 8.141398429870605, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8664995431900024, "num_tokens": 682721617.0, "step": 17890 }, { "epoch": 2.2759190942628162, "ewc_loss": 0.06985379755496979, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003420927096158266, "grad_norm": 8.169334411621094, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8598206043243408, "num_tokens": 682758339.0, "step": 17891 }, { "epoch": 2.2760463045414068, "ewc_loss": 0.06974996626377106, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003410543722566217, "grad_norm": 8.1222562789917, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8719558715820312, "num_tokens": 682800501.0, "step": 17892 }, { "epoch": 2.2761735148199973, "ewc_loss": 0.06988316029310226, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003423862799536437, "grad_norm": 8.210770606994629, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8656609654426575, "num_tokens": 682833231.0, "step": 17893 }, { "epoch": 2.276300725098588, "ewc_loss": 0.06968457996845245, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034040052560158074, "grad_norm": 8.065505027770996, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8717193603515625, "num_tokens": 682873945.0, "step": 17894 }, { "epoch": 2.2764279353771784, "ewc_loss": 0.07004211097955704, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003439757856540382, "grad_norm": 8.220102310180664, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8651111721992493, "num_tokens": 682918329.0, "step": 17895 }, { "epoch": 2.276555145655769, "ewc_loss": 0.06967784464359283, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003403331502340734, "grad_norm": 8.149880409240723, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8738681077957153, "num_tokens": 682956277.0, "step": 17896 }, { "epoch": 2.2766823559343594, "ewc_loss": 0.06993655860424042, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003429202188272029, "grad_norm": 8.193635940551758, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8696226477622986, "num_tokens": 682990648.0, "step": 17897 }, { "epoch": 2.27680956621295, "ewc_loss": 0.06975725293159485, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034112719004042447, "grad_norm": 8.156668663024902, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8795090317726135, "num_tokens": 683029046.0, "step": 17898 }, { "epoch": 2.2769367764915405, "ewc_loss": 0.06992711126804352, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034282574779354036, "grad_norm": 8.223114967346191, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8666753768920898, "num_tokens": 683064541.0, "step": 17899 }, { "epoch": 2.277063986770131, "ewc_loss": 0.06970188021659851, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034057351876981556, "grad_norm": 8.164358139038086, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8660227060317993, "num_tokens": 683105015.0, "step": 17900 }, { "epoch": 2.2771911970487215, "ewc_loss": 0.06984800100326538, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003420347347855568, "grad_norm": 8.195357322692871, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8593319058418274, "num_tokens": 683142938.0, "step": 17901 }, { "epoch": 2.277318407327312, "ewc_loss": 0.06962757557630539, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033983043977059424, "grad_norm": 8.071579933166504, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8808166980743408, "num_tokens": 683181593.0, "step": 17902 }, { "epoch": 2.2774456176059026, "ewc_loss": 0.06997453421354294, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003433000238146633, "grad_norm": 8.28744125366211, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8549820184707642, "num_tokens": 683218641.0, "step": 17903 }, { "epoch": 2.277572827884493, "ewc_loss": 0.06955160200595856, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033907065517269075, "grad_norm": 8.088080406188965, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8498988151550293, "num_tokens": 683261479.0, "step": 17904 }, { "epoch": 2.2777000381630836, "ewc_loss": 0.07000888884067535, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003436435363255441, "grad_norm": 8.23245906829834, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8408666253089905, "num_tokens": 683300602.0, "step": 17905 }, { "epoch": 2.277827248441674, "ewc_loss": 0.06961236894130707, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033967840136028826, "grad_norm": 8.133922576904297, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8651859760284424, "num_tokens": 683337012.0, "step": 17906 }, { "epoch": 2.2779544587202647, "ewc_loss": 0.0699426457285881, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003429811622481793, "grad_norm": 8.231430053710938, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8743950128555298, "num_tokens": 683368949.0, "step": 17907 }, { "epoch": 2.2780816689988552, "ewc_loss": 0.06953632831573486, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003389180055819452, "grad_norm": 8.099099159240723, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8649054765701294, "num_tokens": 683408741.0, "step": 17908 }, { "epoch": 2.2782088792774458, "ewc_loss": 0.0699266642332077, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003428212949074805, "grad_norm": 8.23116683959961, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.856922447681427, "num_tokens": 683447843.0, "step": 17909 }, { "epoch": 2.2783360895560363, "ewc_loss": 0.06965942680835724, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034014901029877365, "grad_norm": 8.102100372314453, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8637540340423584, "num_tokens": 683489695.0, "step": 17910 }, { "epoch": 2.278463299834627, "ewc_loss": 0.0698748230934143, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003423029265832156, "grad_norm": 8.228572845458984, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8641102910041809, "num_tokens": 683528381.0, "step": 17911 }, { "epoch": 2.2785905101132173, "ewc_loss": 0.06971536576747894, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034070838592015207, "grad_norm": 8.116876602172852, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8687825798988342, "num_tokens": 683567261.0, "step": 17912 }, { "epoch": 2.278717720391808, "ewc_loss": 0.06989133358001709, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003424680617172271, "grad_norm": 8.24255084991455, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8515671491622925, "num_tokens": 683601083.0, "step": 17913 }, { "epoch": 2.278844930670398, "ewc_loss": 0.06963256001472473, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033988026552833617, "grad_norm": 8.122294425964355, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8600858449935913, "num_tokens": 683643494.0, "step": 17914 }, { "epoch": 2.278972140948989, "ewc_loss": 0.06999911367893219, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034354583476670086, "grad_norm": 8.236059188842773, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8623456954956055, "num_tokens": 683684267.0, "step": 17915 }, { "epoch": 2.279099351227579, "ewc_loss": 0.06957267224788666, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003392813669051975, "grad_norm": 8.101787567138672, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8546688556671143, "num_tokens": 683723292.0, "step": 17916 }, { "epoch": 2.2792265615061695, "ewc_loss": 0.07005774229764938, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003441321023274213, "grad_norm": 8.195540428161621, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.8781070709228516, "num_tokens": 683761378.0, "step": 17917 }, { "epoch": 2.27935377178476, "ewc_loss": 0.06972062587738037, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003407609765417874, "grad_norm": 8.196438789367676, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8580320477485657, "num_tokens": 683794838.0, "step": 17918 }, { "epoch": 2.2794809820633506, "ewc_loss": 0.06973021477460861, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034085684455931187, "grad_norm": 8.163703918457031, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8481953144073486, "num_tokens": 683832307.0, "step": 17919 }, { "epoch": 2.279608192341941, "ewc_loss": 0.06981092691421509, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034166392288170755, "grad_norm": 8.133455276489258, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8595524430274963, "num_tokens": 683871848.0, "step": 17920 }, { "epoch": 2.2797354026205316, "ewc_loss": 0.06970269978046417, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000340581638738513, "grad_norm": 8.171980857849121, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8697525262832642, "num_tokens": 683915086.0, "step": 17921 }, { "epoch": 2.279862612899122, "ewc_loss": 0.06970681250095367, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034062284976243973, "grad_norm": 8.173938751220703, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8589465022087097, "num_tokens": 683945411.0, "step": 17922 }, { "epoch": 2.2799898231777127, "ewc_loss": 0.06970974802970886, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034065215731970966, "grad_norm": 8.151388168334961, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8656306862831116, "num_tokens": 683979664.0, "step": 17923 }, { "epoch": 2.2801170334563032, "ewc_loss": 0.06971929222345352, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034074761788360775, "grad_norm": 8.184176445007324, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8570334911346436, "num_tokens": 684025526.0, "step": 17924 }, { "epoch": 2.2802442437348938, "ewc_loss": 0.06967727839946747, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003403275040909648, "grad_norm": 8.172490119934082, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.85996413230896, "num_tokens": 684056992.0, "step": 17925 }, { "epoch": 2.2803714540134843, "ewc_loss": 0.06978368014097214, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034139148192480206, "grad_norm": 8.16356086730957, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8708118200302124, "num_tokens": 684094515.0, "step": 17926 }, { "epoch": 2.280498664292075, "ewc_loss": 0.06968895345926285, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034044423955492675, "grad_norm": 8.100394248962402, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8728819489479065, "num_tokens": 684132294.0, "step": 17927 }, { "epoch": 2.2806258745706653, "ewc_loss": 0.0697833001613617, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034138772753067315, "grad_norm": 8.191368103027344, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8696864247322083, "num_tokens": 684164010.0, "step": 17928 }, { "epoch": 2.280753084849256, "ewc_loss": 0.06964439898729324, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033999865991063416, "grad_norm": 8.105188369750977, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8756043910980225, "num_tokens": 684202077.0, "step": 17929 }, { "epoch": 2.2808802951278464, "ewc_loss": 0.06977576017379761, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034131231950595975, "grad_norm": 8.156149864196777, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8401013016700745, "num_tokens": 684240799.0, "step": 17930 }, { "epoch": 2.281007505406437, "ewc_loss": 0.06969250738620758, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003404797753319144, "grad_norm": 8.126009941101074, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8639185428619385, "num_tokens": 684284080.0, "step": 17931 }, { "epoch": 2.2811347156850275, "ewc_loss": 0.0698162391781807, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000341717095579952, "grad_norm": 8.235586166381836, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8499077558517456, "num_tokens": 684317915.0, "step": 17932 }, { "epoch": 2.281261925963618, "ewc_loss": 0.06953746825456619, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033892938517965376, "grad_norm": 8.023847579956055, "learning_rate": 1e-06, "loss": 0.3417, "mean_token_accuracy": 0.9037157893180847, "num_tokens": 684354665.0, "step": 17933 }, { "epoch": 2.2813891362422085, "ewc_loss": 0.07003317028284073, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003438863786868751, "grad_norm": 8.219517707824707, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8660717606544495, "num_tokens": 684392866.0, "step": 17934 }, { "epoch": 2.281516346520799, "ewc_loss": 0.06932403147220612, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003367949975654483, "grad_norm": 8.040587425231934, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8688336610794067, "num_tokens": 684428422.0, "step": 17935 }, { "epoch": 2.2816435567993896, "ewc_loss": 0.07009781152009964, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003445328038651496, "grad_norm": 8.227385520935059, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8523833155632019, "num_tokens": 684467129.0, "step": 17936 }, { "epoch": 2.2817707670779797, "ewc_loss": 0.06949654966592789, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003385201853234321, "grad_norm": 8.138199806213379, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.858531653881073, "num_tokens": 684500093.0, "step": 17937 }, { "epoch": 2.2818979773565706, "ewc_loss": 0.06988734006881714, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034242810215801, "grad_norm": 8.166288375854492, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8573291301727295, "num_tokens": 684541422.0, "step": 17938 }, { "epoch": 2.2820251876351607, "ewc_loss": 0.06978431344032288, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003413978556636721, "grad_norm": 8.19675064086914, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8615609407424927, "num_tokens": 684579164.0, "step": 17939 }, { "epoch": 2.2821523979137512, "ewc_loss": 0.06942479312419891, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003378026594873518, "grad_norm": 8.110657691955566, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8500516414642334, "num_tokens": 684618617.0, "step": 17940 }, { "epoch": 2.2822796081923418, "ewc_loss": 0.06988590955734253, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034241381217725575, "grad_norm": 8.232756614685059, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8508254289627075, "num_tokens": 684655525.0, "step": 17941 }, { "epoch": 2.2824068184709323, "ewc_loss": 0.06946378201246262, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003381925052963197, "grad_norm": 8.097810745239258, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8651542663574219, "num_tokens": 684694525.0, "step": 17942 }, { "epoch": 2.282534028749523, "ewc_loss": 0.06988799571990967, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034243467962369323, "grad_norm": 8.174885749816895, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8657824993133545, "num_tokens": 684733247.0, "step": 17943 }, { "epoch": 2.2826612390281134, "ewc_loss": 0.06950317323207855, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003385863674338907, "grad_norm": 8.10971450805664, "learning_rate": 1e-06, "loss": 0.4153, "mean_token_accuracy": 0.877613365650177, "num_tokens": 684763642.0, "step": 17944 }, { "epoch": 2.282788449306704, "ewc_loss": 0.06979943811893463, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003415490791667253, "grad_norm": 8.231523513793945, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.871619462966919, "num_tokens": 684806164.0, "step": 17945 }, { "epoch": 2.2829156595852944, "ewc_loss": 0.06957559287548065, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033931058715097606, "grad_norm": 8.109618186950684, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8661646842956543, "num_tokens": 684844193.0, "step": 17946 }, { "epoch": 2.283042869863885, "ewc_loss": 0.06984943151473999, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003420490538701415, "grad_norm": 8.189393043518066, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8715118169784546, "num_tokens": 684882103.0, "step": 17947 }, { "epoch": 2.2831700801424755, "ewc_loss": 0.06961763650178909, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003397310501895845, "grad_norm": 8.12971305847168, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8796864151954651, "num_tokens": 684917667.0, "step": 17948 }, { "epoch": 2.283297290421066, "ewc_loss": 0.06981946527957916, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034174934262409806, "grad_norm": 8.210521697998047, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8703148365020752, "num_tokens": 684954007.0, "step": 17949 }, { "epoch": 2.2834245006996565, "ewc_loss": 0.06951717287302017, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003387264150660485, "grad_norm": 8.0975923538208, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8614194989204407, "num_tokens": 684994693.0, "step": 17950 }, { "epoch": 2.283551710978247, "ewc_loss": 0.06983177363872528, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003418724227230996, "grad_norm": 8.170005798339844, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8530658483505249, "num_tokens": 685032713.0, "step": 17951 }, { "epoch": 2.2836789212568376, "ewc_loss": 0.06955251097679138, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033907979377545416, "grad_norm": 8.110474586486816, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8594196438789368, "num_tokens": 685066597.0, "step": 17952 }, { "epoch": 2.283806131535428, "ewc_loss": 0.06984201073646545, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003419747808948159, "grad_norm": 8.139384269714355, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8595067262649536, "num_tokens": 685106243.0, "step": 17953 }, { "epoch": 2.2839333418140186, "ewc_loss": 0.06961676478385925, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003397223190404475, "grad_norm": 8.201369285583496, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.856917679309845, "num_tokens": 685145177.0, "step": 17954 }, { "epoch": 2.284060552092609, "ewc_loss": 0.0696532353758812, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034008704824373126, "grad_norm": 8.086529731750488, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8726195096969604, "num_tokens": 685189130.0, "step": 17955 }, { "epoch": 2.2841877623711997, "ewc_loss": 0.06990066915750504, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034256139770150185, "grad_norm": 8.200765609741211, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8691660165786743, "num_tokens": 685227536.0, "step": 17956 }, { "epoch": 2.28431497264979, "ewc_loss": 0.06958013772964478, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000339356018230319, "grad_norm": 8.119951248168945, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8708946704864502, "num_tokens": 685267213.0, "step": 17957 }, { "epoch": 2.2844421829283807, "ewc_loss": 0.07002851366996765, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003438398416619748, "grad_norm": 8.223864555358887, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8638675212860107, "num_tokens": 685301115.0, "step": 17958 }, { "epoch": 2.2845693932069713, "ewc_loss": 0.0694098249077797, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003376529202796519, "grad_norm": 8.07602596282959, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8709241151809692, "num_tokens": 685338497.0, "step": 17959 }, { "epoch": 2.284696603485562, "ewc_loss": 0.0700489804148674, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034404449979774654, "grad_norm": 8.256881713867188, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8549388647079468, "num_tokens": 685374983.0, "step": 17960 }, { "epoch": 2.2848238137641523, "ewc_loss": 0.06945112347602844, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033806590363383293, "grad_norm": 8.069670677185059, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8718578815460205, "num_tokens": 685408859.0, "step": 17961 }, { "epoch": 2.2849510240427424, "ewc_loss": 0.06998912990093231, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003434459795244038, "grad_norm": 8.258905410766602, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8625935316085815, "num_tokens": 685445103.0, "step": 17962 }, { "epoch": 2.2850782343213334, "ewc_loss": 0.06954579055309296, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033901262213476, "grad_norm": 8.04160213470459, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8601866960525513, "num_tokens": 685484657.0, "step": 17963 }, { "epoch": 2.2852054445999235, "ewc_loss": 0.07001033425331116, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003436580009292811, "grad_norm": 8.219816207885742, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8741699457168579, "num_tokens": 685524034.0, "step": 17964 }, { "epoch": 2.285332654878514, "ewc_loss": 0.06955656409263611, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003391202772036195, "grad_norm": 8.17001724243164, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.872005820274353, "num_tokens": 685560368.0, "step": 17965 }, { "epoch": 2.2854598651571045, "ewc_loss": 0.06973597407341003, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034091444103978574, "grad_norm": 8.188504219055176, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8622822761535645, "num_tokens": 685598192.0, "step": 17966 }, { "epoch": 2.285587075435695, "ewc_loss": 0.06953197717666626, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003388744662515819, "grad_norm": 8.144623756408691, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8682954907417297, "num_tokens": 685636300.0, "step": 17967 }, { "epoch": 2.2857142857142856, "ewc_loss": 0.06969329714775085, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003404876624699682, "grad_norm": 8.190505027770996, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8640167713165283, "num_tokens": 685671457.0, "step": 17968 }, { "epoch": 2.285841495992876, "ewc_loss": 0.06950827687978745, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033863744465634227, "grad_norm": 8.105619430541992, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8544955253601074, "num_tokens": 685709687.0, "step": 17969 }, { "epoch": 2.2859687062714666, "ewc_loss": 0.06969964504241943, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003405511670280248, "grad_norm": 8.151960372924805, "learning_rate": 1e-06, "loss": 0.532, "mean_token_accuracy": 0.8442462682723999, "num_tokens": 685747854.0, "step": 17970 }, { "epoch": 2.286095916550057, "ewc_loss": 0.06968537718057632, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034040844184346497, "grad_norm": 8.148482322692871, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.884745717048645, "num_tokens": 685785215.0, "step": 17971 }, { "epoch": 2.2862231268286477, "ewc_loss": 0.06951414793729782, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033869617618620396, "grad_norm": 8.125455856323242, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.870441198348999, "num_tokens": 685818871.0, "step": 17972 }, { "epoch": 2.2863503371072382, "ewc_loss": 0.06970009207725525, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003405555908102542, "grad_norm": 8.220818519592285, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8590079545974731, "num_tokens": 685859089.0, "step": 17973 }, { "epoch": 2.2864775473858288, "ewc_loss": 0.0693356990814209, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033691167482174933, "grad_norm": 8.084579467773438, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8572261333465576, "num_tokens": 685900346.0, "step": 17974 }, { "epoch": 2.2866047576644193, "ewc_loss": 0.06975261867046356, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034108091494999826, "grad_norm": 8.17347526550293, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8642475605010986, "num_tokens": 685938186.0, "step": 17975 }, { "epoch": 2.28673196794301, "ewc_loss": 0.06943793594837189, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000337934005074203, "grad_norm": 8.14210319519043, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8726509809494019, "num_tokens": 685974911.0, "step": 17976 }, { "epoch": 2.2868591782216003, "ewc_loss": 0.06963777542114258, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033993247780017555, "grad_norm": 8.163687705993652, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8678303360939026, "num_tokens": 686010430.0, "step": 17977 }, { "epoch": 2.286986388500191, "ewc_loss": 0.0694834440946579, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033838915987871587, "grad_norm": 8.13128662109375, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8766765594482422, "num_tokens": 686042727.0, "step": 17978 }, { "epoch": 2.2871135987787814, "ewc_loss": 0.06964097917079926, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003399644629098475, "grad_norm": 8.196940422058105, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.867659330368042, "num_tokens": 686078819.0, "step": 17979 }, { "epoch": 2.287240809057372, "ewc_loss": 0.06945383548736572, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003380930284038186, "grad_norm": 8.083925247192383, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8662501573562622, "num_tokens": 686117760.0, "step": 17980 }, { "epoch": 2.2873680193359625, "ewc_loss": 0.0697413831949234, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034096857416443527, "grad_norm": 8.154696464538574, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8576722145080566, "num_tokens": 686152992.0, "step": 17981 }, { "epoch": 2.287495229614553, "ewc_loss": 0.06941154599189758, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033767014974728227, "grad_norm": 8.122720718383789, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8607439994812012, "num_tokens": 686195353.0, "step": 17982 }, { "epoch": 2.2876224398931435, "ewc_loss": 0.06964397430419922, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003399944107513875, "grad_norm": 8.128779411315918, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8623734712600708, "num_tokens": 686232990.0, "step": 17983 }, { "epoch": 2.287749650171734, "ewc_loss": 0.06955143064260483, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003390689962543547, "grad_norm": 8.175350189208984, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8702884912490845, "num_tokens": 686265933.0, "step": 17984 }, { "epoch": 2.2878768604503246, "ewc_loss": 0.06945090740919113, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033806374995037913, "grad_norm": 8.083062171936035, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8721489906311035, "num_tokens": 686307111.0, "step": 17985 }, { "epoch": 2.288004070728915, "ewc_loss": 0.0697031021118164, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000340585655067116, "grad_norm": 8.22177505493164, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8675617575645447, "num_tokens": 686343363.0, "step": 17986 }, { "epoch": 2.288131281007505, "ewc_loss": 0.06939653307199478, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033752003218978643, "grad_norm": 8.102375984191895, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8638414740562439, "num_tokens": 686380888.0, "step": 17987 }, { "epoch": 2.288258491286096, "ewc_loss": 0.06972312927246094, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003407859767321497, "grad_norm": 8.19348430633545, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8580622673034668, "num_tokens": 686417350.0, "step": 17988 }, { "epoch": 2.2883857015646862, "ewc_loss": 0.06941720843315125, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033772681490518153, "grad_norm": 8.10207748413086, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8738589882850647, "num_tokens": 686458658.0, "step": 17989 }, { "epoch": 2.2885129118432768, "ewc_loss": 0.06975119560956955, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034106665407307446, "grad_norm": 8.190638542175293, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8688029050827026, "num_tokens": 686493945.0, "step": 17990 }, { "epoch": 2.2886401221218673, "ewc_loss": 0.06941292434930801, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033768394496291876, "grad_norm": 8.11821174621582, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8650043606758118, "num_tokens": 686527608.0, "step": 17991 }, { "epoch": 2.288767332400458, "ewc_loss": 0.06969758123159409, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003405305033084005, "grad_norm": 8.228668212890625, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8764209151268005, "num_tokens": 686562272.0, "step": 17992 }, { "epoch": 2.2888945426790483, "ewc_loss": 0.0693298727273941, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033685340895317495, "grad_norm": 8.098339080810547, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8772172927856445, "num_tokens": 686599937.0, "step": 17993 }, { "epoch": 2.289021752957639, "ewc_loss": 0.06973152607679367, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003408699412830174, "grad_norm": 8.305094718933105, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8672304749488831, "num_tokens": 686632103.0, "step": 17994 }, { "epoch": 2.2891489632362294, "ewc_loss": 0.06929914653301239, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033654618891887367, "grad_norm": 8.088775634765625, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8739947080612183, "num_tokens": 686662802.0, "step": 17995 }, { "epoch": 2.28927617351482, "ewc_loss": 0.06980597972869873, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034161453368142247, "grad_norm": 8.16518783569336, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.887752890586853, "num_tokens": 686699386.0, "step": 17996 }, { "epoch": 2.2894033837934105, "ewc_loss": 0.06935765594244003, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033713123411871493, "grad_norm": 8.114298820495605, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8694710731506348, "num_tokens": 686738369.0, "step": 17997 }, { "epoch": 2.289530594072001, "ewc_loss": 0.06983096897602081, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003418643609620631, "grad_norm": 8.190444946289062, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8591347932815552, "num_tokens": 686774543.0, "step": 17998 }, { "epoch": 2.2896578043505915, "ewc_loss": 0.06940548121929169, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033760949736461043, "grad_norm": 8.069548606872559, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8573623299598694, "num_tokens": 686807048.0, "step": 17999 }, { "epoch": 2.289785014629182, "ewc_loss": 0.06986711919307709, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034222585964016616, "grad_norm": 8.267723083496094, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8671441078186035, "num_tokens": 686844678.0, "step": 18000 }, { "epoch": 2.2899122249077726, "ewc_loss": 0.06940847635269165, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003376394452061504, "grad_norm": 8.126163482666016, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8660930395126343, "num_tokens": 686877721.0, "step": 18001 }, { "epoch": 2.290039435186363, "ewc_loss": 0.06979617476463318, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003415164246689528, "grad_norm": 8.259490966796875, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8569657802581787, "num_tokens": 686918329.0, "step": 18002 }, { "epoch": 2.2901666454649536, "ewc_loss": 0.06947053223848343, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003382599970791489, "grad_norm": 8.098237037658691, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8615222573280334, "num_tokens": 686960050.0, "step": 18003 }, { "epoch": 2.290293855743544, "ewc_loss": 0.06989015638828278, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003424563037697226, "grad_norm": 8.267231941223145, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8759445548057556, "num_tokens": 686997849.0, "step": 18004 }, { "epoch": 2.2904210660221347, "ewc_loss": 0.06939954310655594, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003375501255504787, "grad_norm": 8.123517990112305, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8691525459289551, "num_tokens": 687032363.0, "step": 18005 }, { "epoch": 2.290548276300725, "ewc_loss": 0.06977568566799164, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034131159191019833, "grad_norm": 8.207268714904785, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8500463962554932, "num_tokens": 687069606.0, "step": 18006 }, { "epoch": 2.2906754865793157, "ewc_loss": 0.06948097795248032, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033836447983048856, "grad_norm": 8.177332878112793, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8561334609985352, "num_tokens": 687112857.0, "step": 18007 }, { "epoch": 2.2908026968579063, "ewc_loss": 0.06964781880378723, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003400328569114208, "grad_norm": 8.171371459960938, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.863725483417511, "num_tokens": 687152245.0, "step": 18008 }, { "epoch": 2.290929907136497, "ewc_loss": 0.06955161690711975, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003390708297956735, "grad_norm": 8.126930236816406, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8647099733352661, "num_tokens": 687193988.0, "step": 18009 }, { "epoch": 2.2910571174150873, "ewc_loss": 0.06966163963079453, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003401711001060903, "grad_norm": 8.250927925109863, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8611371517181396, "num_tokens": 687232375.0, "step": 18010 }, { "epoch": 2.291184327693678, "ewc_loss": 0.06945140659809113, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033806878491304815, "grad_norm": 8.098930358886719, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8697551488876343, "num_tokens": 687274566.0, "step": 18011 }, { "epoch": 2.291311537972268, "ewc_loss": 0.06967612355947495, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000340315920766443, "grad_norm": 8.206890106201172, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8646609783172607, "num_tokens": 687313231.0, "step": 18012 }, { "epoch": 2.291438748250859, "ewc_loss": 0.0694979876279831, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003385345626156777, "grad_norm": 8.159948348999023, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8712517023086548, "num_tokens": 687354502.0, "step": 18013 }, { "epoch": 2.291565958529449, "ewc_loss": 0.07041756808757782, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003404062008485198, "grad_norm": 8.331847190856934, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8624007105827332, "num_tokens": 687399160.0, "step": 18014 }, { "epoch": 2.2916931688080395, "ewc_loss": 0.06928534805774689, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033640817855484784, "grad_norm": 8.084135055541992, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.856022834777832, "num_tokens": 687438775.0, "step": 18015 }, { "epoch": 2.29182037908663, "ewc_loss": 0.07009552419185638, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003420685534365475, "grad_norm": 8.265647888183594, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8713082075119019, "num_tokens": 687471435.0, "step": 18016 }, { "epoch": 2.2919475893652206, "ewc_loss": 0.06936649978160858, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033721973886713386, "grad_norm": 8.115607261657715, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8664647340774536, "num_tokens": 687508657.0, "step": 18017 }, { "epoch": 2.292074799643811, "ewc_loss": 0.06985136866569519, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003420683497097343, "grad_norm": 8.238494873046875, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8748396039009094, "num_tokens": 687545024.0, "step": 18018 }, { "epoch": 2.2922020099224016, "ewc_loss": 0.06955589354038239, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033911358332261443, "grad_norm": 8.27432918548584, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8690154552459717, "num_tokens": 687579432.0, "step": 18019 }, { "epoch": 2.292329220200992, "ewc_loss": 0.06970728933811188, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034062762279063463, "grad_norm": 8.370014190673828, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8558257818222046, "num_tokens": 687618378.0, "step": 18020 }, { "epoch": 2.2924564304795827, "ewc_loss": 0.06946172565221786, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003381719288881868, "grad_norm": 8.419365882873535, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.863405168056488, "num_tokens": 687657119.0, "step": 18021 }, { "epoch": 2.2925836407581732, "ewc_loss": 0.06925506889820099, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00033610532409511507, "grad_norm": 8.238729476928711, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8559584617614746, "num_tokens": 687691924.0, "step": 18022 }, { "epoch": 2.2927108510367638, "ewc_loss": 0.06932424008846283, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003367971221450716, "grad_norm": 8.234868049621582, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8425804972648621, "num_tokens": 687724221.0, "step": 18023 }, { "epoch": 2.2928380613153543, "ewc_loss": 0.06917069852352142, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003352616913616657, "grad_norm": 8.178955078125, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8669309616088867, "num_tokens": 687765049.0, "step": 18024 }, { "epoch": 2.292965271593945, "ewc_loss": 0.06947420537471771, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003382967843208462, "grad_norm": 8.170127868652344, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8550746440887451, "num_tokens": 687810222.0, "step": 18025 }, { "epoch": 2.2930924818725353, "ewc_loss": 0.06928667426109314, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033642147900536656, "grad_norm": 8.203825950622559, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8593524694442749, "num_tokens": 687851961.0, "step": 18026 }, { "epoch": 2.293219692151126, "ewc_loss": 0.0694422498345375, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003379771951586008, "grad_norm": 8.214983940124512, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8548150658607483, "num_tokens": 687890448.0, "step": 18027 }, { "epoch": 2.2933469024297164, "ewc_loss": 0.06923739612102509, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033592869294807315, "grad_norm": 8.150973320007324, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8741421699523926, "num_tokens": 687929509.0, "step": 18028 }, { "epoch": 2.293474112708307, "ewc_loss": 0.06949023902416229, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000338457030011341, "grad_norm": 8.22197437286377, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8554103374481201, "num_tokens": 687966814.0, "step": 18029 }, { "epoch": 2.2936013229868975, "ewc_loss": 0.06937825679779053, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003373372310306877, "grad_norm": 8.323907852172852, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8723236322402954, "num_tokens": 688006398.0, "step": 18030 }, { "epoch": 2.293728533265488, "ewc_loss": 0.06927388906478882, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033629353856667876, "grad_norm": 8.262675285339355, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8626900315284729, "num_tokens": 688044604.0, "step": 18031 }, { "epoch": 2.2938557435440785, "ewc_loss": 0.0692860335111618, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003364150761626661, "grad_norm": 8.19178581237793, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8616379499435425, "num_tokens": 688082969.0, "step": 18032 }, { "epoch": 2.293982953822669, "ewc_loss": 0.06933751702308655, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003369298065081239, "grad_norm": 8.147077560424805, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8726381063461304, "num_tokens": 688121442.0, "step": 18033 }, { "epoch": 2.2941101641012596, "ewc_loss": 0.06958729028701782, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003394276136532426, "grad_norm": 8.413614273071289, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8663405776023865, "num_tokens": 688161096.0, "step": 18034 }, { "epoch": 2.2942373743798496, "ewc_loss": 0.06908708810806274, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003344255965203047, "grad_norm": 8.128213882446289, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.866857647895813, "num_tokens": 688200782.0, "step": 18035 }, { "epoch": 2.2943645846584406, "ewc_loss": 0.06996861100196838, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034324079751968384, "grad_norm": 8.662364959716797, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8730450868606567, "num_tokens": 688241333.0, "step": 18036 }, { "epoch": 2.2944917949370307, "ewc_loss": 0.06881944835186005, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033174915006384254, "grad_norm": 7.95977258682251, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8632652759552002, "num_tokens": 688278297.0, "step": 18037 }, { "epoch": 2.2946190052156212, "ewc_loss": 0.07060183584690094, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034957309253513813, "grad_norm": 9.147154808044434, "learning_rate": 1e-06, "loss": 0.5392, "mean_token_accuracy": 0.8414493203163147, "num_tokens": 688318835.0, "step": 18038 }, { "epoch": 2.2947462154942118, "ewc_loss": 0.06891779601573944, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003327326849102974, "grad_norm": 7.905776023864746, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8681869506835938, "num_tokens": 688354814.0, "step": 18039 }, { "epoch": 2.2948734257728023, "ewc_loss": 0.07158263027667999, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000359380996087566, "grad_norm": 9.067346572875977, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8684508800506592, "num_tokens": 688390008.0, "step": 18040 }, { "epoch": 2.295000636051393, "ewc_loss": 0.06936176121234894, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003371722996234894, "grad_norm": 8.091318130493164, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8667318820953369, "num_tokens": 688430938.0, "step": 18041 }, { "epoch": 2.2951278463299833, "ewc_loss": 0.07145309448242188, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035808567190542817, "grad_norm": 8.649654388427734, "learning_rate": 1e-06, "loss": 0.5809, "mean_token_accuracy": 0.8359766602516174, "num_tokens": 688466730.0, "step": 18042 }, { "epoch": 2.295255056608574, "ewc_loss": 0.06974337995052338, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003409885393921286, "grad_norm": 8.56122875213623, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8644663095474243, "num_tokens": 688506666.0, "step": 18043 }, { "epoch": 2.2953822668871644, "ewc_loss": 0.07014578580856323, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003450124931987375, "grad_norm": 8.28210735321045, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8512248992919922, "num_tokens": 688547726.0, "step": 18044 }, { "epoch": 2.295509477165755, "ewc_loss": 0.06990315020084381, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034258613595739007, "grad_norm": 8.3499174118042, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8612356185913086, "num_tokens": 688587487.0, "step": 18045 }, { "epoch": 2.2956366874443455, "ewc_loss": 0.06960156559944153, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003395703970454633, "grad_norm": 8.329419136047363, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8592081665992737, "num_tokens": 688632454.0, "step": 18046 }, { "epoch": 2.295763897722936, "ewc_loss": 0.0697440356016159, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034099502954632044, "grad_norm": 8.256689071655273, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.871528148651123, "num_tokens": 688668489.0, "step": 18047 }, { "epoch": 2.2958911080015265, "ewc_loss": 0.06963864713907242, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033994115074165165, "grad_norm": 8.213581085205078, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8680692315101624, "num_tokens": 688707899.0, "step": 18048 }, { "epoch": 2.296018318280117, "ewc_loss": 0.06969503313302994, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034050503745675087, "grad_norm": 8.334268569946289, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.871238112449646, "num_tokens": 688747171.0, "step": 18049 }, { "epoch": 2.2961455285587076, "ewc_loss": 0.06938521564006805, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033740687649697065, "grad_norm": 8.183262825012207, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8631296157836914, "num_tokens": 688789553.0, "step": 18050 }, { "epoch": 2.296272738837298, "ewc_loss": 0.06971201300621033, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000340674800099805, "grad_norm": 8.201141357421875, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8689385056495667, "num_tokens": 688827632.0, "step": 18051 }, { "epoch": 2.2963999491158886, "ewc_loss": 0.06967251002788544, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034027977380901575, "grad_norm": 8.272979736328125, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8745712041854858, "num_tokens": 688861505.0, "step": 18052 }, { "epoch": 2.296527159394479, "ewc_loss": 0.06954728066921234, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033902752329595387, "grad_norm": 8.195246696472168, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8581622242927551, "num_tokens": 688898270.0, "step": 18053 }, { "epoch": 2.2966543696730697, "ewc_loss": 0.06977404654026031, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003412952064536512, "grad_norm": 8.219644546508789, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8655352592468262, "num_tokens": 688937420.0, "step": 18054 }, { "epoch": 2.29678157995166, "ewc_loss": 0.06962601095438004, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000339814811013639, "grad_norm": 8.21536922454834, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8709322214126587, "num_tokens": 688972426.0, "step": 18055 }, { "epoch": 2.2969087902302507, "ewc_loss": 0.06989394873380661, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003424941678531468, "grad_norm": 8.299254417419434, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8641905784606934, "num_tokens": 689005801.0, "step": 18056 }, { "epoch": 2.2970360005088413, "ewc_loss": 0.06968623399734497, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034041699836961925, "grad_norm": 8.280632019042969, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8607950210571289, "num_tokens": 689039999.0, "step": 18057 }, { "epoch": 2.297163210787432, "ewc_loss": 0.06965955346822739, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034015020355582237, "grad_norm": 8.227082252502441, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8499863147735596, "num_tokens": 689079404.0, "step": 18058 }, { "epoch": 2.2972904210660223, "ewc_loss": 0.06960339099168777, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033958861604332924, "grad_norm": 8.166818618774414, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.8420765399932861, "num_tokens": 689116197.0, "step": 18059 }, { "epoch": 2.2974176313446124, "ewc_loss": 0.06986474990844727, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003422021691221744, "grad_norm": 8.265114784240723, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8657951951026917, "num_tokens": 689153446.0, "step": 18060 }, { "epoch": 2.2975448416232034, "ewc_loss": 0.06945870816707611, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033814177731983364, "grad_norm": 8.17827320098877, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8475884199142456, "num_tokens": 689185310.0, "step": 18061 }, { "epoch": 2.2976720519017935, "ewc_loss": 0.06973686814308167, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034092337591573596, "grad_norm": 8.238285064697266, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8749501705169678, "num_tokens": 689224150.0, "step": 18062 }, { "epoch": 2.297799262180384, "ewc_loss": 0.06955794990062714, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033913415973074734, "grad_norm": 8.173934936523438, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8560908436775208, "num_tokens": 689261206.0, "step": 18063 }, { "epoch": 2.2979264724589745, "ewc_loss": 0.06971190869808197, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003406737232580781, "grad_norm": 8.22718620300293, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8701620697975159, "num_tokens": 689299129.0, "step": 18064 }, { "epoch": 2.298053682737565, "ewc_loss": 0.06962919980287552, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003398466797079891, "grad_norm": 8.193700790405273, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8821498155593872, "num_tokens": 689330554.0, "step": 18065 }, { "epoch": 2.2981808930161556, "ewc_loss": 0.06967267394065857, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034028146183118224, "grad_norm": 8.137068748474121, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8725416660308838, "num_tokens": 689364084.0, "step": 18066 }, { "epoch": 2.298308103294746, "ewc_loss": 0.06976814568042755, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034123618388548493, "grad_norm": 8.143467903137207, "learning_rate": 1e-06, "loss": 0.516, "mean_token_accuracy": 0.8535735607147217, "num_tokens": 689405254.0, "step": 18067 }, { "epoch": 2.2984353135733366, "ewc_loss": 0.06961918622255325, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033974656253121793, "grad_norm": 8.157125473022461, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8571093082427979, "num_tokens": 689441109.0, "step": 18068 }, { "epoch": 2.298562523851927, "ewc_loss": 0.06977243721485138, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003412790538277477, "grad_norm": 8.178969383239746, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8633103370666504, "num_tokens": 689474937.0, "step": 18069 }, { "epoch": 2.2986897341305177, "ewc_loss": 0.06968854367733002, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003404401068110019, "grad_norm": 8.126184463500977, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8603047132492065, "num_tokens": 689515971.0, "step": 18070 }, { "epoch": 2.298816944409108, "ewc_loss": 0.06997303664684296, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034328506444580853, "grad_norm": 8.227019309997559, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8419651985168457, "num_tokens": 689553620.0, "step": 18071 }, { "epoch": 2.2989441546876987, "ewc_loss": 0.06969083845615387, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034046306973323226, "grad_norm": 8.151349067687988, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8692240715026855, "num_tokens": 689592157.0, "step": 18072 }, { "epoch": 2.2990713649662893, "ewc_loss": 0.06992257386445999, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003427804331295192, "grad_norm": 8.224916458129883, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8777521252632141, "num_tokens": 689626889.0, "step": 18073 }, { "epoch": 2.29919857524488, "ewc_loss": 0.06974650174379349, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034101970959454775, "grad_norm": 8.192740440368652, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8762936592102051, "num_tokens": 689666161.0, "step": 18074 }, { "epoch": 2.2993257855234703, "ewc_loss": 0.0698833018541336, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034238770604133606, "grad_norm": 8.249177932739258, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.856328010559082, "num_tokens": 689713713.0, "step": 18075 }, { "epoch": 2.299452995802061, "ewc_loss": 0.06967245787382126, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034027924994006753, "grad_norm": 8.15755844116211, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8579850196838379, "num_tokens": 689755146.0, "step": 18076 }, { "epoch": 2.2995802060806514, "ewc_loss": 0.06991585344076157, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003427132323849946, "grad_norm": 8.220945358276367, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8754675388336182, "num_tokens": 689795925.0, "step": 18077 }, { "epoch": 2.299707416359242, "ewc_loss": 0.0697021409869194, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003405761090107262, "grad_norm": 8.179827690124512, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8758491277694702, "num_tokens": 689830673.0, "step": 18078 }, { "epoch": 2.2998346266378324, "ewc_loss": 0.06987333297729492, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003422879963181913, "grad_norm": 8.245570182800293, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8641804456710815, "num_tokens": 689863649.0, "step": 18079 }, { "epoch": 2.299961836916423, "ewc_loss": 0.06962662935256958, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033982095192186534, "grad_norm": 8.169528007507324, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8638343214988708, "num_tokens": 689901946.0, "step": 18080 }, { "epoch": 2.3000890471950135, "ewc_loss": 0.06985776126384735, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003421323490329087, "grad_norm": 8.177483558654785, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8589770793914795, "num_tokens": 689939968.0, "step": 18081 }, { "epoch": 2.300216257473604, "ewc_loss": 0.06968852877616882, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003404400194995105, "grad_norm": 8.137072563171387, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8733975887298584, "num_tokens": 689979792.0, "step": 18082 }, { "epoch": 2.3003434677521946, "ewc_loss": 0.06981170922517776, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034167178091593087, "grad_norm": 8.207881927490234, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8772274255752563, "num_tokens": 690013974.0, "step": 18083 }, { "epoch": 2.300470678030785, "ewc_loss": 0.06965718418359756, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003401265130378306, "grad_norm": 8.129995346069336, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8728101253509521, "num_tokens": 690056540.0, "step": 18084 }, { "epoch": 2.300597888309375, "ewc_loss": 0.07005848735570908, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034413955290801823, "grad_norm": 8.21591567993164, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8586455583572388, "num_tokens": 690101032.0, "step": 18085 }, { "epoch": 2.300725098587966, "ewc_loss": 0.06971454620361328, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034070012043230236, "grad_norm": 8.098413467407227, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8773259520530701, "num_tokens": 690135417.0, "step": 18086 }, { "epoch": 2.3008523088665562, "ewc_loss": 0.07008107006549835, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034436534042470157, "grad_norm": 8.236058235168457, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.86700838804245, "num_tokens": 690178552.0, "step": 18087 }, { "epoch": 2.3009795191451468, "ewc_loss": 0.06967677921056747, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034032249823212624, "grad_norm": 8.136951446533203, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8625735640525818, "num_tokens": 690221928.0, "step": 18088 }, { "epoch": 2.3011067294237373, "ewc_loss": 0.06999731063842773, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003435277903918177, "grad_norm": 8.231263160705566, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8714606761932373, "num_tokens": 690257040.0, "step": 18089 }, { "epoch": 2.301233939702328, "ewc_loss": 0.06968236714601517, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034037837758660316, "grad_norm": 8.132274627685547, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8659102916717529, "num_tokens": 690302720.0, "step": 18090 }, { "epoch": 2.3013611499809183, "ewc_loss": 0.07004930824041367, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003440477594267577, "grad_norm": 8.270211219787598, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8588917851448059, "num_tokens": 690337042.0, "step": 18091 }, { "epoch": 2.301488360259509, "ewc_loss": 0.06970798969268799, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034063460770994425, "grad_norm": 8.113107681274414, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8715968132019043, "num_tokens": 690380247.0, "step": 18092 }, { "epoch": 2.3016155705380994, "ewc_loss": 0.07001160085201263, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034367069019936025, "grad_norm": 8.18337631225586, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8684947490692139, "num_tokens": 690425361.0, "step": 18093 }, { "epoch": 2.30174278081669, "ewc_loss": 0.06975127011537552, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003410673816688359, "grad_norm": 8.129510879516602, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8601754307746887, "num_tokens": 690465137.0, "step": 18094 }, { "epoch": 2.3018699910952805, "ewc_loss": 0.0699707493185997, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034326218883506954, "grad_norm": 8.254246711730957, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8635834455490112, "num_tokens": 690501437.0, "step": 18095 }, { "epoch": 2.301997201373871, "ewc_loss": 0.06962370872497559, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033979176077991724, "grad_norm": 8.122323989868164, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8601387739181519, "num_tokens": 690544134.0, "step": 18096 }, { "epoch": 2.3021244116524615, "ewc_loss": 0.07010512053966522, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034460591268725693, "grad_norm": 8.233186721801758, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8744839429855347, "num_tokens": 690581062.0, "step": 18097 }, { "epoch": 2.302251621931052, "ewc_loss": 0.06963813304901123, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003399360575713217, "grad_norm": 8.231514930725098, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8707495927810669, "num_tokens": 690615145.0, "step": 18098 }, { "epoch": 2.3023788322096426, "ewc_loss": 0.06991997361183167, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034275444340892136, "grad_norm": 8.151013374328613, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8712854385375977, "num_tokens": 690655483.0, "step": 18099 }, { "epoch": 2.302506042488233, "ewc_loss": 0.06991398334503174, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034269451862201095, "grad_norm": 8.206246376037598, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8634305000305176, "num_tokens": 690694133.0, "step": 18100 }, { "epoch": 2.3026332527668236, "ewc_loss": 0.06966491043567657, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034020384191535413, "grad_norm": 8.195987701416016, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8446602821350098, "num_tokens": 690729939.0, "step": 18101 }, { "epoch": 2.302760463045414, "ewc_loss": 0.06988686323165894, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034242335823364556, "grad_norm": 8.217838287353516, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.87112957239151, "num_tokens": 690770139.0, "step": 18102 }, { "epoch": 2.3028876733240047, "ewc_loss": 0.0697421282529831, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003409759665373713, "grad_norm": 8.159109115600586, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8678661584854126, "num_tokens": 690811379.0, "step": 18103 }, { "epoch": 2.303014883602595, "ewc_loss": 0.0700148344039917, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003437030827626586, "grad_norm": 8.26927375793457, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8617571592330933, "num_tokens": 690844194.0, "step": 18104 }, { "epoch": 2.3031420938811857, "ewc_loss": 0.06966407597064972, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034019548911601305, "grad_norm": 8.13452434539795, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8596845269203186, "num_tokens": 690882129.0, "step": 18105 }, { "epoch": 2.3032693041597763, "ewc_loss": 0.07012520730495453, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003448067291174084, "grad_norm": 8.310502052307129, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8784612417221069, "num_tokens": 690915568.0, "step": 18106 }, { "epoch": 2.303396514438367, "ewc_loss": 0.06962746381759644, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003398293047212064, "grad_norm": 8.10623836517334, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8617911338806152, "num_tokens": 690953412.0, "step": 18107 }, { "epoch": 2.3035237247169573, "ewc_loss": 0.07008303701877594, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034438507282175124, "grad_norm": 8.2857666015625, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8624727725982666, "num_tokens": 690990047.0, "step": 18108 }, { "epoch": 2.303650934995548, "ewc_loss": 0.0695485770702362, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033904044539667666, "grad_norm": 8.10245418548584, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8713191747665405, "num_tokens": 691031010.0, "step": 18109 }, { "epoch": 2.303778145274138, "ewc_loss": 0.07020565867424011, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034561127540655434, "grad_norm": 8.295219421386719, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.867704451084137, "num_tokens": 691065830.0, "step": 18110 }, { "epoch": 2.303905355552729, "ewc_loss": 0.06961721181869507, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033972677192650735, "grad_norm": 8.145326614379883, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8773120641708374, "num_tokens": 691101418.0, "step": 18111 }, { "epoch": 2.304032565831319, "ewc_loss": 0.07014882564544678, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003450429649092257, "grad_norm": 8.27900505065918, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8501585125923157, "num_tokens": 691139447.0, "step": 18112 }, { "epoch": 2.3041597761099095, "ewc_loss": 0.0696176290512085, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033973093377426267, "grad_norm": 8.199441909790039, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8465248346328735, "num_tokens": 691168809.0, "step": 18113 }, { "epoch": 2.3042869863885, "ewc_loss": 0.0699802041053772, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000343356718076393, "grad_norm": 8.235786437988281, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8706014156341553, "num_tokens": 691200420.0, "step": 18114 }, { "epoch": 2.3044141966670906, "ewc_loss": 0.06976847350597382, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003412393853068352, "grad_norm": 8.139917373657227, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8598402738571167, "num_tokens": 691238587.0, "step": 18115 }, { "epoch": 2.304541406945681, "ewc_loss": 0.0699019581079483, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034257431980222464, "grad_norm": 8.241410255432129, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8633903861045837, "num_tokens": 691277137.0, "step": 18116 }, { "epoch": 2.3046686172242716, "ewc_loss": 0.06976740062236786, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003412286750972271, "grad_norm": 8.133158683776855, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8623729944229126, "num_tokens": 691319564.0, "step": 18117 }, { "epoch": 2.304795827502862, "ewc_loss": 0.07004839181900024, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034403864992782474, "grad_norm": 8.312675476074219, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8644320964813232, "num_tokens": 691359652.0, "step": 18118 }, { "epoch": 2.3049230377814527, "ewc_loss": 0.06966695189476013, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034022421459667385, "grad_norm": 8.156865119934082, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8655848503112793, "num_tokens": 691399705.0, "step": 18119 }, { "epoch": 2.305050248060043, "ewc_loss": 0.07015906274318695, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003451453521847725, "grad_norm": 8.292459487915039, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.857842743396759, "num_tokens": 691439447.0, "step": 18120 }, { "epoch": 2.3051774583386337, "ewc_loss": 0.06964343786239624, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033998911385424435, "grad_norm": 8.115734100341797, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.853635311126709, "num_tokens": 691477453.0, "step": 18121 }, { "epoch": 2.3053046686172243, "ewc_loss": 0.0701572597026825, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003451272496022284, "grad_norm": 8.265883445739746, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8646434545516968, "num_tokens": 691517766.0, "step": 18122 }, { "epoch": 2.305431878895815, "ewc_loss": 0.06973959505558014, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003409506462048739, "grad_norm": 8.165763854980469, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8634886741638184, "num_tokens": 691553956.0, "step": 18123 }, { "epoch": 2.3055590891744053, "ewc_loss": 0.07004566490650177, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003440113505348563, "grad_norm": 8.257742881774902, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.863929271697998, "num_tokens": 691592522.0, "step": 18124 }, { "epoch": 2.305686299452996, "ewc_loss": 0.06965110450983047, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003400657151360065, "grad_norm": 8.19985580444336, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8647657632827759, "num_tokens": 691629202.0, "step": 18125 }, { "epoch": 2.3058135097315864, "ewc_loss": 0.06982327997684479, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034178749774582684, "grad_norm": 8.189151763916016, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8677290678024292, "num_tokens": 691665324.0, "step": 18126 }, { "epoch": 2.305940720010177, "ewc_loss": 0.06989966332912445, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034255129867233336, "grad_norm": 8.192947387695312, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8745008111000061, "num_tokens": 691704223.0, "step": 18127 }, { "epoch": 2.3060679302887674, "ewc_loss": 0.0697028785943985, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003405834431760013, "grad_norm": 8.150984764099121, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8635424375534058, "num_tokens": 691745061.0, "step": 18128 }, { "epoch": 2.306195140567358, "ewc_loss": 0.06990224123001099, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000342577084666118, "grad_norm": 8.194215774536133, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8765841126441956, "num_tokens": 691783630.0, "step": 18129 }, { "epoch": 2.3063223508459485, "ewc_loss": 0.06977219879627228, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034127672552131116, "grad_norm": 8.139406204223633, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8637380599975586, "num_tokens": 691818283.0, "step": 18130 }, { "epoch": 2.306449561124539, "ewc_loss": 0.06988468766212463, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003424016176722944, "grad_norm": 8.19455623626709, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8542227149009705, "num_tokens": 691854585.0, "step": 18131 }, { "epoch": 2.3065767714031296, "ewc_loss": 0.06989404559135437, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003424950991757214, "grad_norm": 8.173433303833008, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8485973477363586, "num_tokens": 691891784.0, "step": 18132 }, { "epoch": 2.3067039816817196, "ewc_loss": 0.07002948224544525, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034384947502985597, "grad_norm": 8.209312438964844, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8663983345031738, "num_tokens": 691927389.0, "step": 18133 }, { "epoch": 2.3068311919603106, "ewc_loss": 0.06969423592090607, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003404970047995448, "grad_norm": 8.148348808288574, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8630973100662231, "num_tokens": 691962928.0, "step": 18134 }, { "epoch": 2.3069584022389007, "ewc_loss": 0.0700436383485794, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003439911233726889, "grad_norm": 8.170616149902344, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8641546964645386, "num_tokens": 692007064.0, "step": 18135 }, { "epoch": 2.3070856125174912, "ewc_loss": 0.06989940255880356, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003425487084314227, "grad_norm": 8.186565399169922, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8696302175521851, "num_tokens": 692040668.0, "step": 18136 }, { "epoch": 2.3072128227960818, "ewc_loss": 0.06997539848089218, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034330866765230894, "grad_norm": 8.206262588500977, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8659436702728271, "num_tokens": 692077710.0, "step": 18137 }, { "epoch": 2.3073400330746723, "ewc_loss": 0.06994011253118515, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003429558128118515, "grad_norm": 8.205404281616211, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8723552227020264, "num_tokens": 692112316.0, "step": 18138 }, { "epoch": 2.307467243353263, "ewc_loss": 0.06982854753732681, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034184017567895353, "grad_norm": 8.161646842956543, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8703498840332031, "num_tokens": 692153201.0, "step": 18139 }, { "epoch": 2.3075944536318533, "ewc_loss": 0.06998279690742493, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003433826204854995, "grad_norm": 8.185700416564941, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8583221435546875, "num_tokens": 692192942.0, "step": 18140 }, { "epoch": 2.307721663910444, "ewc_loss": 0.06988415122032166, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034239617525599897, "grad_norm": 8.178258895874023, "learning_rate": 1e-06, "loss": 0.4175, "mean_token_accuracy": 0.8796200156211853, "num_tokens": 692235515.0, "step": 18141 }, { "epoch": 2.3078488741890344, "ewc_loss": 0.07007741928100586, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003443288733251393, "grad_norm": 8.289915084838867, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8548727035522461, "num_tokens": 692271338.0, "step": 18142 }, { "epoch": 2.307976084467625, "ewc_loss": 0.06977611780166626, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000341315841069445, "grad_norm": 8.102808952331543, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8467816114425659, "num_tokens": 692311469.0, "step": 18143 }, { "epoch": 2.3081032947462155, "ewc_loss": 0.07035188376903534, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000347073539160192, "grad_norm": 8.289109230041504, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.8466825485229492, "num_tokens": 692352052.0, "step": 18144 }, { "epoch": 2.308230505024806, "ewc_loss": 0.06971520185470581, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003407067561056465, "grad_norm": 8.148707389831543, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.848612368106842, "num_tokens": 692382978.0, "step": 18145 }, { "epoch": 2.3083577153033965, "ewc_loss": 0.07028308510780334, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034638558281585574, "grad_norm": 8.311339378356934, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8648400902748108, "num_tokens": 692415651.0, "step": 18146 }, { "epoch": 2.308484925581987, "ewc_loss": 0.06974250078201294, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034097975003533065, "grad_norm": 8.1233491897583, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8470208644866943, "num_tokens": 692455937.0, "step": 18147 }, { "epoch": 2.3086121358605776, "ewc_loss": 0.07028941810131073, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003464488254394382, "grad_norm": 8.271388053894043, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8600354194641113, "num_tokens": 692496277.0, "step": 18148 }, { "epoch": 2.308739346139168, "ewc_loss": 0.06983115524053574, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034186625271104276, "grad_norm": 8.23304271697998, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8654594421386719, "num_tokens": 692530485.0, "step": 18149 }, { "epoch": 2.3088665564177586, "ewc_loss": 0.07019791752099991, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034553385921753943, "grad_norm": 8.298677444458008, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8613071441650391, "num_tokens": 692564794.0, "step": 18150 }, { "epoch": 2.308993766696349, "ewc_loss": 0.06981892883777618, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003417440166231245, "grad_norm": 8.218347549438477, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8545867204666138, "num_tokens": 692610359.0, "step": 18151 }, { "epoch": 2.3091209769749397, "ewc_loss": 0.07009917497634888, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003445464826654643, "grad_norm": 8.271780967712402, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8727349042892456, "num_tokens": 692641560.0, "step": 18152 }, { "epoch": 2.30924818725353, "ewc_loss": 0.0698438286781311, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003419929416850209, "grad_norm": 8.200961112976074, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8617056608200073, "num_tokens": 692680114.0, "step": 18153 }, { "epoch": 2.3093753975321207, "ewc_loss": 0.07000108063220978, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034356553805992007, "grad_norm": 8.289567947387695, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8512982130050659, "num_tokens": 692714199.0, "step": 18154 }, { "epoch": 2.3095026078107113, "ewc_loss": 0.06981439888477325, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003416987310629338, "grad_norm": 8.1353178024292, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8675236701965332, "num_tokens": 692752331.0, "step": 18155 }, { "epoch": 2.309629818089302, "ewc_loss": 0.07006233930587769, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003441780572757125, "grad_norm": 8.296597480773926, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8705461025238037, "num_tokens": 692789167.0, "step": 18156 }, { "epoch": 2.3097570283678923, "ewc_loss": 0.06970978528261185, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003406525356695056, "grad_norm": 8.148573875427246, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8638533353805542, "num_tokens": 692829234.0, "step": 18157 }, { "epoch": 2.3098842386464824, "ewc_loss": 0.07000275701284409, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003435822727624327, "grad_norm": 8.329069137573242, "learning_rate": 1e-06, "loss": 0.5301, "mean_token_accuracy": 0.8437292575836182, "num_tokens": 692863749.0, "step": 18158 }, { "epoch": 2.3100114489250734, "ewc_loss": 0.06965567171573639, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034011134994216263, "grad_norm": 8.188090324401855, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8615647554397583, "num_tokens": 692900972.0, "step": 18159 }, { "epoch": 2.3101386592036635, "ewc_loss": 0.0699751079082489, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003433057281654328, "grad_norm": 8.271103858947754, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8581864833831787, "num_tokens": 692938522.0, "step": 18160 }, { "epoch": 2.310265869482254, "ewc_loss": 0.06963790953159332, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003399337292648852, "grad_norm": 8.205851554870605, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8735758066177368, "num_tokens": 692973653.0, "step": 18161 }, { "epoch": 2.3103930797608445, "ewc_loss": 0.06991463899612427, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003427011251915246, "grad_norm": 8.310611724853516, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8569332361221313, "num_tokens": 693016325.0, "step": 18162 }, { "epoch": 2.310520290039435, "ewc_loss": 0.06952264904975891, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033878113026730716, "grad_norm": 8.152362823486328, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8751356601715088, "num_tokens": 693052022.0, "step": 18163 }, { "epoch": 2.3106475003180256, "ewc_loss": 0.06988945603370667, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003424492897465825, "grad_norm": 8.249185562133789, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8688201308250427, "num_tokens": 693093936.0, "step": 18164 }, { "epoch": 2.310774710596616, "ewc_loss": 0.06961910426616669, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033974574762396514, "grad_norm": 8.260810852050781, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8788059949874878, "num_tokens": 693130160.0, "step": 18165 }, { "epoch": 2.3109019208752066, "ewc_loss": 0.0697004422545433, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003405591123737395, "grad_norm": 8.253152847290039, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8672851324081421, "num_tokens": 693172405.0, "step": 18166 }, { "epoch": 2.311029131153797, "ewc_loss": 0.06969140470027924, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003404686867725104, "grad_norm": 8.28365707397461, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8669730424880981, "num_tokens": 693206499.0, "step": 18167 }, { "epoch": 2.3111563414323877, "ewc_loss": 0.06965246796607971, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003400793648324907, "grad_norm": 8.29728889465332, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8635184168815613, "num_tokens": 693244287.0, "step": 18168 }, { "epoch": 2.311283551710978, "ewc_loss": 0.06956232339143753, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033917793189175427, "grad_norm": 8.232123374938965, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8555604219436646, "num_tokens": 693286164.0, "step": 18169 }, { "epoch": 2.3114107619895687, "ewc_loss": 0.06957574933767319, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003393121878616512, "grad_norm": 8.203805923461914, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.871044933795929, "num_tokens": 693330961.0, "step": 18170 }, { "epoch": 2.3115379722681593, "ewc_loss": 0.06959836184978485, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033953835372813046, "grad_norm": 8.201900482177734, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8695849180221558, "num_tokens": 693375169.0, "step": 18171 }, { "epoch": 2.31166518254675, "ewc_loss": 0.0696445181965828, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033999988227151334, "grad_norm": 8.289294242858887, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.858826756477356, "num_tokens": 693414127.0, "step": 18172 }, { "epoch": 2.3117923928253403, "ewc_loss": 0.0695517510175705, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033907219767570496, "grad_norm": 8.26349925994873, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8610109686851501, "num_tokens": 693450446.0, "step": 18173 }, { "epoch": 2.311919603103931, "ewc_loss": 0.06961855292320251, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033974021789617836, "grad_norm": 8.279842376708984, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8542873859405518, "num_tokens": 693490973.0, "step": 18174 }, { "epoch": 2.3120468133825214, "ewc_loss": 0.069522425532341, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033877891837619245, "grad_norm": 8.287530899047852, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8679307103157043, "num_tokens": 693526815.0, "step": 18175 }, { "epoch": 2.312174023661112, "ewc_loss": 0.0695779025554657, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003393337538000196, "grad_norm": 8.297998428344727, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8577966690063477, "num_tokens": 693566949.0, "step": 18176 }, { "epoch": 2.3123012339397024, "ewc_loss": 0.06951555609703064, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003387102042324841, "grad_norm": 8.299233436584473, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8573876619338989, "num_tokens": 693601116.0, "step": 18177 }, { "epoch": 2.312428444218293, "ewc_loss": 0.06949695944786072, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033852425985969603, "grad_norm": 8.17098331451416, "learning_rate": 1e-06, "loss": 0.3822, "mean_token_accuracy": 0.8947142958641052, "num_tokens": 693640588.0, "step": 18178 }, { "epoch": 2.3125556544968835, "ewc_loss": 0.06960777938365936, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003396324464119971, "grad_norm": 8.224812507629395, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8655456304550171, "num_tokens": 693683352.0, "step": 18179 }, { "epoch": 2.312682864775474, "ewc_loss": 0.06947885453701019, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00033834323403425515, "grad_norm": 8.230253219604492, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8692365884780884, "num_tokens": 693722579.0, "step": 18180 }, { "epoch": 2.3128100750540646, "ewc_loss": 0.06960640847682953, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003396187676116824, "grad_norm": 8.222268104553223, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8651326894760132, "num_tokens": 693766743.0, "step": 18181 }, { "epoch": 2.312937285332655, "ewc_loss": 0.06961268186569214, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003396814863663167, "grad_norm": 8.198844909667969, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8673983812332153, "num_tokens": 693809699.0, "step": 18182 }, { "epoch": 2.313064495611245, "ewc_loss": 0.06966093182563782, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003401639696676284, "grad_norm": 8.295442581176758, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8695641756057739, "num_tokens": 693845508.0, "step": 18183 }, { "epoch": 2.313191705889836, "ewc_loss": 0.06952869892120361, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003388416371308267, "grad_norm": 8.173301696777344, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.864604115486145, "num_tokens": 693883763.0, "step": 18184 }, { "epoch": 2.313318916168426, "ewc_loss": 0.06971919536590576, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034074659924954176, "grad_norm": 8.251872062683105, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8699957728385925, "num_tokens": 693917093.0, "step": 18185 }, { "epoch": 2.3134461264470167, "ewc_loss": 0.06966612488031387, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034021594910882413, "grad_norm": 8.170486450195312, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8659340143203735, "num_tokens": 693949219.0, "step": 18186 }, { "epoch": 2.3135733367256073, "ewc_loss": 0.06975799798965454, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003411346988286823, "grad_norm": 8.155598640441895, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8587059378623962, "num_tokens": 693992755.0, "step": 18187 }, { "epoch": 2.313700547004198, "ewc_loss": 0.06984302401542664, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003419848799239844, "grad_norm": 8.222381591796875, "learning_rate": 1e-06, "loss": 0.5223, "mean_token_accuracy": 0.851096510887146, "num_tokens": 694030386.0, "step": 18188 }, { "epoch": 2.3138277572827883, "ewc_loss": 0.06987455487251282, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003423002490308136, "grad_norm": 8.209802627563477, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8728286027908325, "num_tokens": 694065019.0, "step": 18189 }, { "epoch": 2.313954967561379, "ewc_loss": 0.06994485855102539, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000343003252055496, "grad_norm": 8.219825744628906, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8622957468032837, "num_tokens": 694106645.0, "step": 18190 }, { "epoch": 2.3140821778399694, "ewc_loss": 0.06981345266103745, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034168921411037445, "grad_norm": 8.226234436035156, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8691831231117249, "num_tokens": 694149146.0, "step": 18191 }, { "epoch": 2.31420938811856, "ewc_loss": 0.06989756226539612, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034253031481057405, "grad_norm": 8.2108793258667, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8485361337661743, "num_tokens": 694193567.0, "step": 18192 }, { "epoch": 2.3143365983971504, "ewc_loss": 0.0699407309293747, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034296195372007787, "grad_norm": 8.182098388671875, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8764829635620117, "num_tokens": 694232160.0, "step": 18193 }, { "epoch": 2.314463808675741, "ewc_loss": 0.06987044960260391, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003422591835260391, "grad_norm": 8.169078826904297, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8665649890899658, "num_tokens": 694275153.0, "step": 18194 }, { "epoch": 2.3145910189543315, "ewc_loss": 0.06989879906177521, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034254262573085725, "grad_norm": 8.218843460083008, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.870431125164032, "num_tokens": 694313722.0, "step": 18195 }, { "epoch": 2.314718229232922, "ewc_loss": 0.07000240683555603, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003435787803027779, "grad_norm": 8.257413864135742, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8686811327934265, "num_tokens": 694350544.0, "step": 18196 }, { "epoch": 2.3148454395115126, "ewc_loss": 0.0698883980512619, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034243869595229626, "grad_norm": 8.264780044555664, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8663549423217773, "num_tokens": 694386552.0, "step": 18197 }, { "epoch": 2.314972649790103, "ewc_loss": 0.06993422657251358, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000342896964866668, "grad_norm": 8.191742897033691, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.860523521900177, "num_tokens": 694434368.0, "step": 18198 }, { "epoch": 2.3150998600686936, "ewc_loss": 0.06996835768222809, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034323829459026456, "grad_norm": 8.189850807189941, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8637953996658325, "num_tokens": 694479856.0, "step": 18199 }, { "epoch": 2.315227070347284, "ewc_loss": 0.07006430625915527, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034419778967276216, "grad_norm": 8.234647750854492, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.86451256275177, "num_tokens": 694521107.0, "step": 18200 }, { "epoch": 2.3153542806258747, "ewc_loss": 0.06989122927188873, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003424669266678393, "grad_norm": 8.209329605102539, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8649718761444092, "num_tokens": 694560886.0, "step": 18201 }, { "epoch": 2.315481490904465, "ewc_loss": 0.06993897259235382, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003429444332141429, "grad_norm": 8.202033996582031, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8690667748451233, "num_tokens": 694598404.0, "step": 18202 }, { "epoch": 2.3156087011830557, "ewc_loss": 0.07005460560321808, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003441007575020194, "grad_norm": 8.238816261291504, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8593193292617798, "num_tokens": 694637162.0, "step": 18203 }, { "epoch": 2.3157359114616463, "ewc_loss": 0.06985338777303696, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003420885477680713, "grad_norm": 8.162710189819336, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8684154748916626, "num_tokens": 694677435.0, "step": 18204 }, { "epoch": 2.315863121740237, "ewc_loss": 0.07019466161727905, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034550135023891926, "grad_norm": 8.264679908752441, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8737441301345825, "num_tokens": 694713980.0, "step": 18205 }, { "epoch": 2.3159903320188273, "ewc_loss": 0.06983113288879395, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034186599077656865, "grad_norm": 8.128094673156738, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8608806133270264, "num_tokens": 694753884.0, "step": 18206 }, { "epoch": 2.316117542297418, "ewc_loss": 0.07033073902130127, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034686209983192384, "grad_norm": 8.298611640930176, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8782713413238525, "num_tokens": 694785739.0, "step": 18207 }, { "epoch": 2.316244752576008, "ewc_loss": 0.06989237666130066, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003424784808885306, "grad_norm": 8.177201271057129, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8644409775733948, "num_tokens": 694827629.0, "step": 18208 }, { "epoch": 2.316371962854599, "ewc_loss": 0.07033564895391464, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003469111688900739, "grad_norm": 8.270029067993164, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8611196279525757, "num_tokens": 694868241.0, "step": 18209 }, { "epoch": 2.316499173133189, "ewc_loss": 0.07002273201942444, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034378195414319634, "grad_norm": 8.192704200744629, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8550108671188354, "num_tokens": 694905298.0, "step": 18210 }, { "epoch": 2.3166263834117795, "ewc_loss": 0.07026447355747223, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034619943471625447, "grad_norm": 8.296409606933594, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8575760126113892, "num_tokens": 694938054.0, "step": 18211 }, { "epoch": 2.31675359369037, "ewc_loss": 0.06983055174350739, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003418602282181382, "grad_norm": 8.140634536743164, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8600238561630249, "num_tokens": 694978761.0, "step": 18212 }, { "epoch": 2.3168808039689606, "ewc_loss": 0.07033628225326538, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000346917484421283, "grad_norm": 8.265768051147461, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.860448956489563, "num_tokens": 695015073.0, "step": 18213 }, { "epoch": 2.317008014247551, "ewc_loss": 0.06994034349918365, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003429580829106271, "grad_norm": 8.182761192321777, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8587918877601624, "num_tokens": 695052288.0, "step": 18214 }, { "epoch": 2.3171352245261416, "ewc_loss": 0.07023011893033981, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003458558931015432, "grad_norm": 8.253868103027344, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8537248373031616, "num_tokens": 695088976.0, "step": 18215 }, { "epoch": 2.317262434804732, "ewc_loss": 0.07009246945381165, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003444793983362615, "grad_norm": 8.172868728637695, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.868767261505127, "num_tokens": 695128482.0, "step": 18216 }, { "epoch": 2.3173896450833227, "ewc_loss": 0.07023539394140244, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003459086292423308, "grad_norm": 8.23097038269043, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8608102202415466, "num_tokens": 695164963.0, "step": 18217 }, { "epoch": 2.317516855361913, "ewc_loss": 0.07012967020273209, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003448514034971595, "grad_norm": 8.19197940826416, "learning_rate": 1e-06, "loss": 0.4021, "mean_token_accuracy": 0.8863072395324707, "num_tokens": 695201515.0, "step": 18218 }, { "epoch": 2.3176440656405037, "ewc_loss": 0.07038871943950653, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000347441848134622, "grad_norm": 8.276440620422363, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8708333969116211, "num_tokens": 695242057.0, "step": 18219 }, { "epoch": 2.3177712759190943, "ewc_loss": 0.07014133036136627, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003449680225457996, "grad_norm": 8.190361976623535, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8657531142234802, "num_tokens": 695277128.0, "step": 18220 }, { "epoch": 2.317898486197685, "ewc_loss": 0.07039546966552734, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034750933991745114, "grad_norm": 8.309298515319824, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8783485889434814, "num_tokens": 695311984.0, "step": 18221 }, { "epoch": 2.3180256964762753, "ewc_loss": 0.07008423656225204, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034439703449606895, "grad_norm": 8.217866897583008, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8517464399337769, "num_tokens": 695348457.0, "step": 18222 }, { "epoch": 2.318152906754866, "ewc_loss": 0.07021132111549377, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003456678823567927, "grad_norm": 8.198140144348145, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8675792813301086, "num_tokens": 695386518.0, "step": 18223 }, { "epoch": 2.3182801170334564, "ewc_loss": 0.07025322318077087, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003460869484115392, "grad_norm": 8.297759056091309, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8608301877975464, "num_tokens": 695425365.0, "step": 18224 }, { "epoch": 2.318407327312047, "ewc_loss": 0.07009698450565338, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003445245383772999, "grad_norm": 8.209311485290527, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8660378456115723, "num_tokens": 695459559.0, "step": 18225 }, { "epoch": 2.3185345375906374, "ewc_loss": 0.07030637562274933, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034661844256334007, "grad_norm": 8.298612594604492, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8603936433792114, "num_tokens": 695491386.0, "step": 18226 }, { "epoch": 2.318661747869228, "ewc_loss": 0.07005681097507477, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003441228182055056, "grad_norm": 8.188773155212402, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8470351696014404, "num_tokens": 695534660.0, "step": 18227 }, { "epoch": 2.3187889581478185, "ewc_loss": 0.07039204239845276, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003474751429166645, "grad_norm": 8.30912971496582, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8561478853225708, "num_tokens": 695578127.0, "step": 18228 }, { "epoch": 2.318916168426409, "ewc_loss": 0.06990598142147064, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034261448308825493, "grad_norm": 8.191817283630371, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8520870804786682, "num_tokens": 695614181.0, "step": 18229 }, { "epoch": 2.3190433787049995, "ewc_loss": 0.07023528218269348, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003459074650891125, "grad_norm": 8.211849212646484, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8634796142578125, "num_tokens": 695652753.0, "step": 18230 }, { "epoch": 2.3191705889835896, "ewc_loss": 0.06996968388557434, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034325153683312237, "grad_norm": 8.130499839782715, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8715949654579163, "num_tokens": 695692449.0, "step": 18231 }, { "epoch": 2.3192977992621806, "ewc_loss": 0.07034805417060852, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034703523851931095, "grad_norm": 8.270240783691406, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.865147054195404, "num_tokens": 695730812.0, "step": 18232 }, { "epoch": 2.3194250095407707, "ewc_loss": 0.07000401616096497, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034359481651335955, "grad_norm": 8.1863431930542, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8798139095306396, "num_tokens": 695767577.0, "step": 18233 }, { "epoch": 2.319552219819361, "ewc_loss": 0.0702887773513794, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034644242259673774, "grad_norm": 8.211335182189941, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8639101386070251, "num_tokens": 695812372.0, "step": 18234 }, { "epoch": 2.3196794300979517, "ewc_loss": 0.07011331617832184, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003446878690738231, "grad_norm": 8.178382873535156, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8809847831726074, "num_tokens": 695847470.0, "step": 18235 }, { "epoch": 2.3198066403765423, "ewc_loss": 0.07027491927146912, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003463038883637637, "grad_norm": 8.210744857788086, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8802459239959717, "num_tokens": 695888082.0, "step": 18236 }, { "epoch": 2.319933850655133, "ewc_loss": 0.07019960880279541, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003455508267506957, "grad_norm": 8.194777488708496, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8731163740158081, "num_tokens": 695928904.0, "step": 18237 }, { "epoch": 2.3200610609337233, "ewc_loss": 0.07035596668720245, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003471143136266619, "grad_norm": 8.327041625976562, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8528114557266235, "num_tokens": 695967549.0, "step": 18238 }, { "epoch": 2.320188271212314, "ewc_loss": 0.07002803683280945, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034383509773761034, "grad_norm": 8.17190933227539, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8593952059745789, "num_tokens": 696005611.0, "step": 18239 }, { "epoch": 2.3203154814909044, "ewc_loss": 0.07036908715963364, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034724557190202177, "grad_norm": 8.322918891906738, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8678080439567566, "num_tokens": 696034864.0, "step": 18240 }, { "epoch": 2.320442691769495, "ewc_loss": 0.06983692944049835, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003419239947106689, "grad_norm": 8.169036865234375, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8620216846466064, "num_tokens": 696072137.0, "step": 18241 }, { "epoch": 2.3205699020480854, "ewc_loss": 0.0703132301568985, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003466870402917266, "grad_norm": 8.23945426940918, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8671172857284546, "num_tokens": 696115744.0, "step": 18242 }, { "epoch": 2.320697112326676, "ewc_loss": 0.06991112232208252, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034266593866050243, "grad_norm": 8.139351844787598, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.859683632850647, "num_tokens": 696151293.0, "step": 18243 }, { "epoch": 2.3208243226052665, "ewc_loss": 0.07030443847179413, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003465990594122559, "grad_norm": 8.211264610290527, "learning_rate": 1e-06, "loss": 0.4062, "mean_token_accuracy": 0.8834071159362793, "num_tokens": 696190742.0, "step": 18244 }, { "epoch": 2.320951532883857, "ewc_loss": 0.07001000642776489, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034365474130026996, "grad_norm": 8.157787322998047, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8538111448287964, "num_tokens": 696232807.0, "step": 18245 }, { "epoch": 2.3210787431624476, "ewc_loss": 0.07007481157779694, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003443028253968805, "grad_norm": 8.18244457244873, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8621826767921448, "num_tokens": 696269416.0, "step": 18246 }, { "epoch": 2.321205953441038, "ewc_loss": 0.07026992738246918, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034625400439836085, "grad_norm": 8.251642227172852, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8666828274726868, "num_tokens": 696305144.0, "step": 18247 }, { "epoch": 2.3213331637196286, "ewc_loss": 0.07010936737060547, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034464840427972376, "grad_norm": 8.168737411499023, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8562491536140442, "num_tokens": 696344854.0, "step": 18248 }, { "epoch": 2.321460373998219, "ewc_loss": 0.07025009393692017, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003460556035861373, "grad_norm": 8.213032722473145, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8838223218917847, "num_tokens": 696380154.0, "step": 18249 }, { "epoch": 2.3215875842768097, "ewc_loss": 0.07026353478431702, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003461900632828474, "grad_norm": 8.235008239746094, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8509608507156372, "num_tokens": 696416458.0, "step": 18250 }, { "epoch": 2.3217147945554, "ewc_loss": 0.07034631073474884, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003470178344286978, "grad_norm": 8.223397254943848, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8659647703170776, "num_tokens": 696453232.0, "step": 18251 }, { "epoch": 2.3218420048339907, "ewc_loss": 0.07016457617282867, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000345200503943488, "grad_norm": 8.272080421447754, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8440378904342651, "num_tokens": 696487930.0, "step": 18252 }, { "epoch": 2.3219692151125813, "ewc_loss": 0.07019856572151184, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034554029116407037, "grad_norm": 8.288320541381836, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8549723625183105, "num_tokens": 696523824.0, "step": 18253 }, { "epoch": 2.322096425391172, "ewc_loss": 0.0700845792889595, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003444004978518933, "grad_norm": 8.15073299407959, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8743788003921509, "num_tokens": 696567007.0, "step": 18254 }, { "epoch": 2.3222236356697623, "ewc_loss": 0.07031149417161942, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034666963620111346, "grad_norm": 8.196890830993652, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.88123619556427, "num_tokens": 696610055.0, "step": 18255 }, { "epoch": 2.3223508459483524, "ewc_loss": 0.07005926966667175, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034414741094224155, "grad_norm": 8.233881950378418, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8505253791809082, "num_tokens": 696644058.0, "step": 18256 }, { "epoch": 2.3224780562269434, "ewc_loss": 0.07023297250270844, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034588444395922124, "grad_norm": 8.192483901977539, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8730093240737915, "num_tokens": 696686381.0, "step": 18257 }, { "epoch": 2.3226052665055335, "ewc_loss": 0.07018636167049408, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034541828790679574, "grad_norm": 8.236729621887207, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8598150610923767, "num_tokens": 696727567.0, "step": 18258 }, { "epoch": 2.322732476784124, "ewc_loss": 0.07010094821453094, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034456420689821243, "grad_norm": 8.152423858642578, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8558042645454407, "num_tokens": 696767266.0, "step": 18259 }, { "epoch": 2.3228596870627145, "ewc_loss": 0.07026997208595276, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034625435364432633, "grad_norm": 8.226494789123535, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8782455921173096, "num_tokens": 696806059.0, "step": 18260 }, { "epoch": 2.322986897341305, "ewc_loss": 0.070123590528965, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003447905764915049, "grad_norm": 8.276130676269531, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.862777829170227, "num_tokens": 696844232.0, "step": 18261 }, { "epoch": 2.3231141076198956, "ewc_loss": 0.07015341520309448, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034508880344219506, "grad_norm": 8.171622276306152, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8670028448104858, "num_tokens": 696886223.0, "step": 18262 }, { "epoch": 2.323241317898486, "ewc_loss": 0.07023371756076813, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034589183633215725, "grad_norm": 8.198136329650879, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8511002063751221, "num_tokens": 696922625.0, "step": 18263 }, { "epoch": 2.3233685281770766, "ewc_loss": 0.07016962766647339, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034525091177783906, "grad_norm": 8.163822174072266, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8521131277084351, "num_tokens": 696965160.0, "step": 18264 }, { "epoch": 2.323495738455667, "ewc_loss": 0.07036974281072617, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034725212026387453, "grad_norm": 8.24012565612793, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8703052401542664, "num_tokens": 697000854.0, "step": 18265 }, { "epoch": 2.3236229487342577, "ewc_loss": 0.07018917798995972, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034544646041467786, "grad_norm": 8.186002731323242, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.879662811756134, "num_tokens": 697041264.0, "step": 18266 }, { "epoch": 2.323750159012848, "ewc_loss": 0.07036581635475159, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034721283009275794, "grad_norm": 8.213051795959473, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8620597124099731, "num_tokens": 697080079.0, "step": 18267 }, { "epoch": 2.3238773692914387, "ewc_loss": 0.07022581994533539, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003458129067439586, "grad_norm": 8.222761154174805, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8610537648200989, "num_tokens": 697123755.0, "step": 18268 }, { "epoch": 2.3240045795700293, "ewc_loss": 0.07034462690353394, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034700092510320246, "grad_norm": 8.245771408081055, "learning_rate": 1e-06, "loss": 0.5601, "mean_token_accuracy": 0.835025429725647, "num_tokens": 697162012.0, "step": 18269 }, { "epoch": 2.32413178984862, "ewc_loss": 0.0701909139752388, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034546383540146053, "grad_norm": 8.197315216064453, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8739194869995117, "num_tokens": 697200188.0, "step": 18270 }, { "epoch": 2.3242590001272103, "ewc_loss": 0.07020576298236847, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034561226493678987, "grad_norm": 8.234824180603027, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8675010800361633, "num_tokens": 697240536.0, "step": 18271 }, { "epoch": 2.324386210405801, "ewc_loss": 0.07017716765403748, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003453263489063829, "grad_norm": 8.251276016235352, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8761920928955078, "num_tokens": 697279459.0, "step": 18272 }, { "epoch": 2.3245134206843914, "ewc_loss": 0.07012741267681122, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003448288480285555, "grad_norm": 8.208626747131348, "learning_rate": 1e-06, "loss": 0.4011, "mean_token_accuracy": 0.8830459713935852, "num_tokens": 697312594.0, "step": 18273 }, { "epoch": 2.324640630962982, "ewc_loss": 0.07022561132907867, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034581084037199616, "grad_norm": 8.261041641235352, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.850090503692627, "num_tokens": 697357834.0, "step": 18274 }, { "epoch": 2.3247678412415724, "ewc_loss": 0.070001021027565, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034356486867181957, "grad_norm": 8.184138298034668, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8643169403076172, "num_tokens": 697401522.0, "step": 18275 }, { "epoch": 2.324895051520163, "ewc_loss": 0.07025414705276489, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034609620342962444, "grad_norm": 8.275096893310547, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8610203266143799, "num_tokens": 697440415.0, "step": 18276 }, { "epoch": 2.3250222617987535, "ewc_loss": 0.07003340125083923, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034388864878565073, "grad_norm": 8.20459270477295, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8483017086982727, "num_tokens": 697477609.0, "step": 18277 }, { "epoch": 2.325149472077344, "ewc_loss": 0.0702558234333992, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003461129090283066, "grad_norm": 8.231573104858398, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8600649237632751, "num_tokens": 697515566.0, "step": 18278 }, { "epoch": 2.3252766823559345, "ewc_loss": 0.07010649144649506, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003446196496952325, "grad_norm": 8.233858108520508, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8693029880523682, "num_tokens": 697549537.0, "step": 18279 }, { "epoch": 2.325403892634525, "ewc_loss": 0.07011248171329498, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034467948717065156, "grad_norm": 8.20389461517334, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8690667748451233, "num_tokens": 697582618.0, "step": 18280 }, { "epoch": 2.325531102913115, "ewc_loss": 0.07005646079778671, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034411929664202034, "grad_norm": 8.255599975585938, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8776801228523254, "num_tokens": 697622234.0, "step": 18281 }, { "epoch": 2.325658313191706, "ewc_loss": 0.06999947130680084, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003435494436416775, "grad_norm": 8.13896369934082, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8564938902854919, "num_tokens": 697657504.0, "step": 18282 }, { "epoch": 2.325785523470296, "ewc_loss": 0.07016106694936752, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034516534651629627, "grad_norm": 8.307064056396484, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8514101505279541, "num_tokens": 697699496.0, "step": 18283 }, { "epoch": 2.3259127337488867, "ewc_loss": 0.06991040706634521, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003426587791182101, "grad_norm": 8.107906341552734, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8781598806381226, "num_tokens": 697739969.0, "step": 18284 }, { "epoch": 2.3260399440274773, "ewc_loss": 0.07041376084089279, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034769228659570217, "grad_norm": 8.342170715332031, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8535262942314148, "num_tokens": 697774346.0, "step": 18285 }, { "epoch": 2.326167154306068, "ewc_loss": 0.06973966211080551, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003409513155929744, "grad_norm": 8.123383522033691, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.850441575050354, "num_tokens": 697813013.0, "step": 18286 }, { "epoch": 2.3262943645846583, "ewc_loss": 0.07054698467254639, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003490245435386896, "grad_norm": 8.341216087341309, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8574333190917969, "num_tokens": 697854163.0, "step": 18287 }, { "epoch": 2.326421574863249, "ewc_loss": 0.06977146863937378, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034126933314837515, "grad_norm": 8.100948333740234, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8722515106201172, "num_tokens": 697898049.0, "step": 18288 }, { "epoch": 2.3265487851418394, "ewc_loss": 0.07049767673015594, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003485314373392612, "grad_norm": 8.2504243850708, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8675072193145752, "num_tokens": 697936086.0, "step": 18289 }, { "epoch": 2.32667599542043, "ewc_loss": 0.06989644467830658, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034251913893967867, "grad_norm": 8.111313819885254, "learning_rate": 1e-06, "loss": 0.5586, "mean_token_accuracy": 0.841322660446167, "num_tokens": 697981529.0, "step": 18290 }, { "epoch": 2.3268032056990204, "ewc_loss": 0.07039438188076019, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003474985423963517, "grad_norm": 8.284439086914062, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8810420036315918, "num_tokens": 698016047.0, "step": 18291 }, { "epoch": 2.326930415977611, "ewc_loss": 0.06998848915100098, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034343957668170333, "grad_norm": 8.171327590942383, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8766424655914307, "num_tokens": 698052408.0, "step": 18292 }, { "epoch": 2.3270576262562015, "ewc_loss": 0.0703057199716568, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034661186509765685, "grad_norm": 8.294994354248047, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8575048446655273, "num_tokens": 698089751.0, "step": 18293 }, { "epoch": 2.327184836534792, "ewc_loss": 0.06999857723712921, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034354045055806637, "grad_norm": 8.237865447998047, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8624519109725952, "num_tokens": 698121274.0, "step": 18294 }, { "epoch": 2.3273120468133826, "ewc_loss": 0.07018794119358063, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003454340621829033, "grad_norm": 8.1983003616333, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8749294281005859, "num_tokens": 698155880.0, "step": 18295 }, { "epoch": 2.327439257091973, "ewc_loss": 0.07011730968952179, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003447277704253793, "grad_norm": 8.278393745422363, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.868091881275177, "num_tokens": 698191448.0, "step": 18296 }, { "epoch": 2.3275664673705636, "ewc_loss": 0.07009053230285645, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003444599569775164, "grad_norm": 8.214178085327148, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8607264757156372, "num_tokens": 698228437.0, "step": 18297 }, { "epoch": 2.327693677649154, "ewc_loss": 0.0701136365532875, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034469107049517334, "grad_norm": 8.174718856811523, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8567215204238892, "num_tokens": 698268714.0, "step": 18298 }, { "epoch": 2.3278208879277447, "ewc_loss": 0.07017922401428223, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003453469544183463, "grad_norm": 8.22690200805664, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8649160861968994, "num_tokens": 698305575.0, "step": 18299 }, { "epoch": 2.327948098206335, "ewc_loss": 0.07008041441440582, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034435882116667926, "grad_norm": 8.19676685333252, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8611476421356201, "num_tokens": 698347701.0, "step": 18300 }, { "epoch": 2.3280753084849257, "ewc_loss": 0.07021331787109375, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034568781848065555, "grad_norm": 8.201550483703613, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.881070077419281, "num_tokens": 698385906.0, "step": 18301 }, { "epoch": 2.3282025187635162, "ewc_loss": 0.06995740532875061, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034312871866859496, "grad_norm": 8.201706886291504, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8640366792678833, "num_tokens": 698415511.0, "step": 18302 }, { "epoch": 2.3283297290421068, "ewc_loss": 0.0702696219086647, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034625091939233243, "grad_norm": 8.20451545715332, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8597992658615112, "num_tokens": 698460528.0, "step": 18303 }, { "epoch": 2.3284569393206973, "ewc_loss": 0.07013298571109772, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003448845527600497, "grad_norm": 8.159126281738281, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.849661111831665, "num_tokens": 698500800.0, "step": 18304 }, { "epoch": 2.328584149599288, "ewc_loss": 0.07029810547828674, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034653572947718203, "grad_norm": 8.23828125, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8635352849960327, "num_tokens": 698539373.0, "step": 18305 }, { "epoch": 2.328711359877878, "ewc_loss": 0.07019039988517761, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003454586840234697, "grad_norm": 8.208136558532715, "learning_rate": 1e-06, "loss": 0.406, "mean_token_accuracy": 0.8858516216278076, "num_tokens": 698577067.0, "step": 18306 }, { "epoch": 2.328838570156469, "ewc_loss": 0.07027074694633484, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003462621243670583, "grad_norm": 8.244723320007324, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8641091585159302, "num_tokens": 698613515.0, "step": 18307 }, { "epoch": 2.328965780435059, "ewc_loss": 0.06999000906944275, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034345476888120174, "grad_norm": 8.18029499053955, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8662536144256592, "num_tokens": 698647278.0, "step": 18308 }, { "epoch": 2.3290929907136495, "ewc_loss": 0.07033825665712357, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034693724592216313, "grad_norm": 8.21646785736084, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8729016780853271, "num_tokens": 698688738.0, "step": 18309 }, { "epoch": 2.32922020099224, "ewc_loss": 0.0701790452003479, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003453451208770275, "grad_norm": 8.21982192993164, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.868428647518158, "num_tokens": 698725003.0, "step": 18310 }, { "epoch": 2.3293474112708306, "ewc_loss": 0.07027289271354675, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034628366120159626, "grad_norm": 8.218122482299805, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8481782078742981, "num_tokens": 698768205.0, "step": 18311 }, { "epoch": 2.329474621549421, "ewc_loss": 0.0703258216381073, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034681291435845196, "grad_norm": 8.276618003845215, "learning_rate": 1e-06, "loss": 0.5292, "mean_token_accuracy": 0.845055341720581, "num_tokens": 698805352.0, "step": 18312 }, { "epoch": 2.3296018318280116, "ewc_loss": 0.07019864022731781, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034554110607132316, "grad_norm": 8.301461219787598, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8755523562431335, "num_tokens": 698836610.0, "step": 18313 }, { "epoch": 2.329729042106602, "ewc_loss": 0.07010286301374435, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034458329901099205, "grad_norm": 8.173559188842773, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8733620643615723, "num_tokens": 698877514.0, "step": 18314 }, { "epoch": 2.3298562523851927, "ewc_loss": 0.07037140429019928, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003472687385510653, "grad_norm": 8.317951202392578, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8683609962463379, "num_tokens": 698916699.0, "step": 18315 }, { "epoch": 2.329983462663783, "ewc_loss": 0.07004222273826599, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034397694980725646, "grad_norm": 8.245460510253906, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8662832975387573, "num_tokens": 698953568.0, "step": 18316 }, { "epoch": 2.3301106729423737, "ewc_loss": 0.0702415257692337, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003459699801169336, "grad_norm": 8.266343116760254, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8605257272720337, "num_tokens": 698990562.0, "step": 18317 }, { "epoch": 2.3302378832209643, "ewc_loss": 0.06998102366924286, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003433649253565818, "grad_norm": 8.178573608398438, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8669103980064392, "num_tokens": 699024095.0, "step": 18318 }, { "epoch": 2.330365093499555, "ewc_loss": 0.07020243257284164, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003455790283624083, "grad_norm": 8.24786376953125, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8445475697517395, "num_tokens": 699062327.0, "step": 18319 }, { "epoch": 2.3304923037781453, "ewc_loss": 0.06996500492095947, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003432047669775784, "grad_norm": 8.161604881286621, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8691888451576233, "num_tokens": 699101591.0, "step": 18320 }, { "epoch": 2.330619514056736, "ewc_loss": 0.07025280594825745, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003460827865637839, "grad_norm": 8.348407745361328, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.85598224401474, "num_tokens": 699134836.0, "step": 18321 }, { "epoch": 2.3307467243353264, "ewc_loss": 0.06980206072330475, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003415752435103059, "grad_norm": 8.095698356628418, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8555219173431396, "num_tokens": 699171218.0, "step": 18322 }, { "epoch": 2.330873934613917, "ewc_loss": 0.07044018805027008, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034795652027241886, "grad_norm": 8.34758186340332, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8825784921646118, "num_tokens": 699200891.0, "step": 18323 }, { "epoch": 2.3310011448925074, "ewc_loss": 0.06982878595590591, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003418425330892205, "grad_norm": 8.131731986999512, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8807870149612427, "num_tokens": 699238080.0, "step": 18324 }, { "epoch": 2.331128355171098, "ewc_loss": 0.07034935802221298, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034704827703535557, "grad_norm": 8.234678268432617, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8531655073165894, "num_tokens": 699276051.0, "step": 18325 }, { "epoch": 2.3312555654496885, "ewc_loss": 0.06990645825862885, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003426192852202803, "grad_norm": 8.10501766204834, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8731763362884521, "num_tokens": 699315721.0, "step": 18326 }, { "epoch": 2.331382775728279, "ewc_loss": 0.07041212916374207, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034767595934681594, "grad_norm": 8.301104545593262, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8752801418304443, "num_tokens": 699350342.0, "step": 18327 }, { "epoch": 2.3315099860068695, "ewc_loss": 0.0698646754026413, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003422014706302434, "grad_norm": 8.10781478881836, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8699946403503418, "num_tokens": 699390301.0, "step": 18328 }, { "epoch": 2.3316371962854596, "ewc_loss": 0.07035240530967712, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003470787196420133, "grad_norm": 8.347764015197754, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.8489367961883545, "num_tokens": 699427135.0, "step": 18329 }, { "epoch": 2.3317644065640506, "ewc_loss": 0.06980263441801071, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034158103517256677, "grad_norm": 8.134029388427734, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8703237771987915, "num_tokens": 699466461.0, "step": 18330 }, { "epoch": 2.3318916168426407, "ewc_loss": 0.07038116455078125, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034736638190224767, "grad_norm": 8.305426597595215, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8582680225372314, "num_tokens": 699502446.0, "step": 18331 }, { "epoch": 2.332018827121231, "ewc_loss": 0.06979213654994965, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003414760867599398, "grad_norm": 8.13171672821045, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8649097681045532, "num_tokens": 699546357.0, "step": 18332 }, { "epoch": 2.3321460373998217, "ewc_loss": 0.07024218887090683, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003459765575826168, "grad_norm": 8.254243850708008, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8646091222763062, "num_tokens": 699581879.0, "step": 18333 }, { "epoch": 2.3322732476784123, "ewc_loss": 0.06991448998451233, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003426995826885104, "grad_norm": 8.17868709564209, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.878398060798645, "num_tokens": 699616287.0, "step": 18334 }, { "epoch": 2.332400457957003, "ewc_loss": 0.07015478610992432, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003451025695540011, "grad_norm": 8.233769416809082, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8615750074386597, "num_tokens": 699655409.0, "step": 18335 }, { "epoch": 2.3325276682355933, "ewc_loss": 0.06988968700170517, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034245155984535813, "grad_norm": 8.128007888793945, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8390449285507202, "num_tokens": 699701420.0, "step": 18336 }, { "epoch": 2.332654878514184, "ewc_loss": 0.07022389769554138, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034579364000819623, "grad_norm": 8.193025588989258, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.867465615272522, "num_tokens": 699736151.0, "step": 18337 }, { "epoch": 2.3327820887927744, "ewc_loss": 0.07002663612365723, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034382109879516065, "grad_norm": 8.269341468811035, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.85743248462677, "num_tokens": 699770111.0, "step": 18338 }, { "epoch": 2.332909299071365, "ewc_loss": 0.0700211226940155, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034376594703644514, "grad_norm": 8.16939640045166, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8618003726005554, "num_tokens": 699807442.0, "step": 18339 }, { "epoch": 2.3330365093499554, "ewc_loss": 0.0701226070523262, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034478073939681053, "grad_norm": 8.153937339782715, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8622848391532898, "num_tokens": 699849364.0, "step": 18340 }, { "epoch": 2.333163719628546, "ewc_loss": 0.07003333419561386, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034388803760521114, "grad_norm": 8.146855354309082, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8785530924797058, "num_tokens": 699886234.0, "step": 18341 }, { "epoch": 2.3332909299071365, "ewc_loss": 0.07030665874481201, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003466212365310639, "grad_norm": 8.242778778076172, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8460060954093933, "num_tokens": 699925660.0, "step": 18342 }, { "epoch": 2.333418140185727, "ewc_loss": 0.07014060020446777, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003449606883805245, "grad_norm": 8.198131561279297, "learning_rate": 1e-06, "loss": 0.5494, "mean_token_accuracy": 0.8387970924377441, "num_tokens": 699967894.0, "step": 18343 }, { "epoch": 2.3335453504643175, "ewc_loss": 0.0702361911535263, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034591660369187593, "grad_norm": 8.232108116149902, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.858156144618988, "num_tokens": 699999211.0, "step": 18344 }, { "epoch": 2.333672560742908, "ewc_loss": 0.07017858326435089, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003453404933679849, "grad_norm": 8.178552627563477, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8720982670783997, "num_tokens": 700037639.0, "step": 18345 }, { "epoch": 2.3337997710214986, "ewc_loss": 0.07020441442728043, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003455988480709493, "grad_norm": 8.181862831115723, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8575456142425537, "num_tokens": 700075677.0, "step": 18346 }, { "epoch": 2.333926981300089, "ewc_loss": 0.07019510120153427, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034550571581348777, "grad_norm": 8.18600082397461, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8710324168205261, "num_tokens": 700115697.0, "step": 18347 }, { "epoch": 2.3340541915786797, "ewc_loss": 0.07015369832515717, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003450916556175798, "grad_norm": 8.220392227172852, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.846105694770813, "num_tokens": 700152553.0, "step": 18348 }, { "epoch": 2.33418140185727, "ewc_loss": 0.07014782726764679, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034503298229537904, "grad_norm": 8.279366493225098, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8713140487670898, "num_tokens": 700190958.0, "step": 18349 }, { "epoch": 2.3343086121358607, "ewc_loss": 0.06998628377914429, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034341748687438667, "grad_norm": 8.217022895812988, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.860966145992279, "num_tokens": 700224244.0, "step": 18350 }, { "epoch": 2.3344358224144512, "ewc_loss": 0.07018262147903442, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034538094769231975, "grad_norm": 8.208551406860352, "learning_rate": 1e-06, "loss": 0.3804, "mean_token_accuracy": 0.8909395933151245, "num_tokens": 700259736.0, "step": 18351 }, { "epoch": 2.3345630326930418, "ewc_loss": 0.07003732025623322, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034392785164527595, "grad_norm": 8.173467636108398, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8651039600372314, "num_tokens": 700293624.0, "step": 18352 }, { "epoch": 2.3346902429716323, "ewc_loss": 0.07017254829406738, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003452801320236176, "grad_norm": 8.195735931396484, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8677730560302734, "num_tokens": 700328052.0, "step": 18353 }, { "epoch": 2.3348174532502224, "ewc_loss": 0.06994594633579254, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003430141368880868, "grad_norm": 8.162419319152832, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8577750325202942, "num_tokens": 700364652.0, "step": 18354 }, { "epoch": 2.3349446635288134, "ewc_loss": 0.07028328627347946, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034638753277249634, "grad_norm": 8.232085227966309, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8774425983428955, "num_tokens": 700403698.0, "step": 18355 }, { "epoch": 2.3350718738074034, "ewc_loss": 0.07001249492168427, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034367962507531047, "grad_norm": 8.169559478759766, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8787389397621155, "num_tokens": 700439391.0, "step": 18356 }, { "epoch": 2.335199084085994, "ewc_loss": 0.0702233612537384, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003457882849033922, "grad_norm": 8.292244911193848, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8657962083816528, "num_tokens": 700475095.0, "step": 18357 }, { "epoch": 2.3353262943645845, "ewc_loss": 0.07001982629299164, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000343752937624231, "grad_norm": 8.154767036437988, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8693439960479736, "num_tokens": 700514167.0, "step": 18358 }, { "epoch": 2.335453504643175, "ewc_loss": 0.07029003649950027, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034645505365915596, "grad_norm": 8.281679153442383, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8733991980552673, "num_tokens": 700548333.0, "step": 18359 }, { "epoch": 2.3355807149217656, "ewc_loss": 0.06990735232830048, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034262819099240005, "grad_norm": 8.145528793334961, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8691972494125366, "num_tokens": 700594261.0, "step": 18360 }, { "epoch": 2.335707925200356, "ewc_loss": 0.07024668157100677, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000346021493896842, "grad_norm": 8.26160717010498, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.871833086013794, "num_tokens": 700632513.0, "step": 18361 }, { "epoch": 2.3358351354789466, "ewc_loss": 0.06988385319709778, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003423932066652924, "grad_norm": 8.157369613647461, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8711881637573242, "num_tokens": 700672430.0, "step": 18362 }, { "epoch": 2.335962345757537, "ewc_loss": 0.07017974555492401, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003453521931078285, "grad_norm": 8.253081321716309, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8704293966293335, "num_tokens": 700714321.0, "step": 18363 }, { "epoch": 2.3360895560361277, "ewc_loss": 0.06985343247652054, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003420890134293586, "grad_norm": 8.194096565246582, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8645811080932617, "num_tokens": 700751714.0, "step": 18364 }, { "epoch": 2.336216766314718, "ewc_loss": 0.07011278718709946, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003446825430728495, "grad_norm": 8.2614107131958, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8628549575805664, "num_tokens": 700794087.0, "step": 18365 }, { "epoch": 2.3363439765933087, "ewc_loss": 0.06987346708774567, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003422893933020532, "grad_norm": 8.210365295410156, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8590087890625, "num_tokens": 700832031.0, "step": 18366 }, { "epoch": 2.3364711868718993, "ewc_loss": 0.07007762789726257, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003443309979047626, "grad_norm": 8.256218910217285, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.867448091506958, "num_tokens": 700867175.0, "step": 18367 }, { "epoch": 2.33659839715049, "ewc_loss": 0.06997493654489517, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003433040401432663, "grad_norm": 8.229104995727539, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8715466260910034, "num_tokens": 700906762.0, "step": 18368 }, { "epoch": 2.3367256074290803, "ewc_loss": 0.06990310549736023, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003425857285037637, "grad_norm": 8.21253490447998, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8671368360519409, "num_tokens": 700944316.0, "step": 18369 }, { "epoch": 2.336852817707671, "ewc_loss": 0.070090651512146, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034446120844222605, "grad_norm": 8.260666847229004, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8615648746490479, "num_tokens": 700979369.0, "step": 18370 }, { "epoch": 2.3369800279862614, "ewc_loss": 0.06996037065982819, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003431584336794913, "grad_norm": 8.22962760925293, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8621470332145691, "num_tokens": 701016213.0, "step": 18371 }, { "epoch": 2.337107238264852, "ewc_loss": 0.07000468671321869, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003436015103943646, "grad_norm": 8.221582412719727, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8618125915527344, "num_tokens": 701055270.0, "step": 18372 }, { "epoch": 2.3372344485434424, "ewc_loss": 0.06994228810071945, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034297758247703314, "grad_norm": 8.231701850891113, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8625697493553162, "num_tokens": 701091058.0, "step": 18373 }, { "epoch": 2.337361658822033, "ewc_loss": 0.06998652219772339, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003434199606999755, "grad_norm": 8.20441722869873, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8733497858047485, "num_tokens": 701129446.0, "step": 18374 }, { "epoch": 2.3374888691006235, "ewc_loss": 0.06996151804924011, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034316981327719986, "grad_norm": 8.24516487121582, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8688530325889587, "num_tokens": 701168726.0, "step": 18375 }, { "epoch": 2.337616079379214, "ewc_loss": 0.06993068009614944, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034286148729734123, "grad_norm": 8.219411849975586, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8602467775344849, "num_tokens": 701204041.0, "step": 18376 }, { "epoch": 2.3377432896578045, "ewc_loss": 0.07004288583993912, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034398355637677014, "grad_norm": 8.262370109558105, "learning_rate": 1e-06, "loss": 0.534, "mean_token_accuracy": 0.8422611355781555, "num_tokens": 701244660.0, "step": 18377 }, { "epoch": 2.337870499936395, "ewc_loss": 0.06994988024234772, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003430535434745252, "grad_norm": 8.2265043258667, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8550635576248169, "num_tokens": 701284830.0, "step": 18378 }, { "epoch": 2.337997710214985, "ewc_loss": 0.06998930871486664, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003434477257542312, "grad_norm": 8.184377670288086, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8713989853858948, "num_tokens": 701319720.0, "step": 18379 }, { "epoch": 2.338124920493576, "ewc_loss": 0.07004080712795258, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034396277624182403, "grad_norm": 8.224462509155273, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.844182014465332, "num_tokens": 701361770.0, "step": 18380 }, { "epoch": 2.338252130772166, "ewc_loss": 0.06998381018638611, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003433927777223289, "grad_norm": 8.2183198928833, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8635940551757812, "num_tokens": 701399777.0, "step": 18381 }, { "epoch": 2.3383793410507567, "ewc_loss": 0.06999580562114716, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003435127146076411, "grad_norm": 8.22931957244873, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.863741397857666, "num_tokens": 701432680.0, "step": 18382 }, { "epoch": 2.3385065513293473, "ewc_loss": 0.07004795223474503, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003440342261455953, "grad_norm": 8.22940731048584, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8753183484077454, "num_tokens": 701474101.0, "step": 18383 }, { "epoch": 2.338633761607938, "ewc_loss": 0.07011749595403671, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034472966217435896, "grad_norm": 8.25529670715332, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.876667857170105, "num_tokens": 701517568.0, "step": 18384 }, { "epoch": 2.3387609718865283, "ewc_loss": 0.07001476734876633, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003437023551668972, "grad_norm": 8.207796096801758, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.860145628452301, "num_tokens": 701558466.0, "step": 18385 }, { "epoch": 2.338888182165119, "ewc_loss": 0.07017017900943756, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003452565288171172, "grad_norm": 8.317660331726074, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.858359694480896, "num_tokens": 701591104.0, "step": 18386 }, { "epoch": 2.3390153924437094, "ewc_loss": 0.06991671770811081, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003427218471188098, "grad_norm": 8.17226505279541, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8580392599105835, "num_tokens": 701633242.0, "step": 18387 }, { "epoch": 2.3391426027223, "ewc_loss": 0.07025399804115295, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034609466092661023, "grad_norm": 8.253211975097656, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8634151220321655, "num_tokens": 701668864.0, "step": 18388 }, { "epoch": 2.3392698130008904, "ewc_loss": 0.06997722387313843, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003433269739616662, "grad_norm": 8.217166900634766, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8680016994476318, "num_tokens": 701703648.0, "step": 18389 }, { "epoch": 2.339397023279481, "ewc_loss": 0.07006213068962097, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003441759617999196, "grad_norm": 8.209451675415039, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8619428873062134, "num_tokens": 701744417.0, "step": 18390 }, { "epoch": 2.3395242335580715, "ewc_loss": 0.07014510035514832, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003450056829024106, "grad_norm": 8.248564720153809, "learning_rate": 1e-06, "loss": 0.5515, "mean_token_accuracy": 0.8330270051956177, "num_tokens": 701783541.0, "step": 18391 }, { "epoch": 2.339651443836662, "ewc_loss": 0.07005193829536438, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003440740401856601, "grad_norm": 8.212646484375, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8608304262161255, "num_tokens": 701825054.0, "step": 18392 }, { "epoch": 2.3397786541152525, "ewc_loss": 0.07014600932598114, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034501473419368267, "grad_norm": 8.243287086486816, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8728000521659851, "num_tokens": 701866173.0, "step": 18393 }, { "epoch": 2.339905864393843, "ewc_loss": 0.07003822922706604, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003439370193518698, "grad_norm": 8.223421096801758, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8603417277336121, "num_tokens": 701904812.0, "step": 18394 }, { "epoch": 2.3400330746724336, "ewc_loss": 0.07017984986305237, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034535315353423357, "grad_norm": 8.255195617675781, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8537794351577759, "num_tokens": 701942590.0, "step": 18395 }, { "epoch": 2.340160284951024, "ewc_loss": 0.06999944150447845, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034354906529188156, "grad_norm": 8.239068984985352, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8672820925712585, "num_tokens": 701980773.0, "step": 18396 }, { "epoch": 2.3402874952296147, "ewc_loss": 0.0701960101723671, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034551479620859027, "grad_norm": 8.256623268127441, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8761223554611206, "num_tokens": 702014508.0, "step": 18397 }, { "epoch": 2.340414705508205, "ewc_loss": 0.07007330656051636, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034428772050887346, "grad_norm": 8.20960521697998, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8771584033966064, "num_tokens": 702055754.0, "step": 18398 }, { "epoch": 2.3405419157867957, "ewc_loss": 0.07005597651004791, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034411443630233407, "grad_norm": 8.208096504211426, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8631864786148071, "num_tokens": 702100022.0, "step": 18399 }, { "epoch": 2.3406691260653862, "ewc_loss": 0.07013669610023499, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003449216310400516, "grad_norm": 8.281495094299316, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8528487086296082, "num_tokens": 702133217.0, "step": 18400 }, { "epoch": 2.3407963363439768, "ewc_loss": 0.0700419694185257, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003439743595663458, "grad_norm": 8.23263931274414, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8772990703582764, "num_tokens": 702176998.0, "step": 18401 }, { "epoch": 2.3409235466225673, "ewc_loss": 0.07014905661344528, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034504523500800133, "grad_norm": 8.273080825805664, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8715766668319702, "num_tokens": 702216219.0, "step": 18402 }, { "epoch": 2.341050756901158, "ewc_loss": 0.0700235664844513, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003437903069425374, "grad_norm": 8.277342796325684, "learning_rate": 1e-06, "loss": 0.5085, "mean_token_accuracy": 0.8597806096076965, "num_tokens": 702246502.0, "step": 18403 }, { "epoch": 2.341177967179748, "ewc_loss": 0.07009963691234589, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003445511101745069, "grad_norm": 8.306609153747559, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8723665475845337, "num_tokens": 702277990.0, "step": 18404 }, { "epoch": 2.341305177458339, "ewc_loss": 0.06990676373243332, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003426223120186478, "grad_norm": 8.219027519226074, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8656895160675049, "num_tokens": 702312742.0, "step": 18405 }, { "epoch": 2.341432387736929, "ewc_loss": 0.07016290724277496, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034518371103331447, "grad_norm": 8.259700775146484, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8796292543411255, "num_tokens": 702348575.0, "step": 18406 }, { "epoch": 2.3415595980155195, "ewc_loss": 0.06983326375484467, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003418873529881239, "grad_norm": 8.153497695922852, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8525522947311401, "num_tokens": 702385956.0, "step": 18407 }, { "epoch": 2.34168680829411, "ewc_loss": 0.0702502653002739, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003460573498159647, "grad_norm": 8.340065002441406, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8705785274505615, "num_tokens": 702423973.0, "step": 18408 }, { "epoch": 2.3418140185727006, "ewc_loss": 0.06986810266971588, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003422356676310301, "grad_norm": 8.232562065124512, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.860657274723053, "num_tokens": 702455508.0, "step": 18409 }, { "epoch": 2.341941228851291, "ewc_loss": 0.07018466293811798, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003454013494774699, "grad_norm": 8.299903869628906, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8769345283508301, "num_tokens": 702493365.0, "step": 18410 }, { "epoch": 2.3420684391298816, "ewc_loss": 0.0699407309293747, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034296204103156924, "grad_norm": 8.161484718322754, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8669166564941406, "num_tokens": 702528036.0, "step": 18411 }, { "epoch": 2.342195649408472, "ewc_loss": 0.07019919902086258, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034554669400677085, "grad_norm": 8.283971786499023, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8596245050430298, "num_tokens": 702571238.0, "step": 18412 }, { "epoch": 2.3423228596870627, "ewc_loss": 0.07003383338451385, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034389307256788015, "grad_norm": 8.253802299499512, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8602403402328491, "num_tokens": 702611247.0, "step": 18413 }, { "epoch": 2.342450069965653, "ewc_loss": 0.070177361369133, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034532826975919306, "grad_norm": 8.266098022460938, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8669912815093994, "num_tokens": 702649661.0, "step": 18414 }, { "epoch": 2.3425772802442437, "ewc_loss": 0.07005348056554794, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034408949431963265, "grad_norm": 8.177440643310547, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8571052551269531, "num_tokens": 702693573.0, "step": 18415 }, { "epoch": 2.3427044905228342, "ewc_loss": 0.07030820846557617, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003466367197688669, "grad_norm": 8.260038375854492, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8750705122947693, "num_tokens": 702735351.0, "step": 18416 }, { "epoch": 2.3428317008014248, "ewc_loss": 0.0700482726097107, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034403742756694555, "grad_norm": 8.201970100402832, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8715757131576538, "num_tokens": 702773406.0, "step": 18417 }, { "epoch": 2.3429589110800153, "ewc_loss": 0.07037536799907684, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034730840707197785, "grad_norm": 8.33779239654541, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8722252249717712, "num_tokens": 702809939.0, "step": 18418 }, { "epoch": 2.343086121358606, "ewc_loss": 0.0700719952583313, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034427462378516793, "grad_norm": 8.200767517089844, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8632326126098633, "num_tokens": 702851120.0, "step": 18419 }, { "epoch": 2.3432133316371964, "ewc_loss": 0.07027630507946014, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034631771268323064, "grad_norm": 8.295130729675293, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8729296326637268, "num_tokens": 702889492.0, "step": 18420 }, { "epoch": 2.343340541915787, "ewc_loss": 0.0700831264257431, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034438600414432585, "grad_norm": 8.238387107849121, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8588777780532837, "num_tokens": 702930486.0, "step": 18421 }, { "epoch": 2.3434677521943774, "ewc_loss": 0.07024520635604858, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034600670915097, "grad_norm": 8.283987045288086, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8569575548171997, "num_tokens": 702969866.0, "step": 18422 }, { "epoch": 2.343594962472968, "ewc_loss": 0.06988920271396637, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034244669950567186, "grad_norm": 8.189300537109375, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8556278944015503, "num_tokens": 703013703.0, "step": 18423 }, { "epoch": 2.3437221727515585, "ewc_loss": 0.07036994397640228, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003472541575320065, "grad_norm": 8.350692749023438, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8562209606170654, "num_tokens": 703054849.0, "step": 18424 }, { "epoch": 2.343849383030149, "ewc_loss": 0.06999276578426361, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034348235931247473, "grad_norm": 8.249541282653809, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8601652979850769, "num_tokens": 703088250.0, "step": 18425 }, { "epoch": 2.3439765933087395, "ewc_loss": 0.07036843150854111, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034723899443633854, "grad_norm": 8.341516494750977, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8608112931251526, "num_tokens": 703124072.0, "step": 18426 }, { "epoch": 2.3441038035873296, "ewc_loss": 0.06992454826831818, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003428001655265689, "grad_norm": 8.23552417755127, "learning_rate": 1e-06, "loss": 0.5194, "mean_token_accuracy": 0.8464689254760742, "num_tokens": 703158869.0, "step": 18427 }, { "epoch": 2.3442310138659206, "ewc_loss": 0.07031874358654022, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034674210473895073, "grad_norm": 8.260087013244629, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8677348494529724, "num_tokens": 703193342.0, "step": 18428 }, { "epoch": 2.3443582241445107, "ewc_loss": 0.07004839181900024, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034403856261633337, "grad_norm": 8.232697486877441, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8769481182098389, "num_tokens": 703229899.0, "step": 18429 }, { "epoch": 2.344485434423101, "ewc_loss": 0.07010073959827423, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003445620823185891, "grad_norm": 8.231197357177734, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8628859519958496, "num_tokens": 703266735.0, "step": 18430 }, { "epoch": 2.3446126447016917, "ewc_loss": 0.07008647173643112, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003444194153416902, "grad_norm": 8.159791946411133, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8612062931060791, "num_tokens": 703307515.0, "step": 18431 }, { "epoch": 2.3447398549802823, "ewc_loss": 0.07035990059375763, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003471537202131003, "grad_norm": 8.303153038024902, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8745661973953247, "num_tokens": 703341237.0, "step": 18432 }, { "epoch": 2.344867065258873, "ewc_loss": 0.07006548345088959, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003442095185164362, "grad_norm": 8.19405460357666, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8754674196243286, "num_tokens": 703378900.0, "step": 18433 }, { "epoch": 2.3449942755374633, "ewc_loss": 0.07041677087545395, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003477224090602249, "grad_norm": 8.328198432922363, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8738674521446228, "num_tokens": 703415260.0, "step": 18434 }, { "epoch": 2.345121485816054, "ewc_loss": 0.07002957165241241, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003438503772486001, "grad_norm": 8.218318939208984, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8713635802268982, "num_tokens": 703452399.0, "step": 18435 }, { "epoch": 2.3452486960946444, "ewc_loss": 0.07056914269924164, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003468047361820936, "grad_norm": 12.220502853393555, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8660971522331238, "num_tokens": 703492700.0, "step": 18436 }, { "epoch": 2.345375906373235, "ewc_loss": 0.07424803078174591, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003860350407194346, "grad_norm": 8.599486351013184, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8759493827819824, "num_tokens": 703530805.0, "step": 18437 }, { "epoch": 2.3455031166518254, "ewc_loss": 0.07164384424686432, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003599930787459016, "grad_norm": 8.596833229064941, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8738431930541992, "num_tokens": 703571894.0, "step": 18438 }, { "epoch": 2.345630326930416, "ewc_loss": 0.07053212821483612, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034887593938037753, "grad_norm": 8.430623054504395, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.871304988861084, "num_tokens": 703612583.0, "step": 18439 }, { "epoch": 2.3457575372090065, "ewc_loss": 0.07175545394420624, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003611092106439173, "grad_norm": 8.548887252807617, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.858411967754364, "num_tokens": 703649100.0, "step": 18440 }, { "epoch": 2.345884747487597, "ewc_loss": 0.0703914612531662, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034746932215057313, "grad_norm": 8.304963111877441, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8800228834152222, "num_tokens": 703680425.0, "step": 18441 }, { "epoch": 2.3460119577661875, "ewc_loss": 0.07097191363573074, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035327381920069456, "grad_norm": 8.401551246643066, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8714882135391235, "num_tokens": 703721819.0, "step": 18442 }, { "epoch": 2.346139168044778, "ewc_loss": 0.07050499320030212, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034860463347285986, "grad_norm": 8.357467651367188, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8618482351303101, "num_tokens": 703755432.0, "step": 18443 }, { "epoch": 2.3462663783233686, "ewc_loss": 0.07070966064929962, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003506513312458992, "grad_norm": 8.401084899902344, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8645476698875427, "num_tokens": 703795903.0, "step": 18444 }, { "epoch": 2.346393588601959, "ewc_loss": 0.07030308246612549, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003465855261310935, "grad_norm": 8.262231826782227, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8771763443946838, "num_tokens": 703836074.0, "step": 18445 }, { "epoch": 2.3465207988805497, "ewc_loss": 0.07053340971469879, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034888883237726986, "grad_norm": 8.387052536010742, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.870617151260376, "num_tokens": 703870609.0, "step": 18446 }, { "epoch": 2.34664800915914, "ewc_loss": 0.07025146484375, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034606936969794333, "grad_norm": 8.294978141784668, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8697981238365173, "num_tokens": 703909034.0, "step": 18447 }, { "epoch": 2.3467752194377307, "ewc_loss": 0.07041370123624802, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034769170451909304, "grad_norm": 8.34536361694336, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8578059077262878, "num_tokens": 703946976.0, "step": 18448 }, { "epoch": 2.3469024297163212, "ewc_loss": 0.07011623680591583, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034471703111194074, "grad_norm": 8.276284217834473, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8690223097801208, "num_tokens": 703986379.0, "step": 18449 }, { "epoch": 2.3470296399949118, "ewc_loss": 0.07051307708024979, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003486854548100382, "grad_norm": 8.357234001159668, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8831803798675537, "num_tokens": 704019461.0, "step": 18450 }, { "epoch": 2.3471568502735023, "ewc_loss": 0.06999099254608154, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034346466418355703, "grad_norm": 8.247893333435059, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.853865385055542, "num_tokens": 704053411.0, "step": 18451 }, { "epoch": 2.3472840605520924, "ewc_loss": 0.0704997256398201, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003485519264359027, "grad_norm": 8.37990665435791, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.869404137134552, "num_tokens": 704092633.0, "step": 18452 }, { "epoch": 2.3474112708306833, "ewc_loss": 0.07001331448554993, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003436878032516688, "grad_norm": 8.202705383300781, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8685629367828369, "num_tokens": 704126955.0, "step": 18453 }, { "epoch": 2.3475384811092734, "ewc_loss": 0.07035180926322937, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003470728115644306, "grad_norm": 8.270054817199707, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8815340399742126, "num_tokens": 704162222.0, "step": 18454 }, { "epoch": 2.347665691387864, "ewc_loss": 0.07005170732736588, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003440717700868845, "grad_norm": 8.259085655212402, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8652690052986145, "num_tokens": 704198949.0, "step": 18455 }, { "epoch": 2.3477929016664545, "ewc_loss": 0.07019008696079254, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034545554080978036, "grad_norm": 8.279483795166016, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8601878881454468, "num_tokens": 704235327.0, "step": 18456 }, { "epoch": 2.347920111945045, "ewc_loss": 0.07012445479631424, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034479922032915056, "grad_norm": 8.29569149017334, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8721052408218384, "num_tokens": 704272783.0, "step": 18457 }, { "epoch": 2.3480473222236355, "ewc_loss": 0.07018910348415375, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003454457619227469, "grad_norm": 8.278145790100098, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8474698662757874, "num_tokens": 704314396.0, "step": 18458 }, { "epoch": 2.348174532502226, "ewc_loss": 0.07011134177446365, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034466810757294297, "grad_norm": 8.297504425048828, "learning_rate": 1e-06, "loss": 0.4097, "mean_token_accuracy": 0.8816673159599304, "num_tokens": 704346903.0, "step": 18459 }, { "epoch": 2.3483017427808166, "ewc_loss": 0.07010364532470703, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003445911861490458, "grad_norm": 8.236161231994629, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8731637001037598, "num_tokens": 704380456.0, "step": 18460 }, { "epoch": 2.348428953059407, "ewc_loss": 0.07023467123508453, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000345901440596208, "grad_norm": 8.300989151000977, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8777029514312744, "num_tokens": 704416403.0, "step": 18461 }, { "epoch": 2.3485561633379977, "ewc_loss": 0.07015043497085571, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003450590302236378, "grad_norm": 8.226579666137695, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.866989016532898, "num_tokens": 704460431.0, "step": 18462 }, { "epoch": 2.348683373616588, "ewc_loss": 0.07038988173007965, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003474534605629742, "grad_norm": 8.292325019836426, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8738914132118225, "num_tokens": 704497872.0, "step": 18463 }, { "epoch": 2.3488105838951787, "ewc_loss": 0.0700879842042923, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034443457843735814, "grad_norm": 8.230673789978027, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.867379903793335, "num_tokens": 704538180.0, "step": 18464 }, { "epoch": 2.3489377941737692, "ewc_loss": 0.07039451599121094, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034749979386106133, "grad_norm": 8.275044441223145, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.878391444683075, "num_tokens": 704578342.0, "step": 18465 }, { "epoch": 2.3490650044523598, "ewc_loss": 0.0701671615242958, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003452263190411031, "grad_norm": 8.230875968933105, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8761716485023499, "num_tokens": 704611792.0, "step": 18466 }, { "epoch": 2.3491922147309503, "ewc_loss": 0.07042652368545532, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003478198777884245, "grad_norm": 8.292312622070312, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8698251247406006, "num_tokens": 704649898.0, "step": 18467 }, { "epoch": 2.349319425009541, "ewc_loss": 0.07023387402296066, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003458934370428324, "grad_norm": 8.221329689025879, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.874478280544281, "num_tokens": 704690632.0, "step": 18468 }, { "epoch": 2.3494466352881314, "ewc_loss": 0.07035349309444427, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034708966268226504, "grad_norm": 8.335138320922852, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8554770946502686, "num_tokens": 704722955.0, "step": 18469 }, { "epoch": 2.349573845566722, "ewc_loss": 0.07013139128684998, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034486863296478987, "grad_norm": 8.245272636413574, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.866271436214447, "num_tokens": 704762678.0, "step": 18470 }, { "epoch": 2.3497010558453124, "ewc_loss": 0.07022972404956818, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034585196408443153, "grad_norm": 8.19404411315918, "learning_rate": 1e-06, "loss": 0.3824, "mean_token_accuracy": 0.8920924663543701, "num_tokens": 704805440.0, "step": 18471 }, { "epoch": 2.349828266123903, "ewc_loss": 0.07029867172241211, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003465414047241211, "grad_norm": 8.260403633117676, "learning_rate": 1e-06, "loss": 0.5259, "mean_token_accuracy": 0.8475033044815063, "num_tokens": 704848153.0, "step": 18472 }, { "epoch": 2.3499554764024935, "ewc_loss": 0.07023951411247253, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003459497820585966, "grad_norm": 8.201102256774902, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8635286688804626, "num_tokens": 704886739.0, "step": 18473 }, { "epoch": 2.350082686681084, "ewc_loss": 0.07040750235319138, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003476297133602202, "grad_norm": 8.23826789855957, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8649303913116455, "num_tokens": 704926145.0, "step": 18474 }, { "epoch": 2.3502098969596745, "ewc_loss": 0.07039003074169159, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003474550321698189, "grad_norm": 8.285417556762695, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8490826487541199, "num_tokens": 704964135.0, "step": 18475 }, { "epoch": 2.350337107238265, "ewc_loss": 0.0703946202993393, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003475008998066187, "grad_norm": 8.348424911499023, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.8534221053123474, "num_tokens": 705002171.0, "step": 18476 }, { "epoch": 2.350464317516855, "ewc_loss": 0.07030948996543884, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034664961276575923, "grad_norm": 8.195170402526855, "learning_rate": 1e-06, "loss": 0.3953, "mean_token_accuracy": 0.885364294052124, "num_tokens": 705040660.0, "step": 18477 }, { "epoch": 2.350591527795446, "ewc_loss": 0.07065428793430328, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035009757266379893, "grad_norm": 8.340856552124023, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8724284768104553, "num_tokens": 705073655.0, "step": 18478 }, { "epoch": 2.350718738074036, "ewc_loss": 0.07017750293016434, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003453297249507159, "grad_norm": 8.208189010620117, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8625911474227905, "num_tokens": 705115497.0, "step": 18479 }, { "epoch": 2.3508459483526267, "ewc_loss": 0.07061152160167694, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034966986277140677, "grad_norm": 8.30553913116455, "learning_rate": 1e-06, "loss": 0.4078, "mean_token_accuracy": 0.8827855587005615, "num_tokens": 705152642.0, "step": 18480 }, { "epoch": 2.3509731586312173, "ewc_loss": 0.07021457701921463, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003457004495430738, "grad_norm": 8.230510711669922, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8501368761062622, "num_tokens": 705187282.0, "step": 18481 }, { "epoch": 2.351100368909808, "ewc_loss": 0.07058340311050415, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003493887197691947, "grad_norm": 12.203962326049805, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8700536489486694, "num_tokens": 705225635.0, "step": 18482 }, { "epoch": 2.3512275791883983, "ewc_loss": 0.07432907819747925, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003868454950861633, "grad_norm": 8.577432632446289, "learning_rate": 1e-06, "loss": 0.424, "mean_token_accuracy": 0.8756091594696045, "num_tokens": 705260376.0, "step": 18483 }, { "epoch": 2.351354789466989, "ewc_loss": 0.07229359447956085, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036649059620685875, "grad_norm": 8.672313690185547, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8648606538772583, "num_tokens": 705298799.0, "step": 18484 }, { "epoch": 2.3514819997455794, "ewc_loss": 0.07056950032711029, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034924966166727245, "grad_norm": 8.343932151794434, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8478680849075317, "num_tokens": 705332570.0, "step": 18485 }, { "epoch": 2.35160921002417, "ewc_loss": 0.07255128026008606, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036906750756315887, "grad_norm": 8.63211441040039, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8676817417144775, "num_tokens": 705371214.0, "step": 18486 }, { "epoch": 2.3517364203027604, "ewc_loss": 0.07063867151737213, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034994143061339855, "grad_norm": 8.342423439025879, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8630275130271912, "num_tokens": 705405051.0, "step": 18487 }, { "epoch": 2.351863630581351, "ewc_loss": 0.07148571312427521, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035841186763718724, "grad_norm": 8.49560546875, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8651961088180542, "num_tokens": 705442460.0, "step": 18488 }, { "epoch": 2.3519908408599415, "ewc_loss": 0.07073713839054108, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003509261005092412, "grad_norm": 8.293408393859863, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8652347922325134, "num_tokens": 705482929.0, "step": 18489 }, { "epoch": 2.352118051138532, "ewc_loss": 0.0713014081120491, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003565687802620232, "grad_norm": 8.49661636352539, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8598023653030396, "num_tokens": 705525313.0, "step": 18490 }, { "epoch": 2.3522452614171225, "ewc_loss": 0.07047872990369797, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034834197140298784, "grad_norm": 8.307392120361328, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8621081113815308, "num_tokens": 705563730.0, "step": 18491 }, { "epoch": 2.352372471695713, "ewc_loss": 0.07092586904764175, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035281339660286903, "grad_norm": 8.397485733032227, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8697470426559448, "num_tokens": 705602459.0, "step": 18492 }, { "epoch": 2.3524996819743036, "ewc_loss": 0.07054276764392853, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034898240119218826, "grad_norm": 8.301714897155762, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8627859354019165, "num_tokens": 705636223.0, "step": 18493 }, { "epoch": 2.352626892252894, "ewc_loss": 0.07072766125202179, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035083130933344364, "grad_norm": 8.404441833496094, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8637423515319824, "num_tokens": 705676178.0, "step": 18494 }, { "epoch": 2.3527541025314846, "ewc_loss": 0.07038236409425735, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034737831447273493, "grad_norm": 8.312040328979492, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8728798031806946, "num_tokens": 705716687.0, "step": 18495 }, { "epoch": 2.352881312810075, "ewc_loss": 0.07068312168121338, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035038593341596425, "grad_norm": 8.348453521728516, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8742453455924988, "num_tokens": 705759163.0, "step": 18496 }, { "epoch": 2.3530085230886657, "ewc_loss": 0.07039792090654373, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003475339035503566, "grad_norm": 8.296433448791504, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8684426546096802, "num_tokens": 705795982.0, "step": 18497 }, { "epoch": 2.3531357333672562, "ewc_loss": 0.07041765749454498, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003477312857285142, "grad_norm": 8.33837604522705, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8696651458740234, "num_tokens": 705835601.0, "step": 18498 }, { "epoch": 2.3532629436458468, "ewc_loss": 0.07041162252426147, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000347670866176486, "grad_norm": 8.312186241149902, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.863048255443573, "num_tokens": 705876858.0, "step": 18499 }, { "epoch": 2.353390153924437, "ewc_loss": 0.07030286639928818, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034658334334380925, "grad_norm": 8.272920608520508, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8575749397277832, "num_tokens": 705921944.0, "step": 18500 }, { "epoch": 2.353517364203028, "ewc_loss": 0.07033523917198181, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003469070652499795, "grad_norm": 8.275776863098145, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8672451972961426, "num_tokens": 705967999.0, "step": 18501 }, { "epoch": 2.353644574481618, "ewc_loss": 0.07041983306407928, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003477529971860349, "grad_norm": 8.338232040405273, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8590511679649353, "num_tokens": 706003370.0, "step": 18502 }, { "epoch": 2.353771784760209, "ewc_loss": 0.07030699402093887, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034662464167922735, "grad_norm": 8.3281831741333, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8540292382240295, "num_tokens": 706036551.0, "step": 18503 }, { "epoch": 2.353898995038799, "ewc_loss": 0.07038286328315735, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034738334943540394, "grad_norm": 8.323929786682129, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8835439085960388, "num_tokens": 706075965.0, "step": 18504 }, { "epoch": 2.3540262053173895, "ewc_loss": 0.07031363248825073, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034669096930883825, "grad_norm": 8.269872665405273, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.8543016910552979, "num_tokens": 706117943.0, "step": 18505 }, { "epoch": 2.35415341559598, "ewc_loss": 0.0703570693731308, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000347125343978405, "grad_norm": 8.329251289367676, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8699300289154053, "num_tokens": 706157183.0, "step": 18506 }, { "epoch": 2.3542806258745705, "ewc_loss": 0.07019048929214478, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034545958624221385, "grad_norm": 8.275206565856934, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8740962743759155, "num_tokens": 706196878.0, "step": 18507 }, { "epoch": 2.354407836153161, "ewc_loss": 0.07042926549911499, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003478473227005452, "grad_norm": 8.386096000671387, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8541103005409241, "num_tokens": 706231102.0, "step": 18508 }, { "epoch": 2.3545350464317516, "ewc_loss": 0.07010411471128464, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003445958427619189, "grad_norm": 8.216938018798828, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8736104965209961, "num_tokens": 706273372.0, "step": 18509 }, { "epoch": 2.354662256710342, "ewc_loss": 0.07058531045913696, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003494078409858048, "grad_norm": 8.364582061767578, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8610673546791077, "num_tokens": 706308106.0, "step": 18510 }, { "epoch": 2.3547894669889327, "ewc_loss": 0.07016351819038391, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003451898228377104, "grad_norm": 8.240530967712402, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8728893995285034, "num_tokens": 706344267.0, "step": 18511 }, { "epoch": 2.354916677267523, "ewc_loss": 0.07045747339725494, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034812939702533185, "grad_norm": 8.349994659423828, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8848705291748047, "num_tokens": 706385019.0, "step": 18512 }, { "epoch": 2.3550438875461137, "ewc_loss": 0.07014515995979309, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003450063231866807, "grad_norm": 8.242788314819336, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8648880124092102, "num_tokens": 706424913.0, "step": 18513 }, { "epoch": 2.3551710978247042, "ewc_loss": 0.07041791081428528, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000347733759554103, "grad_norm": 8.293127059936523, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8665938973426819, "num_tokens": 706466064.0, "step": 18514 }, { "epoch": 2.3552983081032948, "ewc_loss": 0.07029028981924057, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003464575856924057, "grad_norm": 8.270063400268555, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8715603351593018, "num_tokens": 706504572.0, "step": 18515 }, { "epoch": 2.3554255183818853, "ewc_loss": 0.07045784592628479, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003481331514194608, "grad_norm": 8.264214515686035, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8591122627258301, "num_tokens": 706543901.0, "step": 18516 }, { "epoch": 2.355552728660476, "ewc_loss": 0.07044577598571777, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003480123996268958, "grad_norm": 8.294441223144531, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8587993383407593, "num_tokens": 706579095.0, "step": 18517 }, { "epoch": 2.3556799389390664, "ewc_loss": 0.07046281546354294, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003481828607618809, "grad_norm": 8.270750045776367, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8584822416305542, "num_tokens": 706614078.0, "step": 18518 }, { "epoch": 2.355807149217657, "ewc_loss": 0.07051205635070801, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003486752393655479, "grad_norm": 8.27126693725586, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8733493685722351, "num_tokens": 706652596.0, "step": 18519 }, { "epoch": 2.3559343594962474, "ewc_loss": 0.07053188979625702, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034887358197011054, "grad_norm": 8.339540481567383, "learning_rate": 1e-06, "loss": 0.3876, "mean_token_accuracy": 0.8889883756637573, "num_tokens": 706691297.0, "step": 18520 }, { "epoch": 2.356061569774838, "ewc_loss": 0.07029153406620026, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003464700421318412, "grad_norm": 8.30338191986084, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8541839718818665, "num_tokens": 706727344.0, "step": 18521 }, { "epoch": 2.3561887800534285, "ewc_loss": 0.07051008194684982, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034865550696849823, "grad_norm": 8.351927757263184, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8701186776161194, "num_tokens": 706770411.0, "step": 18522 }, { "epoch": 2.356315990332019, "ewc_loss": 0.070219025015831, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003457449784036726, "grad_norm": 8.208270072937012, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8701635599136353, "num_tokens": 706816215.0, "step": 18523 }, { "epoch": 2.3564432006106095, "ewc_loss": 0.07055879384279251, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003491426177788526, "grad_norm": 8.312856674194336, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8609706163406372, "num_tokens": 706857400.0, "step": 18524 }, { "epoch": 2.3565704108891996, "ewc_loss": 0.07035361230373383, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034709079773165286, "grad_norm": 8.313010215759277, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8577536344528198, "num_tokens": 706892715.0, "step": 18525 }, { "epoch": 2.3566976211677906, "ewc_loss": 0.07041402906179428, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034769499325193465, "grad_norm": 8.250205993652344, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8661717772483826, "num_tokens": 706934373.0, "step": 18526 }, { "epoch": 2.3568248314463807, "ewc_loss": 0.07051193714141846, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034867404610849917, "grad_norm": 8.316864013671875, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8670860528945923, "num_tokens": 706976885.0, "step": 18527 }, { "epoch": 2.356952041724971, "ewc_loss": 0.07044102251529694, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003479649603832513, "grad_norm": 8.282821655273438, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8724093437194824, "num_tokens": 707016957.0, "step": 18528 }, { "epoch": 2.3570792520035617, "ewc_loss": 0.0705208107829094, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003487628127913922, "grad_norm": 8.29511833190918, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8766922950744629, "num_tokens": 707053687.0, "step": 18529 }, { "epoch": 2.3572064622821522, "ewc_loss": 0.07041964679956436, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034775116364471614, "grad_norm": 8.243120193481445, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8597877025604248, "num_tokens": 707087327.0, "step": 18530 }, { "epoch": 2.3573336725607428, "ewc_loss": 0.07061739265918732, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003497286234050989, "grad_norm": 8.277968406677246, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8786907196044922, "num_tokens": 707124479.0, "step": 18531 }, { "epoch": 2.3574608828393333, "ewc_loss": 0.07050979137420654, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003486525674816221, "grad_norm": 8.248579978942871, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8692418336868286, "num_tokens": 707162452.0, "step": 18532 }, { "epoch": 2.357588093117924, "ewc_loss": 0.07062327116727829, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003497874131426215, "grad_norm": 8.345185279846191, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8746344447135925, "num_tokens": 707196765.0, "step": 18533 }, { "epoch": 2.3577153033965144, "ewc_loss": 0.07036007940769196, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034715543733909726, "grad_norm": 8.242598533630371, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8722249865531921, "num_tokens": 707230151.0, "step": 18534 }, { "epoch": 2.357842513675105, "ewc_loss": 0.07068197429180145, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003503743791952729, "grad_norm": 8.350958824157715, "learning_rate": 1e-06, "loss": 0.5174, "mean_token_accuracy": 0.8488441705703735, "num_tokens": 707264373.0, "step": 18535 }, { "epoch": 2.3579697239536954, "ewc_loss": 0.07026693969964981, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003462240856606513, "grad_norm": 8.20923137664795, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8606281876564026, "num_tokens": 707304844.0, "step": 18536 }, { "epoch": 2.358096934232286, "ewc_loss": 0.07068584859371185, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003504132037051022, "grad_norm": 8.352471351623535, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8685624599456787, "num_tokens": 707347081.0, "step": 18537 }, { "epoch": 2.3582241445108765, "ewc_loss": 0.07022587954998016, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003458134306129068, "grad_norm": 8.222000122070312, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8605373501777649, "num_tokens": 707388243.0, "step": 18538 }, { "epoch": 2.358351354789467, "ewc_loss": 0.07066065818071365, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003501612809486687, "grad_norm": 8.37194538116455, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8533351421356201, "num_tokens": 707427109.0, "step": 18539 }, { "epoch": 2.3584785650680575, "ewc_loss": 0.0701984167098999, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003455388068687171, "grad_norm": 8.185673713684082, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8682083487510681, "num_tokens": 707462502.0, "step": 18540 }, { "epoch": 2.358605775346648, "ewc_loss": 0.07067383080720901, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003502930048853159, "grad_norm": 8.361700057983398, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8606066107749939, "num_tokens": 707502861.0, "step": 18541 }, { "epoch": 2.3587329856252386, "ewc_loss": 0.07021647691726685, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034571942524053156, "grad_norm": 8.214714050292969, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8463596105575562, "num_tokens": 707535868.0, "step": 18542 }, { "epoch": 2.358860195903829, "ewc_loss": 0.07072609663009644, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003508156805764884, "grad_norm": 8.330011367797852, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8700705766677856, "num_tokens": 707566128.0, "step": 18543 }, { "epoch": 2.3589874061824196, "ewc_loss": 0.0703820139169693, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003473748220130801, "grad_norm": 8.277044296264648, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8732991814613342, "num_tokens": 707605113.0, "step": 18544 }, { "epoch": 2.35911461646101, "ewc_loss": 0.07054265588521957, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034898123703897, "grad_norm": 8.270662307739258, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8635410666465759, "num_tokens": 707647968.0, "step": 18545 }, { "epoch": 2.3592418267396007, "ewc_loss": 0.07042698562145233, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003478245052974671, "grad_norm": 8.25558090209961, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8744189143180847, "num_tokens": 707682183.0, "step": 18546 }, { "epoch": 2.3593690370181912, "ewc_loss": 0.07054705917835236, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003490252420306206, "grad_norm": 8.298238754272461, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8738498687744141, "num_tokens": 707720938.0, "step": 18547 }, { "epoch": 2.3594962472967818, "ewc_loss": 0.07044832408428192, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003480379527900368, "grad_norm": 8.215106010437012, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8550577163696289, "num_tokens": 707762624.0, "step": 18548 }, { "epoch": 2.3596234575753723, "ewc_loss": 0.07061344385147095, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034968912950716913, "grad_norm": 8.269780158996582, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8587758541107178, "num_tokens": 707798406.0, "step": 18549 }, { "epoch": 2.3597506678539624, "ewc_loss": 0.07053640484809875, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034891869290731847, "grad_norm": 8.268125534057617, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8666570782661438, "num_tokens": 707835935.0, "step": 18550 }, { "epoch": 2.3598778781325533, "ewc_loss": 0.07061424851417542, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003496972203720361, "grad_norm": 8.290817260742188, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8775930404663086, "num_tokens": 707872971.0, "step": 18551 }, { "epoch": 2.3600050884111434, "ewc_loss": 0.07046863436698914, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034824098111130297, "grad_norm": 8.267059326171875, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8766149282455444, "num_tokens": 707909798.0, "step": 18552 }, { "epoch": 2.360132298689734, "ewc_loss": 0.07067060470581055, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035026075784116983, "grad_norm": 8.287646293640137, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8695269823074341, "num_tokens": 707951130.0, "step": 18553 }, { "epoch": 2.3602595089683245, "ewc_loss": 0.07059787213802338, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034953339491039515, "grad_norm": 8.24656867980957, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8526639938354492, "num_tokens": 707989502.0, "step": 18554 }, { "epoch": 2.360386719246915, "ewc_loss": 0.07062043994665146, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003497590951155871, "grad_norm": 8.298118591308594, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8658703565597534, "num_tokens": 708023521.0, "step": 18555 }, { "epoch": 2.3605139295255055, "ewc_loss": 0.07053574174642563, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034891211544163525, "grad_norm": 8.287235260009766, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8718874454498291, "num_tokens": 708057594.0, "step": 18556 }, { "epoch": 2.360641139804096, "ewc_loss": 0.07056771963834763, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003492318792268634, "grad_norm": 8.284126281738281, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.862595796585083, "num_tokens": 708101393.0, "step": 18557 }, { "epoch": 2.3607683500826866, "ewc_loss": 0.07056409865617752, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003491956740617752, "grad_norm": 8.271077156066895, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8764439225196838, "num_tokens": 708137141.0, "step": 18558 }, { "epoch": 2.360895560361277, "ewc_loss": 0.0706573873758316, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035012856824323535, "grad_norm": 8.326189994812012, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8714033365249634, "num_tokens": 708173497.0, "step": 18559 }, { "epoch": 2.3610227706398677, "ewc_loss": 0.07045196741819382, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034807436168193817, "grad_norm": 8.266080856323242, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8661129474639893, "num_tokens": 708209412.0, "step": 18560 }, { "epoch": 2.361149980918458, "ewc_loss": 0.07074365019798279, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035099120577797294, "grad_norm": 8.385428428649902, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8730512857437134, "num_tokens": 708239675.0, "step": 18561 }, { "epoch": 2.3612771911970487, "ewc_loss": 0.07037131488323212, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003472678072284907, "grad_norm": 8.270444869995117, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8611237406730652, "num_tokens": 708274918.0, "step": 18562 }, { "epoch": 2.3614044014756392, "ewc_loss": 0.07066585123538971, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003502131730783731, "grad_norm": 8.356056213378906, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8522822856903076, "num_tokens": 708313645.0, "step": 18563 }, { "epoch": 2.3615316117542298, "ewc_loss": 0.07031598687171936, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034671451430767775, "grad_norm": 8.223000526428223, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8679014444351196, "num_tokens": 708357985.0, "step": 18564 }, { "epoch": 2.3616588220328203, "ewc_loss": 0.07063618302345276, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034991648863069713, "grad_norm": 8.33864974975586, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8443014621734619, "num_tokens": 708393606.0, "step": 18565 }, { "epoch": 2.361786032311411, "ewc_loss": 0.07023460417985916, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034590071300044656, "grad_norm": 8.23311710357666, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8729573488235474, "num_tokens": 708428941.0, "step": 18566 }, { "epoch": 2.3619132425900013, "ewc_loss": 0.07067236304283142, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035027836565859616, "grad_norm": 8.360894203186035, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8706753849983215, "num_tokens": 708467847.0, "step": 18567 }, { "epoch": 2.362040452868592, "ewc_loss": 0.07017425447702408, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034529721597209573, "grad_norm": 8.196152687072754, "learning_rate": 1e-06, "loss": 0.4014, "mean_token_accuracy": 0.8825246095657349, "num_tokens": 708503212.0, "step": 18568 }, { "epoch": 2.3621676631471824, "ewc_loss": 0.0707273781299591, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035082848626188934, "grad_norm": 8.302532196044922, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8662748336791992, "num_tokens": 708542897.0, "step": 18569 }, { "epoch": 2.362294873425773, "ewc_loss": 0.07036510109901428, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003472056705504656, "grad_norm": 8.330212593078613, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8538683652877808, "num_tokens": 708574725.0, "step": 18570 }, { "epoch": 2.3624220837043635, "ewc_loss": 0.07040905952453613, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003476452548056841, "grad_norm": 8.214347839355469, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8765758872032166, "num_tokens": 708613557.0, "step": 18571 }, { "epoch": 2.362549293982954, "ewc_loss": 0.07077734172344208, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035132805351167917, "grad_norm": 8.371147155761719, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8585110902786255, "num_tokens": 708651872.0, "step": 18572 }, { "epoch": 2.3626765042615445, "ewc_loss": 0.0702752023935318, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034630674053914845, "grad_norm": 8.265265464782715, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8531599640846252, "num_tokens": 708688654.0, "step": 18573 }, { "epoch": 2.362803714540135, "ewc_loss": 0.07074326276779175, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035098736407235265, "grad_norm": 8.32823371887207, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.871868908405304, "num_tokens": 708723285.0, "step": 18574 }, { "epoch": 2.362930924818725, "ewc_loss": 0.07042011618614197, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003477558493614197, "grad_norm": 8.297943115234375, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8752509355545044, "num_tokens": 708760617.0, "step": 18575 }, { "epoch": 2.363058135097316, "ewc_loss": 0.07055814564228058, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003491361567284912, "grad_norm": 8.405781745910645, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8710824847221375, "num_tokens": 708790792.0, "step": 18576 }, { "epoch": 2.363185345375906, "ewc_loss": 0.0702923983335495, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003464787150733173, "grad_norm": 8.253066062927246, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8573799729347229, "num_tokens": 708829935.0, "step": 18577 }, { "epoch": 2.3633125556544967, "ewc_loss": 0.07056339085102081, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003491886309348047, "grad_norm": 8.274826049804688, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8596935272216797, "num_tokens": 708870578.0, "step": 18578 }, { "epoch": 2.3634397659330872, "ewc_loss": 0.07039962708950043, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003475509292911738, "grad_norm": 8.291651725769043, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8653870820999146, "num_tokens": 708907044.0, "step": 18579 }, { "epoch": 2.3635669762116778, "ewc_loss": 0.0703604444861412, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034715913352556527, "grad_norm": 8.297374725341797, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8485720157623291, "num_tokens": 708947443.0, "step": 18580 }, { "epoch": 2.3636941864902683, "ewc_loss": 0.07050221413373947, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003485768102109432, "grad_norm": 8.321585655212402, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8674203157424927, "num_tokens": 708983174.0, "step": 18581 }, { "epoch": 2.363821396768859, "ewc_loss": 0.07029754668474197, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003465301706455648, "grad_norm": 8.262889862060547, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8697941303253174, "num_tokens": 709021950.0, "step": 18582 }, { "epoch": 2.3639486070474494, "ewc_loss": 0.07039984315633774, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034755311207845807, "grad_norm": 8.253072738647461, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8713895082473755, "num_tokens": 709057236.0, "step": 18583 }, { "epoch": 2.36407581732604, "ewc_loss": 0.07040326297283173, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003475873381830752, "grad_norm": 8.311164855957031, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8687913417816162, "num_tokens": 709093376.0, "step": 18584 }, { "epoch": 2.3642030276046304, "ewc_loss": 0.07026448845863342, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003461995511315763, "grad_norm": 8.301883697509766, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.860512375831604, "num_tokens": 709129502.0, "step": 18585 }, { "epoch": 2.364330237883221, "ewc_loss": 0.07031424343585968, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034669716842472553, "grad_norm": 8.293500900268555, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8634345531463623, "num_tokens": 709162538.0, "step": 18586 }, { "epoch": 2.3644574481618115, "ewc_loss": 0.07026425749063492, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003461972810328007, "grad_norm": 8.28317642211914, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8551862239837646, "num_tokens": 709201690.0, "step": 18587 }, { "epoch": 2.364584658440402, "ewc_loss": 0.07032281160354614, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003467827627900988, "grad_norm": 8.32580852508545, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8587632179260254, "num_tokens": 709240593.0, "step": 18588 }, { "epoch": 2.3647118687189925, "ewc_loss": 0.07026031613349915, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003461578453425318, "grad_norm": 8.278961181640625, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8772724270820618, "num_tokens": 709278192.0, "step": 18589 }, { "epoch": 2.364839078997583, "ewc_loss": 0.07035775482654572, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034713218337856233, "grad_norm": 8.288161277770996, "learning_rate": 1e-06, "loss": 0.3972, "mean_token_accuracy": 0.8834522366523743, "num_tokens": 709311797.0, "step": 18590 }, { "epoch": 2.3649662892761736, "ewc_loss": 0.07021390646696091, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003456937556620687, "grad_norm": 8.276849746704102, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8642532229423523, "num_tokens": 709350379.0, "step": 18591 }, { "epoch": 2.365093499554764, "ewc_loss": 0.07034005224704742, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034695526119321585, "grad_norm": 8.283013343811035, "learning_rate": 1e-06, "loss": 0.383, "mean_token_accuracy": 0.8920270204544067, "num_tokens": 709389030.0, "step": 18592 }, { "epoch": 2.3652207098333546, "ewc_loss": 0.07019861042499542, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003455408150330186, "grad_norm": 8.318197250366211, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8588109016418457, "num_tokens": 709433692.0, "step": 18593 }, { "epoch": 2.365347920111945, "ewc_loss": 0.07015520334243774, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003451067314017564, "grad_norm": 8.239092826843262, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8649729490280151, "num_tokens": 709470773.0, "step": 18594 }, { "epoch": 2.3654751303905357, "ewc_loss": 0.0703151747584343, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003467064234428108, "grad_norm": 8.33422565460205, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8603042960166931, "num_tokens": 709506764.0, "step": 18595 }, { "epoch": 2.3656023406691262, "ewc_loss": 0.07011876255273819, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034474232234060764, "grad_norm": 8.323145866394043, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8625615835189819, "num_tokens": 709546931.0, "step": 18596 }, { "epoch": 2.3657295509477168, "ewc_loss": 0.0703480914235115, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003470356168691069, "grad_norm": 8.32217788696289, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8669764995574951, "num_tokens": 709582842.0, "step": 18597 }, { "epoch": 2.365856761226307, "ewc_loss": 0.07015785574913025, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034513321588747203, "grad_norm": 8.290389060974121, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8664189577102661, "num_tokens": 709621574.0, "step": 18598 }, { "epoch": 2.365983971504898, "ewc_loss": 0.07029063999652863, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003464610781520605, "grad_norm": 8.310001373291016, "learning_rate": 1e-06, "loss": 0.4036, "mean_token_accuracy": 0.8843398690223694, "num_tokens": 709662437.0, "step": 18599 }, { "epoch": 2.366111181783488, "ewc_loss": 0.0701819434762001, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003453741373959929, "grad_norm": 8.190579414367676, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.8796287775039673, "num_tokens": 709701318.0, "step": 18600 }, { "epoch": 2.3662383920620784, "ewc_loss": 0.07041554898023605, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034771018545143306, "grad_norm": 8.336174011230469, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8670313358306885, "num_tokens": 709745156.0, "step": 18601 }, { "epoch": 2.366365602340669, "ewc_loss": 0.0701674148440361, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003452288219705224, "grad_norm": 8.279317855834961, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8612878322601318, "num_tokens": 709785849.0, "step": 18602 }, { "epoch": 2.3664928126192595, "ewc_loss": 0.07036177814006805, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003471724921837449, "grad_norm": 8.323689460754395, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8708837032318115, "num_tokens": 709830038.0, "step": 18603 }, { "epoch": 2.36662002289785, "ewc_loss": 0.07026121020317078, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034616683842614293, "grad_norm": 8.252859115600586, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.863843560218811, "num_tokens": 709871174.0, "step": 18604 }, { "epoch": 2.3667472331764405, "ewc_loss": 0.07037374377250671, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034729213803075254, "grad_norm": 8.331377983093262, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8878281712532043, "num_tokens": 709905921.0, "step": 18605 }, { "epoch": 2.366874443455031, "ewc_loss": 0.07013152539730072, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003448699426371604, "grad_norm": 8.26085376739502, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8821445107460022, "num_tokens": 709939856.0, "step": 18606 }, { "epoch": 2.3670016537336216, "ewc_loss": 0.07041316479444504, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000347686349414289, "grad_norm": 8.2963228225708, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8792810440063477, "num_tokens": 709978805.0, "step": 18607 }, { "epoch": 2.367128864012212, "ewc_loss": 0.07030276209115982, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034658232470974326, "grad_norm": 8.290427207946777, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.880520224571228, "num_tokens": 710015015.0, "step": 18608 }, { "epoch": 2.3672560742908026, "ewc_loss": 0.07031343132257462, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003466889902483672, "grad_norm": 8.295820236206055, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8578253984451294, "num_tokens": 710054887.0, "step": 18609 }, { "epoch": 2.367383284569393, "ewc_loss": 0.07036104798316956, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034716518712230027, "grad_norm": 8.287495613098145, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8661328554153442, "num_tokens": 710097012.0, "step": 18610 }, { "epoch": 2.3675104948479837, "ewc_loss": 0.07039092481136322, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034746393794193864, "grad_norm": 8.386575698852539, "learning_rate": 1e-06, "loss": 0.5093, "mean_token_accuracy": 0.8509616851806641, "num_tokens": 710130912.0, "step": 18611 }, { "epoch": 2.3676377051265742, "ewc_loss": 0.0702960342168808, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034651500754989684, "grad_norm": 8.293452262878418, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.86400306224823, "num_tokens": 710168880.0, "step": 18612 }, { "epoch": 2.3677649154051648, "ewc_loss": 0.07054180651903152, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003489727678243071, "grad_norm": 8.37598991394043, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8760707378387451, "num_tokens": 710205619.0, "step": 18613 }, { "epoch": 2.3678921256837553, "ewc_loss": 0.07024635374546051, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034601823426783085, "grad_norm": 8.297712326049805, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8738447427749634, "num_tokens": 710239475.0, "step": 18614 }, { "epoch": 2.368019335962346, "ewc_loss": 0.07045535743236542, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003481082967482507, "grad_norm": 8.332282066345215, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8692452311515808, "num_tokens": 710276533.0, "step": 18615 }, { "epoch": 2.3681465462409363, "ewc_loss": 0.07026079297065735, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003461626183707267, "grad_norm": 8.301191329956055, "learning_rate": 1e-06, "loss": 0.4085, "mean_token_accuracy": 0.8818261623382568, "num_tokens": 710316731.0, "step": 18616 }, { "epoch": 2.368273756519527, "ewc_loss": 0.07029606401920319, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003465153567958623, "grad_norm": 8.295316696166992, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8610886335372925, "num_tokens": 710352310.0, "step": 18617 }, { "epoch": 2.3684009667981174, "ewc_loss": 0.07044780254364014, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034803268499672413, "grad_norm": 8.34614086151123, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.86253821849823, "num_tokens": 710400203.0, "step": 18618 }, { "epoch": 2.368528177076708, "ewc_loss": 0.07024724781513214, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003460271982476115, "grad_norm": 8.219992637634277, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8728183507919312, "num_tokens": 710433054.0, "step": 18619 }, { "epoch": 2.3686553873552985, "ewc_loss": 0.07066047191619873, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003501594183035195, "grad_norm": 8.35364818572998, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.869918942451477, "num_tokens": 710466879.0, "step": 18620 }, { "epoch": 2.368782597633889, "ewc_loss": 0.07030339539051056, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034658864024095237, "grad_norm": 8.28527545928955, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8604299426078796, "num_tokens": 710502808.0, "step": 18621 }, { "epoch": 2.3689098079124795, "ewc_loss": 0.07062092423439026, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003497638972476125, "grad_norm": 8.361870765686035, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8557340502738953, "num_tokens": 710546190.0, "step": 18622 }, { "epoch": 2.3690370181910696, "ewc_loss": 0.0702822357416153, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003463770844973624, "grad_norm": 8.233484268188477, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8684523105621338, "num_tokens": 710585556.0, "step": 18623 }, { "epoch": 2.3691642284696606, "ewc_loss": 0.0707085132598877, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003506398352328688, "grad_norm": 8.374363899230957, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8705609440803528, "num_tokens": 710621366.0, "step": 18624 }, { "epoch": 2.3692914387482507, "ewc_loss": 0.07028666138648987, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003464212641119957, "grad_norm": 8.252535820007324, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8711592555046082, "num_tokens": 710660217.0, "step": 18625 }, { "epoch": 2.369418649026841, "ewc_loss": 0.07055827975273132, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034913752460852265, "grad_norm": 8.344595909118652, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8629636764526367, "num_tokens": 710695798.0, "step": 18626 }, { "epoch": 2.3695458593054317, "ewc_loss": 0.07030614465475082, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000346616143360734, "grad_norm": 8.268427848815918, "learning_rate": 1e-06, "loss": 0.5241, "mean_token_accuracy": 0.8495821356773376, "num_tokens": 710732739.0, "step": 18627 }, { "epoch": 2.3696730695840222, "ewc_loss": 0.07051775604486465, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034873225376941264, "grad_norm": 8.35687255859375, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8569595217704773, "num_tokens": 710769651.0, "step": 18628 }, { "epoch": 2.3698002798626128, "ewc_loss": 0.07021786272525787, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034573327866382897, "grad_norm": 8.149810791015625, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8659749031066895, "num_tokens": 710809487.0, "step": 18629 }, { "epoch": 2.3699274901412033, "ewc_loss": 0.07089746743440628, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003525293432176113, "grad_norm": 8.438008308410645, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8705748319625854, "num_tokens": 710849152.0, "step": 18630 }, { "epoch": 2.370054700419794, "ewc_loss": 0.0700983852148056, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003445385373197496, "grad_norm": 8.28336238861084, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8527477979660034, "num_tokens": 710890933.0, "step": 18631 }, { "epoch": 2.3701819106983844, "ewc_loss": 0.07086756825447083, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035223033046349883, "grad_norm": 8.445520401000977, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8692830801010132, "num_tokens": 710928262.0, "step": 18632 }, { "epoch": 2.370309120976975, "ewc_loss": 0.07001977413892746, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034375241375528276, "grad_norm": 8.204398155212402, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8722000122070312, "num_tokens": 710962969.0, "step": 18633 }, { "epoch": 2.3704363312555654, "ewc_loss": 0.07068120688199997, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035036675399169326, "grad_norm": 8.311038970947266, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8629881143569946, "num_tokens": 710998717.0, "step": 18634 }, { "epoch": 2.370563541534156, "ewc_loss": 0.07028542459011078, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034640898229554296, "grad_norm": 8.250846862792969, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8517822027206421, "num_tokens": 711032819.0, "step": 18635 }, { "epoch": 2.3706907518127465, "ewc_loss": 0.07043969631195068, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003479516308289021, "grad_norm": 8.29893684387207, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8573064208030701, "num_tokens": 711069395.0, "step": 18636 }, { "epoch": 2.370817962091337, "ewc_loss": 0.07037939131259918, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003473485994618386, "grad_norm": 8.287479400634766, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8679797053337097, "num_tokens": 711105754.0, "step": 18637 }, { "epoch": 2.3709451723699275, "ewc_loss": 0.07026679813861847, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034622265957295895, "grad_norm": 8.254140853881836, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8617391586303711, "num_tokens": 711145291.0, "step": 18638 }, { "epoch": 2.371072382648518, "ewc_loss": 0.07047587633132935, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034831344964914024, "grad_norm": 8.317177772521973, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8676626086235046, "num_tokens": 711183431.0, "step": 18639 }, { "epoch": 2.3711995929271086, "ewc_loss": 0.07014298439025879, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003449845826253295, "grad_norm": 8.200632095336914, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8524550199508667, "num_tokens": 711229754.0, "step": 18640 }, { "epoch": 2.371326803205699, "ewc_loss": 0.07052625715732574, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034881720785051584, "grad_norm": 8.389832496643066, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8680477142333984, "num_tokens": 711269071.0, "step": 18641 }, { "epoch": 2.3714540134842896, "ewc_loss": 0.07021155953407288, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034567026887089014, "grad_norm": 8.255093574523926, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8628143668174744, "num_tokens": 711312036.0, "step": 18642 }, { "epoch": 2.37158122376288, "ewc_loss": 0.07049323618412018, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034848705399781466, "grad_norm": 8.291693687438965, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8742216229438782, "num_tokens": 711347945.0, "step": 18643 }, { "epoch": 2.3717084340414707, "ewc_loss": 0.07023560255765915, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003459107247181237, "grad_norm": 8.278938293457031, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8444629907608032, "num_tokens": 711388433.0, "step": 18644 }, { "epoch": 2.371835644320061, "ewc_loss": 0.0704718679189682, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003482734027784318, "grad_norm": 8.330007553100586, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8713361024856567, "num_tokens": 711425821.0, "step": 18645 }, { "epoch": 2.3719628545986517, "ewc_loss": 0.07025007903575897, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000346055458066985, "grad_norm": 8.300936698913574, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8602994084358215, "num_tokens": 711462725.0, "step": 18646 }, { "epoch": 2.3720900648772423, "ewc_loss": 0.07035249471664429, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003470796509645879, "grad_norm": 8.291664123535156, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8809117078781128, "num_tokens": 711499987.0, "step": 18647 }, { "epoch": 2.3722172751558324, "ewc_loss": 0.07025021314620972, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000346056796843186, "grad_norm": 8.200233459472656, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8507646322250366, "num_tokens": 711543658.0, "step": 18648 }, { "epoch": 2.3723444854344233, "ewc_loss": 0.07053300738334656, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003488847869448364, "grad_norm": 8.348665237426758, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.8776464462280273, "num_tokens": 711583875.0, "step": 18649 }, { "epoch": 2.3724716957130134, "ewc_loss": 0.07022728025913239, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034582745865918696, "grad_norm": 8.288569450378418, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8486615419387817, "num_tokens": 711625543.0, "step": 18650 }, { "epoch": 2.372598905991604, "ewc_loss": 0.07055406272411346, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003490952658466995, "grad_norm": 8.350107192993164, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.866784393787384, "num_tokens": 711660528.0, "step": 18651 }, { "epoch": 2.3727261162701945, "ewc_loss": 0.07025279104709625, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034608255373314023, "grad_norm": 8.266332626342773, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8809576034545898, "num_tokens": 711695646.0, "step": 18652 }, { "epoch": 2.372853326548785, "ewc_loss": 0.07054537534713745, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034900844912044704, "grad_norm": 8.333943367004395, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8504155874252319, "num_tokens": 711734691.0, "step": 18653 }, { "epoch": 2.3729805368273755, "ewc_loss": 0.07028603553771973, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003464150649961084, "grad_norm": 8.302864074707031, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8691475987434387, "num_tokens": 711771518.0, "step": 18654 }, { "epoch": 2.373107747105966, "ewc_loss": 0.07045529782772064, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034810768556781113, "grad_norm": 8.266736030578613, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8559256196022034, "num_tokens": 711808968.0, "step": 18655 }, { "epoch": 2.3732349573845566, "ewc_loss": 0.07036148011684418, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034716943628154695, "grad_norm": 8.310965538024902, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8732970952987671, "num_tokens": 711842944.0, "step": 18656 }, { "epoch": 2.373362167663147, "ewc_loss": 0.07041037082672119, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003476583806332201, "grad_norm": 8.305795669555664, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8621830940246582, "num_tokens": 711882740.0, "step": 18657 }, { "epoch": 2.3734893779417376, "ewc_loss": 0.07054862380027771, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003490408998914063, "grad_norm": 8.254744529724121, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8603642582893372, "num_tokens": 711925362.0, "step": 18658 }, { "epoch": 2.373616588220328, "ewc_loss": 0.07032962143421173, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003468509530648589, "grad_norm": 8.231857299804688, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8714444637298584, "num_tokens": 711971285.0, "step": 18659 }, { "epoch": 2.3737437984989187, "ewc_loss": 0.07057182490825653, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003492728865239769, "grad_norm": 8.281005859375, "learning_rate": 1e-06, "loss": 0.474, "mean_token_accuracy": 0.8622044324874878, "num_tokens": 712013306.0, "step": 18660 }, { "epoch": 2.3738710087775092, "ewc_loss": 0.07056313753128052, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034918604069389403, "grad_norm": 8.32313060760498, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8667137622833252, "num_tokens": 712050837.0, "step": 18661 }, { "epoch": 2.3739982190560998, "ewc_loss": 0.07047509402036667, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003483056207187474, "grad_norm": 8.343297958374023, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.881991982460022, "num_tokens": 712085140.0, "step": 18662 }, { "epoch": 2.3741254293346903, "ewc_loss": 0.07054533064365387, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003490079543553293, "grad_norm": 8.286377906799316, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8800275325775146, "num_tokens": 712122495.0, "step": 18663 }, { "epoch": 2.374252639613281, "ewc_loss": 0.07052644342184067, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000348819128703326, "grad_norm": 8.33535099029541, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8600224852561951, "num_tokens": 712161312.0, "step": 18664 }, { "epoch": 2.3743798498918713, "ewc_loss": 0.0705246776342392, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003488014917820692, "grad_norm": 8.345258712768555, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8649355173110962, "num_tokens": 712197275.0, "step": 18665 }, { "epoch": 2.374507060170462, "ewc_loss": 0.07050725817680359, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034862730535678566, "grad_norm": 8.346939086914062, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8536174297332764, "num_tokens": 712231309.0, "step": 18666 }, { "epoch": 2.3746342704490524, "ewc_loss": 0.07044469565153122, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034800166031345725, "grad_norm": 8.273028373718262, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8613473176956177, "num_tokens": 712264864.0, "step": 18667 }, { "epoch": 2.374761480727643, "ewc_loss": 0.0705782026052475, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000349336740327999, "grad_norm": 8.362492561340332, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8674577474594116, "num_tokens": 712303564.0, "step": 18668 }, { "epoch": 2.3748886910062335, "ewc_loss": 0.07039929926395416, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003475476405583322, "grad_norm": 8.240734100341797, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8763963580131531, "num_tokens": 712341872.0, "step": 18669 }, { "epoch": 2.375015901284824, "ewc_loss": 0.07070022076368332, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003505568893160671, "grad_norm": 8.376035690307617, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8547160625457764, "num_tokens": 712379835.0, "step": 18670 }, { "epoch": 2.3751431115634145, "ewc_loss": 0.07025133073329926, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034606794361025095, "grad_norm": 8.274192810058594, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8623666763305664, "num_tokens": 712417462.0, "step": 18671 }, { "epoch": 2.375270321842005, "ewc_loss": 0.07069770991802216, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003505318018142134, "grad_norm": 8.44907283782959, "learning_rate": 1e-06, "loss": 0.5554, "mean_token_accuracy": 0.839747428894043, "num_tokens": 712461581.0, "step": 18672 }, { "epoch": 2.375397532120595, "ewc_loss": 0.07013453543186188, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003449000942055136, "grad_norm": 8.236492156982422, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8643198609352112, "num_tokens": 712499615.0, "step": 18673 }, { "epoch": 2.375524742399186, "ewc_loss": 0.07062885910272598, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000349843263393268, "grad_norm": 8.386992454528809, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8762755393981934, "num_tokens": 712533264.0, "step": 18674 }, { "epoch": 2.375651952677776, "ewc_loss": 0.07018536329269409, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003454082761891186, "grad_norm": 8.245973587036133, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8584426045417786, "num_tokens": 712571496.0, "step": 18675 }, { "epoch": 2.3757791629563667, "ewc_loss": 0.0706818550825119, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035037327324971557, "grad_norm": 9.022802352905273, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8720061779022217, "num_tokens": 712606510.0, "step": 18676 }, { "epoch": 2.3759063732349572, "ewc_loss": 0.0696900337934494, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003404550370760262, "grad_norm": 8.096403121948242, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8749391436576843, "num_tokens": 712646218.0, "step": 18677 }, { "epoch": 2.3760335835135478, "ewc_loss": 0.0713103711605072, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003566584491636604, "grad_norm": 8.581462860107422, "learning_rate": 1e-06, "loss": 0.5644, "mean_token_accuracy": 0.8338918089866638, "num_tokens": 712681905.0, "step": 18678 }, { "epoch": 2.3761607937921383, "ewc_loss": 0.0696936696767807, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003404913586564362, "grad_norm": 8.181356430053711, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.87891685962677, "num_tokens": 712719916.0, "step": 18679 }, { "epoch": 2.376288004070729, "ewc_loss": 0.07113297283649445, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003548843669705093, "grad_norm": 8.456829071044922, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8669743537902832, "num_tokens": 712760849.0, "step": 18680 }, { "epoch": 2.3764152143493193, "ewc_loss": 0.07009877264499664, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034454240812920034, "grad_norm": 8.191447257995605, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.877602219581604, "num_tokens": 712801199.0, "step": 18681 }, { "epoch": 2.37654242462791, "ewc_loss": 0.07089639455080032, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035251863300800323, "grad_norm": 8.395689010620117, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8593425750732422, "num_tokens": 712843471.0, "step": 18682 }, { "epoch": 2.3766696349065004, "ewc_loss": 0.0702764093875885, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003463187604211271, "grad_norm": 8.281567573547363, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8658643960952759, "num_tokens": 712880649.0, "step": 18683 }, { "epoch": 2.376796845185091, "ewc_loss": 0.07060699909925461, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003496246936265379, "grad_norm": 8.43936824798584, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8434165120124817, "num_tokens": 712920574.0, "step": 18684 }, { "epoch": 2.3769240554636815, "ewc_loss": 0.07023714482784271, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034592614974826574, "grad_norm": 8.323626518249512, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8756737112998962, "num_tokens": 712962544.0, "step": 18685 }, { "epoch": 2.377051265742272, "ewc_loss": 0.07040548324584961, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034760957350954413, "grad_norm": 8.97869873046875, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8688191771507263, "num_tokens": 712999138.0, "step": 18686 }, { "epoch": 2.3771784760208625, "ewc_loss": 0.06961600482463837, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003397147520445287, "grad_norm": 8.110799789428711, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8669102191925049, "num_tokens": 713041799.0, "step": 18687 }, { "epoch": 2.377305686299453, "ewc_loss": 0.07116042077541351, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003551588742993772, "grad_norm": 8.570913314819336, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.858892560005188, "num_tokens": 713088248.0, "step": 18688 }, { "epoch": 2.3774328965780436, "ewc_loss": 0.06968539953231812, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034040873288176954, "grad_norm": 8.200491905212402, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8750135898590088, "num_tokens": 713127239.0, "step": 18689 }, { "epoch": 2.377560106856634, "ewc_loss": 0.07104651629924774, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035401989589445293, "grad_norm": 8.597027778625488, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8697744607925415, "num_tokens": 713168389.0, "step": 18690 }, { "epoch": 2.3776873171352246, "ewc_loss": 0.06998391449451447, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003433938545640558, "grad_norm": 8.367934226989746, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8596048355102539, "num_tokens": 713208137.0, "step": 18691 }, { "epoch": 2.377814527413815, "ewc_loss": 0.0705038458108902, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034859319566749036, "grad_norm": 8.443331718444824, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8559256792068481, "num_tokens": 713249344.0, "step": 18692 }, { "epoch": 2.3779417376924057, "ewc_loss": 0.07002586126327515, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003438133280724287, "grad_norm": 8.2368803024292, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8628429174423218, "num_tokens": 713291376.0, "step": 18693 }, { "epoch": 2.378068947970996, "ewc_loss": 0.07057968527078629, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000349351525073871, "grad_norm": 8.481400489807129, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8643558621406555, "num_tokens": 713329925.0, "step": 18694 }, { "epoch": 2.3781961582495867, "ewc_loss": 0.07006590068340302, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034421368036419153, "grad_norm": 8.316205024719238, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8685274720191956, "num_tokens": 713372509.0, "step": 18695 }, { "epoch": 2.378323368528177, "ewc_loss": 0.07047605514526367, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000348315283190459, "grad_norm": 8.396178245544434, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8597192764282227, "num_tokens": 713410055.0, "step": 18696 }, { "epoch": 2.378450578806768, "ewc_loss": 0.0702325701713562, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003458804276306182, "grad_norm": 8.262700080871582, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8632500171661377, "num_tokens": 713453958.0, "step": 18697 }, { "epoch": 2.378577789085358, "ewc_loss": 0.07047779858112335, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003483326581772417, "grad_norm": 8.42056655883789, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8643475770950317, "num_tokens": 713485683.0, "step": 18698 }, { "epoch": 2.3787049993639484, "ewc_loss": 0.07017318904399872, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003452866221778095, "grad_norm": 8.25603199005127, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8665766716003418, "num_tokens": 713528565.0, "step": 18699 }, { "epoch": 2.378832209642539, "ewc_loss": 0.07068131864070892, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035036791814491153, "grad_norm": 8.532793045043945, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8609597682952881, "num_tokens": 713560932.0, "step": 18700 }, { "epoch": 2.3789594199211295, "ewc_loss": 0.07010039687156677, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003445587062742561, "grad_norm": 8.301362037658691, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8628181219100952, "num_tokens": 713592891.0, "step": 18701 }, { "epoch": 2.37908663019972, "ewc_loss": 0.07065616548061371, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035011631553061306, "grad_norm": 8.354464530944824, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8632910251617432, "num_tokens": 713631285.0, "step": 18702 }, { "epoch": 2.3792138404783105, "ewc_loss": 0.07032319903373718, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034678666270338, "grad_norm": 8.314313888549805, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.848567008972168, "num_tokens": 713675143.0, "step": 18703 }, { "epoch": 2.379341050756901, "ewc_loss": 0.07042066752910614, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034776132088154554, "grad_norm": 8.342822074890137, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8713486790657043, "num_tokens": 713717556.0, "step": 18704 }, { "epoch": 2.3794682610354916, "ewc_loss": 0.07041751593351364, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003477298596408218, "grad_norm": 8.321810722351074, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8571460247039795, "num_tokens": 713763515.0, "step": 18705 }, { "epoch": 2.379595471314082, "ewc_loss": 0.07045788317918777, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003481335297692567, "grad_norm": 8.387656211853027, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8703579306602478, "num_tokens": 713801064.0, "step": 18706 }, { "epoch": 2.3797226815926726, "ewc_loss": 0.07025016844272614, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003460564184933901, "grad_norm": 8.275472640991211, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8607184886932373, "num_tokens": 713841662.0, "step": 18707 }, { "epoch": 2.379849891871263, "ewc_loss": 0.07065297663211823, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035008444683626294, "grad_norm": 8.367420196533203, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8689246773719788, "num_tokens": 713883646.0, "step": 18708 }, { "epoch": 2.3799771021498537, "ewc_loss": 0.07026125490665436, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003461671876721084, "grad_norm": 8.289167404174805, "learning_rate": 1e-06, "loss": 0.4013, "mean_token_accuracy": 0.8865923881530762, "num_tokens": 713925365.0, "step": 18709 }, { "epoch": 2.3801043124284442, "ewc_loss": 0.07052108645439148, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034876560675911605, "grad_norm": 8.263216972351074, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8623915314674377, "num_tokens": 713961142.0, "step": 18710 }, { "epoch": 2.3802315227070348, "ewc_loss": 0.07058364152908325, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034939113538712263, "grad_norm": 8.361077308654785, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8730538487434387, "num_tokens": 714001652.0, "step": 18711 }, { "epoch": 2.3803587329856253, "ewc_loss": 0.07041750103235245, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034772971412166953, "grad_norm": 8.302530288696289, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8683832287788391, "num_tokens": 714043023.0, "step": 18712 }, { "epoch": 2.380485943264216, "ewc_loss": 0.07065679877996445, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003501226892694831, "grad_norm": 8.309391975402832, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.864679753780365, "num_tokens": 714080389.0, "step": 18713 }, { "epoch": 2.3806131535428063, "ewc_loss": 0.07042810320854187, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034783571027219296, "grad_norm": 8.31975269317627, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8703181743621826, "num_tokens": 714117695.0, "step": 18714 }, { "epoch": 2.380740363821397, "ewc_loss": 0.07057160139083862, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034927070373669267, "grad_norm": 8.332905769348145, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.873301088809967, "num_tokens": 714154204.0, "step": 18715 }, { "epoch": 2.3808675740999874, "ewc_loss": 0.07051773369312286, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000348732020938769, "grad_norm": 8.27524471282959, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.849304735660553, "num_tokens": 714194857.0, "step": 18716 }, { "epoch": 2.380994784378578, "ewc_loss": 0.07060546427965164, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034960932680405676, "grad_norm": 8.292869567871094, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8780156373977661, "num_tokens": 714228034.0, "step": 18717 }, { "epoch": 2.3811219946571685, "ewc_loss": 0.07051031291484833, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003486578643787652, "grad_norm": 8.287519454956055, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8721444606781006, "num_tokens": 714269688.0, "step": 18718 }, { "epoch": 2.381249204935759, "ewc_loss": 0.07056160271167755, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003491706738714129, "grad_norm": 8.271291732788086, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8593195676803589, "num_tokens": 714309039.0, "step": 18719 }, { "epoch": 2.3813764152143495, "ewc_loss": 0.07067559659481049, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003503107000142336, "grad_norm": 8.274712562561035, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8710236549377441, "num_tokens": 714347453.0, "step": 18720 }, { "epoch": 2.3815036254929396, "ewc_loss": 0.07064163684844971, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034997102920897305, "grad_norm": 8.356101036071777, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8633449077606201, "num_tokens": 714385850.0, "step": 18721 }, { "epoch": 2.3816308357715306, "ewc_loss": 0.07059946656227112, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034954934380948544, "grad_norm": 8.263885498046875, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8606091737747192, "num_tokens": 714426735.0, "step": 18722 }, { "epoch": 2.3817580460501206, "ewc_loss": 0.07069198787212372, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035047458368353546, "grad_norm": 8.327353477478027, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8780286908149719, "num_tokens": 714465279.0, "step": 18723 }, { "epoch": 2.381885256328711, "ewc_loss": 0.07057040929794312, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034925880027003586, "grad_norm": 8.316808700561523, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8538447022438049, "num_tokens": 714505703.0, "step": 18724 }, { "epoch": 2.3820124666073017, "ewc_loss": 0.07064725458621979, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003500272869132459, "grad_norm": 8.345414161682129, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8606142401695251, "num_tokens": 714543642.0, "step": 18725 }, { "epoch": 2.3821396768858922, "ewc_loss": 0.0705675482749939, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000349230132997036, "grad_norm": 8.318460464477539, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8833303451538086, "num_tokens": 714583533.0, "step": 18726 }, { "epoch": 2.3822668871644828, "ewc_loss": 0.07063846290111542, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034993927692994475, "grad_norm": 8.341251373291016, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8709929585456848, "num_tokens": 714625577.0, "step": 18727 }, { "epoch": 2.3823940974430733, "ewc_loss": 0.07061551511287689, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003497097932267934, "grad_norm": 8.29196834564209, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8612920045852661, "num_tokens": 714664987.0, "step": 18728 }, { "epoch": 2.382521307721664, "ewc_loss": 0.07071197777986526, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003506744687911123, "grad_norm": 8.351129531860352, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8609253168106079, "num_tokens": 714705881.0, "step": 18729 }, { "epoch": 2.3826485180002543, "ewc_loss": 0.0705837607383728, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034939232864417136, "grad_norm": 8.262039184570312, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.856568455696106, "num_tokens": 714746012.0, "step": 18730 }, { "epoch": 2.382775728278845, "ewc_loss": 0.07082603871822357, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003518150479067117, "grad_norm": 8.348740577697754, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8675513863563538, "num_tokens": 714790001.0, "step": 18731 }, { "epoch": 2.3829029385574354, "ewc_loss": 0.07060234248638153, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003495780983939767, "grad_norm": 8.317777633666992, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8731651306152344, "num_tokens": 714824515.0, "step": 18732 }, { "epoch": 2.383030148836026, "ewc_loss": 0.07077033817768097, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035125802969560027, "grad_norm": 8.343766212463379, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8591241836547852, "num_tokens": 714860841.0, "step": 18733 }, { "epoch": 2.3831573591146165, "ewc_loss": 0.07063819468021393, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003499366866890341, "grad_norm": 8.280328750610352, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.870245099067688, "num_tokens": 714896373.0, "step": 18734 }, { "epoch": 2.383284569393207, "ewc_loss": 0.0708669126033783, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003522238112054765, "grad_norm": 8.371458053588867, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8688167333602905, "num_tokens": 714926684.0, "step": 18735 }, { "epoch": 2.3834117796717975, "ewc_loss": 0.07054894417524338, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000349044130416587, "grad_norm": 8.27200698852539, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8814118504524231, "num_tokens": 714961057.0, "step": 18736 }, { "epoch": 2.383538989950388, "ewc_loss": 0.07085178047418594, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035207250039093196, "grad_norm": 8.364322662353516, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8710252046585083, "num_tokens": 715001215.0, "step": 18737 }, { "epoch": 2.3836662002289786, "ewc_loss": 0.07050272822380066, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003485819906927645, "grad_norm": 8.320996284484863, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8451375961303711, "num_tokens": 715038252.0, "step": 18738 }, { "epoch": 2.383793410507569, "ewc_loss": 0.07078124582767487, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003513671108521521, "grad_norm": 8.409896850585938, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8659282326698303, "num_tokens": 715078320.0, "step": 18739 }, { "epoch": 2.3839206207861596, "ewc_loss": 0.07050368934869766, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034859159495681524, "grad_norm": 8.316673278808594, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8598480224609375, "num_tokens": 715115628.0, "step": 18740 }, { "epoch": 2.38404783106475, "ewc_loss": 0.07067643851041794, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035031908191740513, "grad_norm": 8.313531875610352, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8750198483467102, "num_tokens": 715153294.0, "step": 18741 }, { "epoch": 2.3841750413433407, "ewc_loss": 0.07057933509349823, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003493480908218771, "grad_norm": 8.270406723022461, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8671963214874268, "num_tokens": 715193736.0, "step": 18742 }, { "epoch": 2.384302251621931, "ewc_loss": 0.07072724401950836, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035082714748568833, "grad_norm": 8.300189018249512, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8688901662826538, "num_tokens": 715234539.0, "step": 18743 }, { "epoch": 2.3844294619005217, "ewc_loss": 0.07072234898805618, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003507781948428601, "grad_norm": 8.377026557922363, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8789807558059692, "num_tokens": 715271567.0, "step": 18744 }, { "epoch": 2.3845566721791123, "ewc_loss": 0.07055403292179108, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003490949748083949, "grad_norm": 8.23429012298584, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8785903453826904, "num_tokens": 715314850.0, "step": 18745 }, { "epoch": 2.3846838824577024, "ewc_loss": 0.07103732228279114, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003539279568940401, "grad_norm": 8.418466567993164, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8713149428367615, "num_tokens": 715353353.0, "step": 18746 }, { "epoch": 2.3848110927362933, "ewc_loss": 0.07048735022544861, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003484282351564616, "grad_norm": 8.3230619430542, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8763629198074341, "num_tokens": 715387843.0, "step": 18747 }, { "epoch": 2.3849383030148834, "ewc_loss": 0.07097165286540985, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035327119985595345, "grad_norm": 8.435914039611816, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8537665605545044, "num_tokens": 715428037.0, "step": 18748 }, { "epoch": 2.385065513293474, "ewc_loss": 0.07049943506717682, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003485489869490266, "grad_norm": 8.306622505187988, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8810540437698364, "num_tokens": 715464403.0, "step": 18749 }, { "epoch": 2.3851927235720645, "ewc_loss": 0.07085913419723511, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003521460166666657, "grad_norm": 8.359806060791016, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.876362681388855, "num_tokens": 715501820.0, "step": 18750 }, { "epoch": 2.385319933850655, "ewc_loss": 0.07057485729455948, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034930327092297375, "grad_norm": 8.325294494628906, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8768030405044556, "num_tokens": 715543287.0, "step": 18751 }, { "epoch": 2.3854471441292455, "ewc_loss": 0.07080341875553131, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003515889111440629, "grad_norm": 8.34896469116211, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8763404488563538, "num_tokens": 715578189.0, "step": 18752 }, { "epoch": 2.385574354407836, "ewc_loss": 0.07050114870071411, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003485661291051656, "grad_norm": 8.316720008850098, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8560483455657959, "num_tokens": 715616447.0, "step": 18753 }, { "epoch": 2.3857015646864266, "ewc_loss": 0.07070375233888626, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003505922213662416, "grad_norm": 8.352465629577637, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8705136179924011, "num_tokens": 715651692.0, "step": 18754 }, { "epoch": 2.385828774965017, "ewc_loss": 0.07061909884214401, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034974567824974656, "grad_norm": 8.35049819946289, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8573300242424011, "num_tokens": 715687733.0, "step": 18755 }, { "epoch": 2.3859559852436076, "ewc_loss": 0.07055303454399109, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034908499219454825, "grad_norm": 8.350491523742676, "learning_rate": 1e-06, "loss": 0.5204, "mean_token_accuracy": 0.8442826867103577, "num_tokens": 715727481.0, "step": 18756 }, { "epoch": 2.386083195522198, "ewc_loss": 0.0705554187297821, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003491088282316923, "grad_norm": 8.261882781982422, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8487731218338013, "num_tokens": 715767549.0, "step": 18757 }, { "epoch": 2.3862104058007887, "ewc_loss": 0.07067429274320602, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035029760329052806, "grad_norm": 8.290630340576172, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8580230474472046, "num_tokens": 715807251.0, "step": 18758 }, { "epoch": 2.386337616079379, "ewc_loss": 0.07070232927799225, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003505779604893178, "grad_norm": 8.283973693847656, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8503624200820923, "num_tokens": 715846893.0, "step": 18759 }, { "epoch": 2.3864648263579697, "ewc_loss": 0.07068482041358948, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003504028427414596, "grad_norm": 8.285102844238281, "learning_rate": 1e-06, "loss": 0.5095, "mean_token_accuracy": 0.8528047800064087, "num_tokens": 715883611.0, "step": 18760 }, { "epoch": 2.3865920366365603, "ewc_loss": 0.07086391746997833, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035219386336393654, "grad_norm": 8.393603324890137, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8682491779327393, "num_tokens": 715915598.0, "step": 18761 }, { "epoch": 2.386719246915151, "ewc_loss": 0.07052131742238998, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003487678477540612, "grad_norm": 8.20110034942627, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8741182684898376, "num_tokens": 715956647.0, "step": 18762 }, { "epoch": 2.3868464571937413, "ewc_loss": 0.0709613785147667, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003531684633344412, "grad_norm": 8.319608688354492, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8731358051300049, "num_tokens": 715996627.0, "step": 18763 }, { "epoch": 2.386973667472332, "ewc_loss": 0.0705028623342514, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003485833003651351, "grad_norm": 8.23521614074707, "learning_rate": 1e-06, "loss": 0.3917, "mean_token_accuracy": 0.8874835968017578, "num_tokens": 716027823.0, "step": 18764 }, { "epoch": 2.3871008777509224, "ewc_loss": 0.07082271575927734, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000351781869539991, "grad_norm": 8.32836627960205, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8526777029037476, "num_tokens": 716067430.0, "step": 18765 }, { "epoch": 2.387228088029513, "ewc_loss": 0.07061964273452759, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003497511497698724, "grad_norm": 8.223837852478027, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8583482503890991, "num_tokens": 716109436.0, "step": 18766 }, { "epoch": 2.3873552983081034, "ewc_loss": 0.07075770199298859, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035113171907141805, "grad_norm": 8.309115409851074, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8615618944168091, "num_tokens": 716148157.0, "step": 18767 }, { "epoch": 2.387482508586694, "ewc_loss": 0.07060936093330383, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003496482677292079, "grad_norm": 8.220959663391113, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8686946034431458, "num_tokens": 716184662.0, "step": 18768 }, { "epoch": 2.3876097188652845, "ewc_loss": 0.07109255343675613, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003544802311807871, "grad_norm": 8.372758865356445, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8591303825378418, "num_tokens": 716227382.0, "step": 18769 }, { "epoch": 2.387736929143875, "ewc_loss": 0.0704706609249115, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003482612664811313, "grad_norm": 8.280155181884766, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8695429563522339, "num_tokens": 716262236.0, "step": 18770 }, { "epoch": 2.387864139422465, "ewc_loss": 0.07102316617965698, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003537863085512072, "grad_norm": 8.358119010925293, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8761838674545288, "num_tokens": 716295541.0, "step": 18771 }, { "epoch": 2.387991349701056, "ewc_loss": 0.07053492963314056, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003489039372652769, "grad_norm": 8.346168518066406, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8692725896835327, "num_tokens": 716331796.0, "step": 18772 }, { "epoch": 2.388118559979646, "ewc_loss": 0.07077201455831528, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035127485170960426, "grad_norm": 8.350963592529297, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8571621179580688, "num_tokens": 716369539.0, "step": 18773 }, { "epoch": 2.3882457702582367, "ewc_loss": 0.07061297446489334, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003496844437904656, "grad_norm": 8.601274490356445, "learning_rate": 1e-06, "loss": 0.4123, "mean_token_accuracy": 0.8828372955322266, "num_tokens": 716409806.0, "step": 18774 }, { "epoch": 2.3883729805368272, "ewc_loss": 0.07021636515855789, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034571834839880466, "grad_norm": 8.199962615966797, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8587625026702881, "num_tokens": 716448191.0, "step": 18775 }, { "epoch": 2.3885001908154178, "ewc_loss": 0.07112690806388855, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035482380189932883, "grad_norm": 8.383772850036621, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8679196834564209, "num_tokens": 716486522.0, "step": 18776 }, { "epoch": 2.3886274010940083, "ewc_loss": 0.0702984482049942, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034653913462534547, "grad_norm": 8.230945587158203, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8661059737205505, "num_tokens": 716519383.0, "step": 18777 }, { "epoch": 2.388754611372599, "ewc_loss": 0.07114031910896301, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035495791235007346, "grad_norm": 8.417235374450684, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8684415817260742, "num_tokens": 716556136.0, "step": 18778 }, { "epoch": 2.3888818216511893, "ewc_loss": 0.07039155066013336, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003474701661616564, "grad_norm": 8.317641258239746, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8591279983520508, "num_tokens": 716592346.0, "step": 18779 }, { "epoch": 2.38900903192978, "ewc_loss": 0.07077656686306, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035132039920426905, "grad_norm": 8.333491325378418, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8594925403594971, "num_tokens": 716633996.0, "step": 18780 }, { "epoch": 2.3891362422083704, "ewc_loss": 0.0706750825047493, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003503055195324123, "grad_norm": 8.35490608215332, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8610402345657349, "num_tokens": 716669638.0, "step": 18781 }, { "epoch": 2.389263452486961, "ewc_loss": 0.07047519087791443, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034830663935281336, "grad_norm": 8.265157699584961, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8567211627960205, "num_tokens": 716711194.0, "step": 18782 }, { "epoch": 2.3893906627655515, "ewc_loss": 0.07077951729297638, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003513498813845217, "grad_norm": 8.321081161499023, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8725402355194092, "num_tokens": 716753402.0, "step": 18783 }, { "epoch": 2.389517873044142, "ewc_loss": 0.07059581577777863, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034951281850226223, "grad_norm": 8.29811954498291, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8692359924316406, "num_tokens": 716794638.0, "step": 18784 }, { "epoch": 2.3896450833227325, "ewc_loss": 0.07074585556983948, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035101326648145914, "grad_norm": 8.31861400604248, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8373395204544067, "num_tokens": 716834003.0, "step": 18785 }, { "epoch": 2.389772293601323, "ewc_loss": 0.07072526216506958, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035080735688097775, "grad_norm": 8.288290977478027, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8715460896492004, "num_tokens": 716867349.0, "step": 18786 }, { "epoch": 2.3898995038799136, "ewc_loss": 0.07065282762050629, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003500829916447401, "grad_norm": 8.34605598449707, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8649672269821167, "num_tokens": 716904271.0, "step": 18787 }, { "epoch": 2.390026714158504, "ewc_loss": 0.07063594460487366, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034991413122043014, "grad_norm": 8.376166343688965, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8681022524833679, "num_tokens": 716936165.0, "step": 18788 }, { "epoch": 2.3901539244370946, "ewc_loss": 0.07079072296619415, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035146187292411923, "grad_norm": 8.34854507446289, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8638362884521484, "num_tokens": 716974011.0, "step": 18789 }, { "epoch": 2.390281134715685, "ewc_loss": 0.07068181037902832, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003503727784845978, "grad_norm": 8.317187309265137, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8654345870018005, "num_tokens": 717012438.0, "step": 18790 }, { "epoch": 2.3904083449942757, "ewc_loss": 0.07067535072565079, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003503081970848143, "grad_norm": 8.32186222076416, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8570644855499268, "num_tokens": 717051755.0, "step": 18791 }, { "epoch": 2.390535555272866, "ewc_loss": 0.07062308490276337, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034978552139364183, "grad_norm": 8.264349937438965, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8665943741798401, "num_tokens": 717090740.0, "step": 18792 }, { "epoch": 2.3906627655514567, "ewc_loss": 0.07086832821369171, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035223792656324804, "grad_norm": 8.360614776611328, "learning_rate": 1e-06, "loss": 0.5307, "mean_token_accuracy": 0.8396087884902954, "num_tokens": 717132033.0, "step": 18793 }, { "epoch": 2.390789975830047, "ewc_loss": 0.07061879336833954, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034974259324371815, "grad_norm": 8.30572509765625, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8725368976593018, "num_tokens": 717168255.0, "step": 18794 }, { "epoch": 2.390917186108638, "ewc_loss": 0.07081878185272217, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035174249205738306, "grad_norm": 8.340169906616211, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8626817464828491, "num_tokens": 717206863.0, "step": 18795 }, { "epoch": 2.391044396387228, "ewc_loss": 0.0706678181886673, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003502328763715923, "grad_norm": 8.31122875213623, "learning_rate": 1e-06, "loss": 0.5258, "mean_token_accuracy": 0.8441835641860962, "num_tokens": 717245374.0, "step": 18796 }, { "epoch": 2.3911716066658184, "ewc_loss": 0.07076172530651093, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035117194056510925, "grad_norm": 8.315442085266113, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8847667574882507, "num_tokens": 717279821.0, "step": 18797 }, { "epoch": 2.391298816944409, "ewc_loss": 0.07066482305526733, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035020289942622185, "grad_norm": 8.324451446533203, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8580305576324463, "num_tokens": 717318896.0, "step": 18798 }, { "epoch": 2.3914260272229995, "ewc_loss": 0.0706634521484375, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003501891915220767, "grad_norm": 8.278236389160156, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8566298484802246, "num_tokens": 717361493.0, "step": 18799 }, { "epoch": 2.39155323750159, "ewc_loss": 0.07077419757843018, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003512966795824468, "grad_norm": 8.29376220703125, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8554588556289673, "num_tokens": 717402801.0, "step": 18800 }, { "epoch": 2.3916804477801805, "ewc_loss": 0.07063804566860199, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003499351441860199, "grad_norm": 8.286540031433105, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8836981058120728, "num_tokens": 717441275.0, "step": 18801 }, { "epoch": 2.391807658058771, "ewc_loss": 0.07066527009010315, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003502073814161122, "grad_norm": 8.289226531982422, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.853650689125061, "num_tokens": 717477423.0, "step": 18802 }, { "epoch": 2.3919348683373616, "ewc_loss": 0.0708131343126297, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035168605973012745, "grad_norm": 8.309371948242188, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8585520386695862, "num_tokens": 717512978.0, "step": 18803 }, { "epoch": 2.392062078615952, "ewc_loss": 0.07078367471694946, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000351391383446753, "grad_norm": 8.27987003326416, "learning_rate": 1e-06, "loss": 0.4008, "mean_token_accuracy": 0.8843454122543335, "num_tokens": 717549710.0, "step": 18804 }, { "epoch": 2.3921892888945426, "ewc_loss": 0.07084453105926514, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003519999736454338, "grad_norm": 8.310164451599121, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8624401092529297, "num_tokens": 717586588.0, "step": 18805 }, { "epoch": 2.392316499173133, "ewc_loss": 0.0708027333021164, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003515820426400751, "grad_norm": 8.266563415527344, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8777027130126953, "num_tokens": 717621507.0, "step": 18806 }, { "epoch": 2.3924437094517237, "ewc_loss": 0.07095856964588165, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003531404072418809, "grad_norm": 8.337445259094238, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8668292760848999, "num_tokens": 717655915.0, "step": 18807 }, { "epoch": 2.392570919730314, "ewc_loss": 0.07079453021287918, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035149999894201756, "grad_norm": 8.302879333496094, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8622317314147949, "num_tokens": 717696357.0, "step": 18808 }, { "epoch": 2.3926981300089047, "ewc_loss": 0.070824533700943, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003518000594340265, "grad_norm": 8.308805465698242, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.878593921661377, "num_tokens": 717735397.0, "step": 18809 }, { "epoch": 2.3928253402874953, "ewc_loss": 0.07076992094516754, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003512538969516754, "grad_norm": 8.26696491241455, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.869051992893219, "num_tokens": 717770081.0, "step": 18810 }, { "epoch": 2.392952550566086, "ewc_loss": 0.07082279026508331, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035178259713575244, "grad_norm": 8.354670524597168, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8817743062973022, "num_tokens": 717805262.0, "step": 18811 }, { "epoch": 2.3930797608446763, "ewc_loss": 0.07074128091335297, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035096751525998116, "grad_norm": 8.331415176391602, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8654996752738953, "num_tokens": 717841877.0, "step": 18812 }, { "epoch": 2.393206971123267, "ewc_loss": 0.07086935639381409, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003522482584230602, "grad_norm": 8.388937950134277, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8495005965232849, "num_tokens": 717881435.0, "step": 18813 }, { "epoch": 2.3933341814018574, "ewc_loss": 0.07068267464637756, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035038142232224345, "grad_norm": 8.291647911071777, "learning_rate": 1e-06, "loss": 0.4015, "mean_token_accuracy": 0.8851132392883301, "num_tokens": 717922240.0, "step": 18814 }, { "epoch": 2.393461391680448, "ewc_loss": 0.07078762352466583, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003514309355523437, "grad_norm": 8.523460388183594, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8637772798538208, "num_tokens": 717958438.0, "step": 18815 }, { "epoch": 2.3935886019590384, "ewc_loss": 0.07035473734140396, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003471020609140396, "grad_norm": 8.28984260559082, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.856785774230957, "num_tokens": 717994451.0, "step": 18816 }, { "epoch": 2.393715812237629, "ewc_loss": 0.07083992660045624, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003519539022818208, "grad_norm": 8.39832878112793, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.868237316608429, "num_tokens": 718034318.0, "step": 18817 }, { "epoch": 2.3938430225162195, "ewc_loss": 0.07039707899093628, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034752549254335463, "grad_norm": 8.217832565307617, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.866400957107544, "num_tokens": 718074994.0, "step": 18818 }, { "epoch": 2.3939702327948096, "ewc_loss": 0.07085272669792175, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003520819591358304, "grad_norm": 8.359322547912598, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8620786666870117, "num_tokens": 718118849.0, "step": 18819 }, { "epoch": 2.3940974430734006, "ewc_loss": 0.07042033225297928, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003477580030448735, "grad_norm": 8.29726791381836, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8722876310348511, "num_tokens": 718152874.0, "step": 18820 }, { "epoch": 2.3942246533519906, "ewc_loss": 0.0708606019616127, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003521607141010463, "grad_norm": 8.371527671813965, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8594876527786255, "num_tokens": 718194206.0, "step": 18821 }, { "epoch": 2.394351863630581, "ewc_loss": 0.07043427228927612, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034789741039276123, "grad_norm": 8.250059127807617, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8704233169555664, "num_tokens": 718227408.0, "step": 18822 }, { "epoch": 2.3944790739091717, "ewc_loss": 0.07099908590316772, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035354550345800817, "grad_norm": 8.389678955078125, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8594195246696472, "num_tokens": 718266962.0, "step": 18823 }, { "epoch": 2.3946062841877622, "ewc_loss": 0.07046222686767578, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003481769817881286, "grad_norm": 8.256467819213867, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8712555170059204, "num_tokens": 718307873.0, "step": 18824 }, { "epoch": 2.3947334944663528, "ewc_loss": 0.07088962942361832, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035245096660219133, "grad_norm": 8.330060958862305, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8724362254142761, "num_tokens": 718350296.0, "step": 18825 }, { "epoch": 2.3948607047449433, "ewc_loss": 0.07057345658540726, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003492892428766936, "grad_norm": 8.312397956848145, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.86839359998703, "num_tokens": 718390734.0, "step": 18826 }, { "epoch": 2.394987915023534, "ewc_loss": 0.07074439525604248, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035099865635856986, "grad_norm": 8.320849418640137, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8816189169883728, "num_tokens": 718435630.0, "step": 18827 }, { "epoch": 2.3951151253021243, "ewc_loss": 0.07071876525878906, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035074236802756786, "grad_norm": 8.298961639404297, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8725508451461792, "num_tokens": 718474791.0, "step": 18828 }, { "epoch": 2.395242335580715, "ewc_loss": 0.07086866348981857, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003522413317114115, "grad_norm": 8.35915470123291, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8476061820983887, "num_tokens": 718514957.0, "step": 18829 }, { "epoch": 2.3953695458593054, "ewc_loss": 0.07063624262809753, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034991707070730627, "grad_norm": 8.311741828918457, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8779631853103638, "num_tokens": 718551761.0, "step": 18830 }, { "epoch": 2.395496756137896, "ewc_loss": 0.07092598080635071, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003528145025484264, "grad_norm": 8.419437408447266, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8652365207672119, "num_tokens": 718586445.0, "step": 18831 }, { "epoch": 2.3956239664164865, "ewc_loss": 0.07049524784088135, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003485071938484907, "grad_norm": 10.97366714477539, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8615140914916992, "num_tokens": 718631105.0, "step": 18832 }, { "epoch": 2.395751176695077, "ewc_loss": 0.07170848548412323, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003606395039241761, "grad_norm": 8.225746154785156, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.861920952796936, "num_tokens": 718672278.0, "step": 18833 }, { "epoch": 2.3958783869736675, "ewc_loss": 0.0730462372303009, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003740171086974442, "grad_norm": 8.827047348022461, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8711561560630798, "num_tokens": 718713191.0, "step": 18834 }, { "epoch": 2.396005597252258, "ewc_loss": 0.07012651860713959, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003448198258411139, "grad_norm": 8.235760688781738, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8597628474235535, "num_tokens": 718745012.0, "step": 18835 }, { "epoch": 2.3961328075308486, "ewc_loss": 0.07292294502258301, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003727841831278056, "grad_norm": 8.713805198669434, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8669946193695068, "num_tokens": 718780149.0, "step": 18836 }, { "epoch": 2.396260017809439, "ewc_loss": 0.07059334963560104, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003494881675578654, "grad_norm": 8.340234756469727, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.860509991645813, "num_tokens": 718812831.0, "step": 18837 }, { "epoch": 2.3963872280880296, "ewc_loss": 0.07196857780218124, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003632404550444335, "grad_norm": 8.615367889404297, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8745049238204956, "num_tokens": 718847650.0, "step": 18838 }, { "epoch": 2.39651443836662, "ewc_loss": 0.07085303217172623, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035208501503802836, "grad_norm": 8.355195045471191, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8668057918548584, "num_tokens": 718883417.0, "step": 18839 }, { "epoch": 2.3966416486452107, "ewc_loss": 0.07150883227586746, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003586430102586746, "grad_norm": 8.558865547180176, "learning_rate": 1e-06, "loss": 0.3761, "mean_token_accuracy": 0.8913363218307495, "num_tokens": 718915501.0, "step": 18840 }, { "epoch": 2.396768858923801, "ewc_loss": 0.07081876695156097, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003517423465382308, "grad_norm": 8.318954467773438, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8809236884117126, "num_tokens": 718954485.0, "step": 18841 }, { "epoch": 2.3968960692023917, "ewc_loss": 0.07124859094619751, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003560405457392335, "grad_norm": 8.496301651000977, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.851271390914917, "num_tokens": 718995815.0, "step": 18842 }, { "epoch": 2.3970232794809823, "ewc_loss": 0.0706631988286972, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035018668859265745, "grad_norm": 8.271027565002441, "learning_rate": 1e-06, "loss": 0.3949, "mean_token_accuracy": 0.8819524049758911, "num_tokens": 719032820.0, "step": 18843 }, { "epoch": 2.3971504897595723, "ewc_loss": 0.0711653083562851, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035520773963071406, "grad_norm": 8.451033592224121, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.864152193069458, "num_tokens": 719066966.0, "step": 18844 }, { "epoch": 2.3972777000381633, "ewc_loss": 0.07063119858503342, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034986669197678566, "grad_norm": 11.001333236694336, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8651912808418274, "num_tokens": 719108768.0, "step": 18845 }, { "epoch": 2.3974049103167534, "ewc_loss": 0.07167857885360718, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003603404329624027, "grad_norm": 8.223461151123047, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8635552525520325, "num_tokens": 719151451.0, "step": 18846 }, { "epoch": 2.397532120595344, "ewc_loss": 0.07306300103664398, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003741846594493836, "grad_norm": 8.830493927001953, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8578525185585022, "num_tokens": 719192103.0, "step": 18847 }, { "epoch": 2.3976593308739345, "ewc_loss": 0.07014130800962448, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003449677606113255, "grad_norm": 8.215920448303223, "learning_rate": 1e-06, "loss": 0.479, "mean_token_accuracy": 0.8641067743301392, "num_tokens": 719226514.0, "step": 18848 }, { "epoch": 2.397786541152525, "ewc_loss": 0.07319191098213196, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00037547381361946464, "grad_norm": 8.779623985290527, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8908162117004395, "num_tokens": 719262944.0, "step": 18849 }, { "epoch": 2.3979137514311155, "ewc_loss": 0.07059530913829803, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003495077835395932, "grad_norm": 8.268239974975586, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8720828294754028, "num_tokens": 719306707.0, "step": 18850 }, { "epoch": 2.398040961709706, "ewc_loss": 0.07226473093032837, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036620194441638887, "grad_norm": 8.614256858825684, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8725452423095703, "num_tokens": 719347523.0, "step": 18851 }, { "epoch": 2.3981681719882966, "ewc_loss": 0.0708770677447319, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003523253835737705, "grad_norm": 8.370250701904297, "learning_rate": 1e-06, "loss": 0.3894, "mean_token_accuracy": 0.885326087474823, "num_tokens": 719379474.0, "step": 18852 }, { "epoch": 2.398295382266887, "ewc_loss": 0.07169464230537415, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036050111521035433, "grad_norm": 8.555557250976562, "learning_rate": 1e-06, "loss": 0.518, "mean_token_accuracy": 0.8508813977241516, "num_tokens": 719421624.0, "step": 18853 }, { "epoch": 2.3984225925454776, "ewc_loss": 0.07092107832431793, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003527655208017677, "grad_norm": 8.383284568786621, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8632439374923706, "num_tokens": 719455570.0, "step": 18854 }, { "epoch": 2.398549802824068, "ewc_loss": 0.07134641706943512, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035701890010386705, "grad_norm": 8.46982192993164, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8829368948936462, "num_tokens": 719500300.0, "step": 18855 }, { "epoch": 2.3986770131026587, "ewc_loss": 0.07081867754459381, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003517414734233171, "grad_norm": 8.455100059509277, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.851715624332428, "num_tokens": 719538119.0, "step": 18856 }, { "epoch": 2.398804223381249, "ewc_loss": 0.0709439218044281, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035299392766319215, "grad_norm": 8.41908073425293, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8556881546974182, "num_tokens": 719574012.0, "step": 18857 }, { "epoch": 2.3989314336598397, "ewc_loss": 0.07084700465202332, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035202474100515246, "grad_norm": 8.466755867004395, "learning_rate": 1e-06, "loss": 0.4251, "mean_token_accuracy": 0.8785814642906189, "num_tokens": 719609548.0, "step": 18858 }, { "epoch": 2.3990586439384303, "ewc_loss": 0.07056286931037903, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034918339224532247, "grad_norm": 8.34699821472168, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8606171607971191, "num_tokens": 719647280.0, "step": 18859 }, { "epoch": 2.399185854217021, "ewc_loss": 0.07088503241539001, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003524049825500697, "grad_norm": 8.368387222290039, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8813381791114807, "num_tokens": 719686987.0, "step": 18860 }, { "epoch": 2.3993130644956113, "ewc_loss": 0.07064048945903778, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000349959620507434, "grad_norm": 8.363115310668945, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8849892020225525, "num_tokens": 719727974.0, "step": 18861 }, { "epoch": 2.399440274774202, "ewc_loss": 0.07069700956344604, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003505248168949038, "grad_norm": 8.379223823547363, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8662101626396179, "num_tokens": 719764308.0, "step": 18862 }, { "epoch": 2.3995674850527924, "ewc_loss": 0.07066355645656586, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035019023925997317, "grad_norm": 8.319995880126953, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8544667959213257, "num_tokens": 719801872.0, "step": 18863 }, { "epoch": 2.399694695331383, "ewc_loss": 0.07079444825649261, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035149918403476477, "grad_norm": 8.395610809326172, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8686441779136658, "num_tokens": 719842504.0, "step": 18864 }, { "epoch": 2.3998219056099734, "ewc_loss": 0.07063816487789154, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003499363665468991, "grad_norm": 8.370800971984863, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8621257543563843, "num_tokens": 719878885.0, "step": 18865 }, { "epoch": 2.399949115888564, "ewc_loss": 0.07078881561756134, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003514428099151701, "grad_norm": 8.358205795288086, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8507660627365112, "num_tokens": 719922891.0, "step": 18866 }, { "epoch": 2.4000763261671545, "ewc_loss": 0.07057598233222961, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034931456320919096, "grad_norm": 8.337386131286621, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8640711307525635, "num_tokens": 719960113.0, "step": 18867 }, { "epoch": 2.400203536445745, "ewc_loss": 0.07080458849668503, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000351600581780076, "grad_norm": 8.283655166625977, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8661400079727173, "num_tokens": 720006079.0, "step": 18868 }, { "epoch": 2.400330746724335, "ewc_loss": 0.07080904394388199, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035164511064067483, "grad_norm": 8.355918884277344, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.853164792060852, "num_tokens": 720042219.0, "step": 18869 }, { "epoch": 2.400457957002926, "ewc_loss": 0.07070834934711456, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003506382054183632, "grad_norm": 8.303423881530762, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8672385811805725, "num_tokens": 720082574.0, "step": 18870 }, { "epoch": 2.400585167281516, "ewc_loss": 0.07085207104682922, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003520754398778081, "grad_norm": 8.340245246887207, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8629043102264404, "num_tokens": 720115583.0, "step": 18871 }, { "epoch": 2.4007123775601067, "ewc_loss": 0.07080815732479095, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035163629218004644, "grad_norm": 8.313817024230957, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8562705516815186, "num_tokens": 720151350.0, "step": 18872 }, { "epoch": 2.400839587838697, "ewc_loss": 0.07092995196580887, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003528542001731694, "grad_norm": 8.419371604919434, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8572951555252075, "num_tokens": 720187072.0, "step": 18873 }, { "epoch": 2.4009667981172877, "ewc_loss": 0.07065588235855103, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003501134633552283, "grad_norm": 8.27946949005127, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.869303286075592, "num_tokens": 720222249.0, "step": 18874 }, { "epoch": 2.4010940083958783, "ewc_loss": 0.0710049420595169, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003536040603648871, "grad_norm": 8.394512176513672, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.865139365196228, "num_tokens": 720264682.0, "step": 18875 }, { "epoch": 2.401221218674469, "ewc_loss": 0.07062188535928726, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003497735597193241, "grad_norm": 8.317152976989746, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8629062175750732, "num_tokens": 720309935.0, "step": 18876 }, { "epoch": 2.4013484289530593, "ewc_loss": 0.0710471123456955, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035402580397203565, "grad_norm": 8.443821907043457, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8600975275039673, "num_tokens": 720343992.0, "step": 18877 }, { "epoch": 2.40147563923165, "ewc_loss": 0.07051773369312286, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003487319918349385, "grad_norm": 8.283882141113281, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8644837141036987, "num_tokens": 720379707.0, "step": 18878 }, { "epoch": 2.4016028495102404, "ewc_loss": 0.07105197012424469, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003540743491612375, "grad_norm": 8.415507316589355, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8626362085342407, "num_tokens": 720419796.0, "step": 18879 }, { "epoch": 2.401730059788831, "ewc_loss": 0.07060334086418152, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034958813921548426, "grad_norm": 8.20519733428955, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8681376576423645, "num_tokens": 720460919.0, "step": 18880 }, { "epoch": 2.4018572700674214, "ewc_loss": 0.07115024328231812, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035505712730810046, "grad_norm": 8.460636138916016, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8617337942123413, "num_tokens": 720498403.0, "step": 18881 }, { "epoch": 2.401984480346012, "ewc_loss": 0.07053789496421814, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034893365227617323, "grad_norm": 8.225179672241211, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8660473227500916, "num_tokens": 720534931.0, "step": 18882 }, { "epoch": 2.4021116906246025, "ewc_loss": 0.07131395488977432, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003566942468751222, "grad_norm": 8.453022003173828, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8701500296592712, "num_tokens": 720574062.0, "step": 18883 }, { "epoch": 2.402238900903193, "ewc_loss": 0.07061678916215897, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003497225698083639, "grad_norm": 8.270171165466309, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8587313890457153, "num_tokens": 720611552.0, "step": 18884 }, { "epoch": 2.4023661111817836, "ewc_loss": 0.07123711705207825, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003559258475434035, "grad_norm": 8.437525749206543, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8641757965087891, "num_tokens": 720647886.0, "step": 18885 }, { "epoch": 2.402493321460374, "ewc_loss": 0.07062968611717224, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003498515288811177, "grad_norm": 8.211442947387695, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8714203834533691, "num_tokens": 720688034.0, "step": 18886 }, { "epoch": 2.4026205317389646, "ewc_loss": 0.07135036587715149, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003570583648979664, "grad_norm": 8.437987327575684, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.873015284538269, "num_tokens": 720727805.0, "step": 18887 }, { "epoch": 2.402747742017555, "ewc_loss": 0.0707118958234787, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035067362478002906, "grad_norm": 8.3402738571167, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.869343101978302, "num_tokens": 720759717.0, "step": 18888 }, { "epoch": 2.4028749522961457, "ewc_loss": 0.07107517123222351, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000354306394001469, "grad_norm": 8.359936714172363, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8595808744430542, "num_tokens": 720790435.0, "step": 18889 }, { "epoch": 2.403002162574736, "ewc_loss": 0.07090984284877777, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003526530636008829, "grad_norm": 8.336305618286133, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8628283739089966, "num_tokens": 720831715.0, "step": 18890 }, { "epoch": 2.4031293728533267, "ewc_loss": 0.07099363207817078, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035349102108739316, "grad_norm": 8.363325119018555, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8658895492553711, "num_tokens": 720869598.0, "step": 18891 }, { "epoch": 2.403256583131917, "ewc_loss": 0.07097267359495163, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035328141530044377, "grad_norm": 8.305871963500977, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8608819246292114, "num_tokens": 720911122.0, "step": 18892 }, { "epoch": 2.403383793410508, "ewc_loss": 0.07099322229623795, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035348691744729877, "grad_norm": 8.437339782714844, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8696386814117432, "num_tokens": 720952265.0, "step": 18893 }, { "epoch": 2.403511003689098, "ewc_loss": 0.07084808498620987, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003520355385262519, "grad_norm": 8.37354850769043, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8696555495262146, "num_tokens": 720991191.0, "step": 18894 }, { "epoch": 2.4036382139676884, "ewc_loss": 0.07095992565155029, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003531539405230433, "grad_norm": 8.345795631408691, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8761237859725952, "num_tokens": 721032524.0, "step": 18895 }, { "epoch": 2.403765424246279, "ewc_loss": 0.0709480345249176, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003530350513756275, "grad_norm": 8.417710304260254, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8646694421768188, "num_tokens": 721072699.0, "step": 18896 }, { "epoch": 2.4038926345248695, "ewc_loss": 0.07073874026536942, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035094207851216197, "grad_norm": 8.334954261779785, "learning_rate": 1e-06, "loss": 0.4156, "mean_token_accuracy": 0.8788416385650635, "num_tokens": 721109467.0, "step": 18897 }, { "epoch": 2.40401984480346, "ewc_loss": 0.07092095911502838, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000352764327544719, "grad_norm": 8.42201042175293, "learning_rate": 1e-06, "loss": 0.5485, "mean_token_accuracy": 0.8402974605560303, "num_tokens": 721142108.0, "step": 18898 }, { "epoch": 2.4041470550820505, "ewc_loss": 0.07054774463176727, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003490321687422693, "grad_norm": 8.272515296936035, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8587279319763184, "num_tokens": 721179875.0, "step": 18899 }, { "epoch": 2.404274265360641, "ewc_loss": 0.07097721844911575, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003533268754836172, "grad_norm": 8.498294830322266, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8621679544448853, "num_tokens": 721217141.0, "step": 18900 }, { "epoch": 2.4044014756392316, "ewc_loss": 0.07038485258817673, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003474032273516059, "grad_norm": 8.194418907165527, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8591096997261047, "num_tokens": 721256357.0, "step": 18901 }, { "epoch": 2.404528685917822, "ewc_loss": 0.0711410641670227, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003549653629306704, "grad_norm": 8.402997016906738, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8643379211425781, "num_tokens": 721292610.0, "step": 18902 }, { "epoch": 2.4046558961964126, "ewc_loss": 0.07039327919483185, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003474874247331172, "grad_norm": 8.229899406433105, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8730777502059937, "num_tokens": 721329048.0, "step": 18903 }, { "epoch": 2.404783106475003, "ewc_loss": 0.07122177630662918, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035577244125306606, "grad_norm": 8.468454360961914, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8796226978302002, "num_tokens": 721364249.0, "step": 18904 }, { "epoch": 2.4049103167535937, "ewc_loss": 0.070518359541893, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003487382782623172, "grad_norm": 8.286187171936035, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8719649910926819, "num_tokens": 721391154.0, "step": 18905 }, { "epoch": 2.405037527032184, "ewc_loss": 0.07103893160820007, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035394399310462177, "grad_norm": 8.395183563232422, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8764593601226807, "num_tokens": 721428443.0, "step": 18906 }, { "epoch": 2.4051647373107747, "ewc_loss": 0.07067124545574188, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003502671024762094, "grad_norm": 8.248664855957031, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8557106256484985, "num_tokens": 721474545.0, "step": 18907 }, { "epoch": 2.4052919475893653, "ewc_loss": 0.07114755362272263, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035503023536875844, "grad_norm": 8.45787525177002, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8656044006347656, "num_tokens": 721513153.0, "step": 18908 }, { "epoch": 2.405419157867956, "ewc_loss": 0.07059060037136078, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003494607226457447, "grad_norm": 8.274903297424316, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8625407218933105, "num_tokens": 721556287.0, "step": 18909 }, { "epoch": 2.4055463681465463, "ewc_loss": 0.07109314948320389, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003544861974660307, "grad_norm": 8.443586349487305, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8732320070266724, "num_tokens": 721590219.0, "step": 18910 }, { "epoch": 2.405673578425137, "ewc_loss": 0.07046754658222198, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003482301835902035, "grad_norm": 8.288073539733887, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.864549458026886, "num_tokens": 721627308.0, "step": 18911 }, { "epoch": 2.4058007887037274, "ewc_loss": 0.07097882032394409, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035334291169419885, "grad_norm": 8.458100318908691, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8621900081634521, "num_tokens": 721664218.0, "step": 18912 }, { "epoch": 2.405927998982318, "ewc_loss": 0.07048679888248444, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003484227054286748, "grad_norm": 8.2975435256958, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8756587505340576, "num_tokens": 721703511.0, "step": 18913 }, { "epoch": 2.4060552092609084, "ewc_loss": 0.07096223533153534, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003531770780682564, "grad_norm": 8.359437942504883, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8778574466705322, "num_tokens": 721741512.0, "step": 18914 }, { "epoch": 2.406182419539499, "ewc_loss": 0.07057832926511765, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034933799179270864, "grad_norm": 8.285484313964844, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8697768449783325, "num_tokens": 721778443.0, "step": 18915 }, { "epoch": 2.4063096298180895, "ewc_loss": 0.07093692570924759, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003529239329509437, "grad_norm": 8.429295539855957, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8625006079673767, "num_tokens": 721817226.0, "step": 18916 }, { "epoch": 2.4064368400966796, "ewc_loss": 0.07056451588869095, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003491998359095305, "grad_norm": 8.230643272399902, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8510153889656067, "num_tokens": 721859419.0, "step": 18917 }, { "epoch": 2.4065640503752705, "ewc_loss": 0.07096223533153534, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003531770780682564, "grad_norm": 8.338924407958984, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8787881135940552, "num_tokens": 721895183.0, "step": 18918 }, { "epoch": 2.4066912606538606, "ewc_loss": 0.07081671059131622, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003517217410262674, "grad_norm": 8.373587608337402, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8505512475967407, "num_tokens": 721933566.0, "step": 18919 }, { "epoch": 2.406818470932451, "ewc_loss": 0.070892333984375, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035247806226834655, "grad_norm": 8.387776374816895, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8685315847396851, "num_tokens": 721968827.0, "step": 18920 }, { "epoch": 2.4069456812110417, "ewc_loss": 0.07081781327724457, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003517328586895019, "grad_norm": 8.318180084228516, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8694186210632324, "num_tokens": 722004989.0, "step": 18921 }, { "epoch": 2.407072891489632, "ewc_loss": 0.07102400064468384, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003537947195582092, "grad_norm": 8.429072380065918, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8704753518104553, "num_tokens": 722043681.0, "step": 18922 }, { "epoch": 2.4072001017682227, "ewc_loss": 0.07056629657745361, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034921764745377004, "grad_norm": 8.277347564697266, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8671602606773376, "num_tokens": 722086541.0, "step": 18923 }, { "epoch": 2.4073273120468133, "ewc_loss": 0.07108701765537262, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035442490479908884, "grad_norm": 8.399222373962402, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8677940368652344, "num_tokens": 722125519.0, "step": 18924 }, { "epoch": 2.407454522325404, "ewc_loss": 0.07060615718364716, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034961625351570547, "grad_norm": 8.379325866699219, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8616889715194702, "num_tokens": 722157665.0, "step": 18925 }, { "epoch": 2.4075817326039943, "ewc_loss": 0.07078702747821808, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035142494016326964, "grad_norm": 8.337409019470215, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8793100714683533, "num_tokens": 722191819.0, "step": 18926 }, { "epoch": 2.407708942882585, "ewc_loss": 0.0707852840423584, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003514075360726565, "grad_norm": 8.365440368652344, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8557407259941101, "num_tokens": 722229755.0, "step": 18927 }, { "epoch": 2.4078361531611754, "ewc_loss": 0.07060516625642776, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003496063582133502, "grad_norm": 8.384977340698242, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8549842834472656, "num_tokens": 722267440.0, "step": 18928 }, { "epoch": 2.407963363439766, "ewc_loss": 0.07064621150493622, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003500168095342815, "grad_norm": 8.25736141204834, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8626694083213806, "num_tokens": 722310102.0, "step": 18929 }, { "epoch": 2.4080905737183564, "ewc_loss": 0.07096873968839645, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035324206692166626, "grad_norm": 8.459561347961426, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8597352504730225, "num_tokens": 722346713.0, "step": 18930 }, { "epoch": 2.408217783996947, "ewc_loss": 0.07044188678264618, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003479735169094056, "grad_norm": 8.251408576965332, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.860633134841919, "num_tokens": 722387803.0, "step": 18931 }, { "epoch": 2.4083449942755375, "ewc_loss": 0.07103761285543442, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035393080906942487, "grad_norm": 8.409323692321777, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8577756881713867, "num_tokens": 722428943.0, "step": 18932 }, { "epoch": 2.408472204554128, "ewc_loss": 0.07057107985019684, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034926546504721045, "grad_norm": 8.368643760681152, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8587673306465149, "num_tokens": 722467860.0, "step": 18933 }, { "epoch": 2.4085994148327186, "ewc_loss": 0.07092950493097305, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035284971818327904, "grad_norm": 8.417486190795898, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8619391918182373, "num_tokens": 722503067.0, "step": 18934 }, { "epoch": 2.408726625111309, "ewc_loss": 0.0706537663936615, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035009230487048626, "grad_norm": 8.374375343322754, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8682030439376831, "num_tokens": 722537842.0, "step": 18935 }, { "epoch": 2.4088538353898996, "ewc_loss": 0.07073627412319183, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035091739846393466, "grad_norm": 8.335892677307129, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8711519241333008, "num_tokens": 722574036.0, "step": 18936 }, { "epoch": 2.40898104566849, "ewc_loss": 0.07084358483552933, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003519905440043658, "grad_norm": 8.395092964172363, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8658313155174255, "num_tokens": 722606087.0, "step": 18937 }, { "epoch": 2.4091082559470807, "ewc_loss": 0.07071519643068314, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035070665762759745, "grad_norm": 8.334071159362793, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.864446759223938, "num_tokens": 722648745.0, "step": 18938 }, { "epoch": 2.409235466225671, "ewc_loss": 0.0709104984998703, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035265967017039657, "grad_norm": 8.378011703491211, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8571924567222595, "num_tokens": 722688589.0, "step": 18939 }, { "epoch": 2.4093626765042617, "ewc_loss": 0.0706787258386612, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003503419575281441, "grad_norm": 8.271958351135254, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8847018480300903, "num_tokens": 722723941.0, "step": 18940 }, { "epoch": 2.4094898867828523, "ewc_loss": 0.07094928622245789, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035304759512655437, "grad_norm": 8.38768482208252, "learning_rate": 1e-06, "loss": 0.5119, "mean_token_accuracy": 0.8498413562774658, "num_tokens": 722768242.0, "step": 18941 }, { "epoch": 2.4096170970614423, "ewc_loss": 0.07079999148845673, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003515546559356153, "grad_norm": 8.342329978942871, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8660018444061279, "num_tokens": 722800970.0, "step": 18942 }, { "epoch": 2.4097443073400333, "ewc_loss": 0.0710127204656601, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003536819131113589, "grad_norm": 8.400785446166992, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8557161092758179, "num_tokens": 722839168.0, "step": 18943 }, { "epoch": 2.4098715176186234, "ewc_loss": 0.07072031497955322, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003507578221615404, "grad_norm": 8.285118103027344, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8771455883979797, "num_tokens": 722883451.0, "step": 18944 }, { "epoch": 2.409998727897214, "ewc_loss": 0.07100959122180939, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000353650568285957, "grad_norm": 8.332626342773438, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8665527701377869, "num_tokens": 722928581.0, "step": 18945 }, { "epoch": 2.4101259381758044, "ewc_loss": 0.07094627618789673, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035301747266203165, "grad_norm": 8.340405464172363, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8770612478256226, "num_tokens": 722967141.0, "step": 18946 }, { "epoch": 2.410253148454395, "ewc_loss": 0.07099150121212006, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003534697461873293, "grad_norm": 8.326312065124512, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.861139714717865, "num_tokens": 723006263.0, "step": 18947 }, { "epoch": 2.4103803587329855, "ewc_loss": 0.07093483209609985, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035290297819301486, "grad_norm": 8.304391860961914, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8690014481544495, "num_tokens": 723050843.0, "step": 18948 }, { "epoch": 2.410507569011576, "ewc_loss": 0.07113830745220184, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003549377724993974, "grad_norm": 8.430691719055176, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8660718202590942, "num_tokens": 723092124.0, "step": 18949 }, { "epoch": 2.4106347792901666, "ewc_loss": 0.07080782204866409, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035163291613571346, "grad_norm": 8.32657527923584, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8765778541564941, "num_tokens": 723122478.0, "step": 18950 }, { "epoch": 2.410761989568757, "ewc_loss": 0.07106579095125198, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035421259235590696, "grad_norm": 8.362395286560059, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8599238991737366, "num_tokens": 723163054.0, "step": 18951 }, { "epoch": 2.4108891998473476, "ewc_loss": 0.07100831717252731, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003536378499120474, "grad_norm": 8.368062019348145, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8667309284210205, "num_tokens": 723201518.0, "step": 18952 }, { "epoch": 2.411016410125938, "ewc_loss": 0.07092617452144623, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035281648160889745, "grad_norm": 8.35426139831543, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8664414286613464, "num_tokens": 723235685.0, "step": 18953 }, { "epoch": 2.4111436204045287, "ewc_loss": 0.07103525102138519, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000353907176759094, "grad_norm": 8.359347343444824, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8645283579826355, "num_tokens": 723266510.0, "step": 18954 }, { "epoch": 2.411270830683119, "ewc_loss": 0.07096375524997711, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035319224116392434, "grad_norm": 8.382000923156738, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8693338632583618, "num_tokens": 723302800.0, "step": 18955 }, { "epoch": 2.4113980409617097, "ewc_loss": 0.07094570994377136, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003530117392074317, "grad_norm": 8.373641014099121, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8678016662597656, "num_tokens": 723340669.0, "step": 18956 }, { "epoch": 2.4115252512403003, "ewc_loss": 0.07095976173877716, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035315233981236815, "grad_norm": 8.32773208618164, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8694911003112793, "num_tokens": 723377556.0, "step": 18957 }, { "epoch": 2.411652461518891, "ewc_loss": 0.0709478110074997, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003530328394845128, "grad_norm": 8.39184284210205, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8704249858856201, "num_tokens": 723410942.0, "step": 18958 }, { "epoch": 2.4117796717974813, "ewc_loss": 0.07096514850854874, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003532061818987131, "grad_norm": 8.368765830993652, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8769782185554504, "num_tokens": 723451131.0, "step": 18959 }, { "epoch": 2.411906882076072, "ewc_loss": 0.07093045115470886, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000352859147824347, "grad_norm": 8.359475135803223, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8682389259338379, "num_tokens": 723485876.0, "step": 18960 }, { "epoch": 2.4120340923546624, "ewc_loss": 0.07100483030080795, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003536029835231602, "grad_norm": 8.42518424987793, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8726609945297241, "num_tokens": 723521252.0, "step": 18961 }, { "epoch": 2.412161302633253, "ewc_loss": 0.07083286345005035, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035188329638913274, "grad_norm": 8.339340209960938, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.850141167640686, "num_tokens": 723561094.0, "step": 18962 }, { "epoch": 2.4122885129118434, "ewc_loss": 0.07108299434185028, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035438459599390626, "grad_norm": 8.384574890136719, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8647518157958984, "num_tokens": 723597281.0, "step": 18963 }, { "epoch": 2.412415723190434, "ewc_loss": 0.0708794891834259, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035234956885688007, "grad_norm": 8.36756706237793, "learning_rate": 1e-06, "loss": 0.3859, "mean_token_accuracy": 0.8889114856719971, "num_tokens": 723637690.0, "step": 18964 }, { "epoch": 2.4125429334690245, "ewc_loss": 0.07102642953395844, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035381902125664055, "grad_norm": 8.437774658203125, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8657119274139404, "num_tokens": 723676250.0, "step": 18965 }, { "epoch": 2.412670143747615, "ewc_loss": 0.07084096968173981, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003519644378684461, "grad_norm": 8.302740097045898, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8732762932777405, "num_tokens": 723713157.0, "step": 18966 }, { "epoch": 2.412797354026205, "ewc_loss": 0.07110191881656647, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003545739164110273, "grad_norm": 8.466747283935547, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8706390857696533, "num_tokens": 723745877.0, "step": 18967 }, { "epoch": 2.412924564304796, "ewc_loss": 0.0706760585308075, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035031529841944575, "grad_norm": 8.296232223510742, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8669987916946411, "num_tokens": 723784285.0, "step": 18968 }, { "epoch": 2.413051774583386, "ewc_loss": 0.07113493978977203, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003549041284713894, "grad_norm": 8.406614303588867, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8554589748382568, "num_tokens": 723817085.0, "step": 18969 }, { "epoch": 2.4131789848619767, "ewc_loss": 0.0708031877875328, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035158658283762634, "grad_norm": 8.375741958618164, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8613306879997253, "num_tokens": 723858466.0, "step": 18970 }, { "epoch": 2.413306195140567, "ewc_loss": 0.07089877128601074, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003525424108374864, "grad_norm": 8.33837604522705, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8653578758239746, "num_tokens": 723896627.0, "step": 18971 }, { "epoch": 2.4134334054191577, "ewc_loss": 0.07094686478376389, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035302332253195345, "grad_norm": 8.38156509399414, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8615021705627441, "num_tokens": 723930727.0, "step": 18972 }, { "epoch": 2.4135606156977483, "ewc_loss": 0.07090189307928085, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000352573610143736, "grad_norm": 8.365551948547363, "learning_rate": 1e-06, "loss": 0.5003, "mean_token_accuracy": 0.8544942736625671, "num_tokens": 723965996.0, "step": 18973 }, { "epoch": 2.413687825976339, "ewc_loss": 0.07097315788269043, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035328621743246913, "grad_norm": 8.39400577545166, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8601582646369934, "num_tokens": 724002441.0, "step": 18974 }, { "epoch": 2.4138150362549293, "ewc_loss": 0.07077884674072266, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003513431001920253, "grad_norm": 8.362992286682129, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8705546259880066, "num_tokens": 724039238.0, "step": 18975 }, { "epoch": 2.41394224653352, "ewc_loss": 0.07087737321853638, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003523284394759685, "grad_norm": 8.36674976348877, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8723899126052856, "num_tokens": 724077430.0, "step": 18976 }, { "epoch": 2.4140694568121104, "ewc_loss": 0.0708603486418724, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003521581820677966, "grad_norm": 8.342994689941406, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8647266626358032, "num_tokens": 724113175.0, "step": 18977 }, { "epoch": 2.414196667090701, "ewc_loss": 0.07087424397468567, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003522971528582275, "grad_norm": 8.314667701721191, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.860682487487793, "num_tokens": 724156409.0, "step": 18978 }, { "epoch": 2.4143238773692914, "ewc_loss": 0.07092660665512085, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003528207598719746, "grad_norm": 8.344371795654297, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8663192987442017, "num_tokens": 724199879.0, "step": 18979 }, { "epoch": 2.414451087647882, "ewc_loss": 0.07088854908943176, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003524401399772614, "grad_norm": 8.318883895874023, "learning_rate": 1e-06, "loss": 0.4099, "mean_token_accuracy": 0.8808622360229492, "num_tokens": 724238065.0, "step": 18980 }, { "epoch": 2.4145782979264725, "ewc_loss": 0.07083195447921753, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003518741868901998, "grad_norm": 8.39767074584961, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8580693006515503, "num_tokens": 724272336.0, "step": 18981 }, { "epoch": 2.414705508205063, "ewc_loss": 0.07075421512126923, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035109679447486997, "grad_norm": 8.312165260314941, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.856214165687561, "num_tokens": 724305200.0, "step": 18982 }, { "epoch": 2.4148327184836536, "ewc_loss": 0.07104156911373138, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035397036117501557, "grad_norm": 8.41087532043457, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8710170984268188, "num_tokens": 724341295.0, "step": 18983 }, { "epoch": 2.414959928762244, "ewc_loss": 0.07065198570489883, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035007455153390765, "grad_norm": 8.22988224029541, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8692622184753418, "num_tokens": 724384336.0, "step": 18984 }, { "epoch": 2.4150871390408346, "ewc_loss": 0.0712948739528656, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035650338395498693, "grad_norm": 8.4954195022583, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8627969026565552, "num_tokens": 724417279.0, "step": 18985 }, { "epoch": 2.415214349319425, "ewc_loss": 0.07051514089107513, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003487060603220016, "grad_norm": 8.295743942260742, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8655528426170349, "num_tokens": 724456509.0, "step": 18986 }, { "epoch": 2.4153415595980157, "ewc_loss": 0.07124608755111694, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003560155164450407, "grad_norm": 8.408553123474121, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8771069645881653, "num_tokens": 724498812.0, "step": 18987 }, { "epoch": 2.415468769876606, "ewc_loss": 0.07062245905399323, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003497792349662632, "grad_norm": 8.255053520202637, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8541315793991089, "num_tokens": 724538174.0, "step": 18988 }, { "epoch": 2.4155959801551967, "ewc_loss": 0.07121197879314423, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035567444865591824, "grad_norm": 8.383955001831055, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8712992668151855, "num_tokens": 724577171.0, "step": 18989 }, { "epoch": 2.415723190433787, "ewc_loss": 0.07070578634738922, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035061256494373083, "grad_norm": 8.261887550354004, "learning_rate": 1e-06, "loss": 0.3931, "mean_token_accuracy": 0.8872025012969971, "num_tokens": 724610626.0, "step": 18990 }, { "epoch": 2.4158504007123778, "ewc_loss": 0.0712040588259697, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003555952862370759, "grad_norm": 8.484472274780273, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8775380849838257, "num_tokens": 724647613.0, "step": 18991 }, { "epoch": 2.415977610990968, "ewc_loss": 0.07059819996356964, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034953668364323676, "grad_norm": 8.233891487121582, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8679218292236328, "num_tokens": 724684898.0, "step": 18992 }, { "epoch": 2.4161048212695584, "ewc_loss": 0.07132133841514587, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035676805418916047, "grad_norm": 8.430686950683594, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8638768792152405, "num_tokens": 724725566.0, "step": 18993 }, { "epoch": 2.416232031548149, "ewc_loss": 0.07072402536869049, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003507949586492032, "grad_norm": 8.301944732666016, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8567154407501221, "num_tokens": 724763701.0, "step": 18994 }, { "epoch": 2.4163592418267394, "ewc_loss": 0.07117334008216858, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035528812441043556, "grad_norm": 8.427517890930176, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8638641238212585, "num_tokens": 724798108.0, "step": 18995 }, { "epoch": 2.41648645210533, "ewc_loss": 0.07072940468788147, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003508486843202263, "grad_norm": 8.246270179748535, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8452082276344299, "num_tokens": 724842430.0, "step": 18996 }, { "epoch": 2.4166136623839205, "ewc_loss": 0.07113471627235413, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035490182926878333, "grad_norm": 8.406508445739746, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.861617922782898, "num_tokens": 724884585.0, "step": 18997 }, { "epoch": 2.416740872662511, "ewc_loss": 0.07069843262434006, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035053901956416667, "grad_norm": 8.278525352478027, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8776814341545105, "num_tokens": 724919338.0, "step": 18998 }, { "epoch": 2.4168680829411016, "ewc_loss": 0.07102510333061218, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035380569170229137, "grad_norm": 8.377375602722168, "learning_rate": 1e-06, "loss": 0.3951, "mean_token_accuracy": 0.8884545564651489, "num_tokens": 724956007.0, "step": 18999 }, { "epoch": 2.416995293219692, "ewc_loss": 0.07078120112419128, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003513666451908648, "grad_norm": 8.372353553771973, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8821153044700623, "num_tokens": 724996994.0, "step": 19000 }, { "epoch": 2.4171225034982826, "ewc_loss": 0.0708581805229187, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003521364997141063, "grad_norm": 8.318002700805664, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8594925403594971, "num_tokens": 725035607.0, "step": 19001 }, { "epoch": 2.417249713776873, "ewc_loss": 0.07095614075660706, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003531160473357886, "grad_norm": 8.375550270080566, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8696091175079346, "num_tokens": 725075370.0, "step": 19002 }, { "epoch": 2.4173769240554637, "ewc_loss": 0.07070524245500565, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035060709342360497, "grad_norm": 8.305989265441895, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8651161193847656, "num_tokens": 725119536.0, "step": 19003 }, { "epoch": 2.417504134334054, "ewc_loss": 0.07097473740577698, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003533021081238985, "grad_norm": 8.337442398071289, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8657994270324707, "num_tokens": 725158048.0, "step": 19004 }, { "epoch": 2.4176313446126447, "ewc_loss": 0.07098214328289032, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003533760900609195, "grad_norm": 8.351289749145508, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8756411671638489, "num_tokens": 725196343.0, "step": 19005 }, { "epoch": 2.4177585548912353, "ewc_loss": 0.07095424830913544, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035309712984599173, "grad_norm": 8.362353324890137, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8686800599098206, "num_tokens": 725234613.0, "step": 19006 }, { "epoch": 2.417885765169826, "ewc_loss": 0.07090801745653152, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035263484460301697, "grad_norm": 8.318756103515625, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8640643358230591, "num_tokens": 725272129.0, "step": 19007 }, { "epoch": 2.4180129754484163, "ewc_loss": 0.07106797397136688, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035423439112491906, "grad_norm": 8.399365425109863, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8566357493400574, "num_tokens": 725305740.0, "step": 19008 }, { "epoch": 2.418140185727007, "ewc_loss": 0.07077093422412872, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003512640541885048, "grad_norm": 8.350537300109863, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8623998761177063, "num_tokens": 725340579.0, "step": 19009 }, { "epoch": 2.4182673960055974, "ewc_loss": 0.07100521773099899, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003536068834364414, "grad_norm": 8.317469596862793, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8679245710372925, "num_tokens": 725380492.0, "step": 19010 }, { "epoch": 2.418394606284188, "ewc_loss": 0.07106239348649979, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035417862818576396, "grad_norm": 8.369630813598633, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8664828538894653, "num_tokens": 725418043.0, "step": 19011 }, { "epoch": 2.4185218165627784, "ewc_loss": 0.07082851231098175, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035183984437026083, "grad_norm": 8.345003128051758, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8580124378204346, "num_tokens": 725456260.0, "step": 19012 }, { "epoch": 2.418649026841369, "ewc_loss": 0.07110746204853058, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003546293010003865, "grad_norm": 8.385489463806152, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8627355098724365, "num_tokens": 725496605.0, "step": 19013 }, { "epoch": 2.4187762371199595, "ewc_loss": 0.07092095911502838, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000352764327544719, "grad_norm": 8.344230651855469, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.867229700088501, "num_tokens": 725532395.0, "step": 19014 }, { "epoch": 2.4189034473985496, "ewc_loss": 0.0711066722869873, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003546214138623327, "grad_norm": 8.361577987670898, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8608097434043884, "num_tokens": 725575335.0, "step": 19015 }, { "epoch": 2.4190306576771405, "ewc_loss": 0.07099281251430511, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003534827847033739, "grad_norm": 8.315468788146973, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8726871013641357, "num_tokens": 725612345.0, "step": 19016 }, { "epoch": 2.4191578679557306, "ewc_loss": 0.07103098928928375, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035386462695896626, "grad_norm": 8.423646926879883, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8697279691696167, "num_tokens": 725645566.0, "step": 19017 }, { "epoch": 2.419285078234321, "ewc_loss": 0.07090197503566742, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035257445415481925, "grad_norm": 8.30676555633545, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8601875305175781, "num_tokens": 725684218.0, "step": 19018 }, { "epoch": 2.4194122885129117, "ewc_loss": 0.071055568754673, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003541103797033429, "grad_norm": 8.330093383789062, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8713232278823853, "num_tokens": 725718478.0, "step": 19019 }, { "epoch": 2.419539498791502, "ewc_loss": 0.07091826945543289, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035273737739771605, "grad_norm": 8.33777141571045, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8598361015319824, "num_tokens": 725756208.0, "step": 19020 }, { "epoch": 2.4196667090700927, "ewc_loss": 0.07094965130090714, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000353051204001531, "grad_norm": 8.297140121459961, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8605713844299316, "num_tokens": 725791884.0, "step": 19021 }, { "epoch": 2.4197939193486833, "ewc_loss": 0.07090528309345245, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035260754521004856, "grad_norm": 8.289169311523438, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8699442148208618, "num_tokens": 725829138.0, "step": 19022 }, { "epoch": 2.419921129627274, "ewc_loss": 0.07102788239717484, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000353833514964208, "grad_norm": 8.34290885925293, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8668338656425476, "num_tokens": 725868301.0, "step": 19023 }, { "epoch": 2.4200483399058643, "ewc_loss": 0.0709945559501648, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003535002178978175, "grad_norm": 8.260634422302246, "learning_rate": 1e-06, "loss": 0.5043, "mean_token_accuracy": 0.8542742133140564, "num_tokens": 725906186.0, "step": 19024 }, { "epoch": 2.420175550184455, "ewc_loss": 0.07124266028404236, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035598126123659313, "grad_norm": 8.38521671295166, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8547564148902893, "num_tokens": 725947006.0, "step": 19025 }, { "epoch": 2.4203027604630454, "ewc_loss": 0.07084567099809647, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003520113823469728, "grad_norm": 8.271985054016113, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8692158460617065, "num_tokens": 725987363.0, "step": 19026 }, { "epoch": 2.420429970741636, "ewc_loss": 0.07140403985977173, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000357595068635419, "grad_norm": 8.448944091796875, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8562959432601929, "num_tokens": 726026176.0, "step": 19027 }, { "epoch": 2.4205571810202264, "ewc_loss": 0.07074803113937378, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003510350070428103, "grad_norm": 8.33545207977295, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8646180033683777, "num_tokens": 726068124.0, "step": 19028 }, { "epoch": 2.420684391298817, "ewc_loss": 0.07120729982852936, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003556276496965438, "grad_norm": 8.430411338806152, "learning_rate": 1e-06, "loss": 0.4807, "mean_token_accuracy": 0.8579808473587036, "num_tokens": 726110270.0, "step": 19029 }, { "epoch": 2.4208116015774075, "ewc_loss": 0.07084473967552185, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003520020982250571, "grad_norm": 8.34337329864502, "learning_rate": 1e-06, "loss": 0.4065, "mean_token_accuracy": 0.8812292814254761, "num_tokens": 726143908.0, "step": 19030 }, { "epoch": 2.420938811855998, "ewc_loss": 0.07108058035373688, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035436052712611854, "grad_norm": 8.362342834472656, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8642420768737793, "num_tokens": 726179082.0, "step": 19031 }, { "epoch": 2.4210660221345885, "ewc_loss": 0.07078848779201508, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035143960849381983, "grad_norm": 8.31142807006836, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8586915135383606, "num_tokens": 726216387.0, "step": 19032 }, { "epoch": 2.421193232413179, "ewc_loss": 0.07105489075183868, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003541035985108465, "grad_norm": 8.397184371948242, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8756930232048035, "num_tokens": 726258831.0, "step": 19033 }, { "epoch": 2.4213204426917696, "ewc_loss": 0.07079504430294037, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003515050921123475, "grad_norm": 8.290547370910645, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8673547506332397, "num_tokens": 726301929.0, "step": 19034 }, { "epoch": 2.42144765297036, "ewc_loss": 0.07105359435081482, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003540906764101237, "grad_norm": 8.393328666687012, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8689124584197998, "num_tokens": 726339938.0, "step": 19035 }, { "epoch": 2.4215748632489507, "ewc_loss": 0.07085447758436203, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035209947964176536, "grad_norm": 8.31765365600586, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8685605525970459, "num_tokens": 726377197.0, "step": 19036 }, { "epoch": 2.421702073527541, "ewc_loss": 0.071121945977211, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003547741216607392, "grad_norm": 8.362035751342773, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8693628311157227, "num_tokens": 726417591.0, "step": 19037 }, { "epoch": 2.4218292838061317, "ewc_loss": 0.07089313864707947, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035248606582172215, "grad_norm": 8.32838249206543, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8560835719108582, "num_tokens": 726456094.0, "step": 19038 }, { "epoch": 2.4219564940847222, "ewc_loss": 0.0710526555776596, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003540811885613948, "grad_norm": 8.410665512084961, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8748410940170288, "num_tokens": 726491158.0, "step": 19039 }, { "epoch": 2.4220837043633123, "ewc_loss": 0.07083313167095184, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035188603214919567, "grad_norm": 8.326578140258789, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8515735268592834, "num_tokens": 726531334.0, "step": 19040 }, { "epoch": 2.4222109146419033, "ewc_loss": 0.07105717062950134, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035412644501775503, "grad_norm": 8.377445220947266, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8581688404083252, "num_tokens": 726571999.0, "step": 19041 }, { "epoch": 2.4223381249204934, "ewc_loss": 0.07089413702487946, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035249607753939927, "grad_norm": 8.470869064331055, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8563658595085144, "num_tokens": 726604945.0, "step": 19042 }, { "epoch": 2.422465335199084, "ewc_loss": 0.07073130458593369, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000350867718225345, "grad_norm": 8.328937530517578, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8585847616195679, "num_tokens": 726645023.0, "step": 19043 }, { "epoch": 2.4225925454776744, "ewc_loss": 0.07094146311283112, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003529693349264562, "grad_norm": 8.422355651855469, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8662785291671753, "num_tokens": 726684297.0, "step": 19044 }, { "epoch": 2.422719755756265, "ewc_loss": 0.07061751186847687, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034972981666214764, "grad_norm": 8.260555267333984, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8559301495552063, "num_tokens": 726726950.0, "step": 19045 }, { "epoch": 2.4228469660348555, "ewc_loss": 0.0711212307214737, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003547669912222773, "grad_norm": 8.447370529174805, "learning_rate": 1e-06, "loss": 0.5297, "mean_token_accuracy": 0.8471279144287109, "num_tokens": 726761944.0, "step": 19046 }, { "epoch": 2.422974176313446, "ewc_loss": 0.07065735757350922, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035012830630876124, "grad_norm": 8.286595344543457, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8756766319274902, "num_tokens": 726799041.0, "step": 19047 }, { "epoch": 2.4231013865920366, "ewc_loss": 0.07126366347074509, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035619133268482983, "grad_norm": 8.362688064575195, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8604177832603455, "num_tokens": 726838160.0, "step": 19048 }, { "epoch": 2.423228596870627, "ewc_loss": 0.07079684734344482, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035152319469489157, "grad_norm": 8.345943450927734, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8760718107223511, "num_tokens": 726873150.0, "step": 19049 }, { "epoch": 2.4233558071492176, "ewc_loss": 0.07097098976373672, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003532645932864398, "grad_norm": 8.358499526977539, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8705853223800659, "num_tokens": 726905253.0, "step": 19050 }, { "epoch": 2.423483017427808, "ewc_loss": 0.07093189656734467, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003528736997395754, "grad_norm": 8.31257438659668, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8777782917022705, "num_tokens": 726942842.0, "step": 19051 }, { "epoch": 2.4236102277063987, "ewc_loss": 0.07098737359046936, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003534283896442503, "grad_norm": 8.28626537322998, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8705046772956848, "num_tokens": 726987974.0, "step": 19052 }, { "epoch": 2.423737437984989, "ewc_loss": 0.07104472070932388, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003540018806234002, "grad_norm": 8.349752426147461, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8846139907836914, "num_tokens": 727021614.0, "step": 19053 }, { "epoch": 2.4238646482635797, "ewc_loss": 0.0710274800658226, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003538294695317745, "grad_norm": 8.30257511138916, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8638847470283508, "num_tokens": 727061773.0, "step": 19054 }, { "epoch": 2.4239918585421703, "ewc_loss": 0.07106325030326843, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003541871556080878, "grad_norm": 8.358932495117188, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8590902090072632, "num_tokens": 727095877.0, "step": 19055 }, { "epoch": 2.424119068820761, "ewc_loss": 0.07099485397338867, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003535031864885241, "grad_norm": 8.361934661865234, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8695502281188965, "num_tokens": 727128676.0, "step": 19056 }, { "epoch": 2.4242462790993513, "ewc_loss": 0.07120902091264725, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003556448791641742, "grad_norm": 8.37739086151123, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8748430609703064, "num_tokens": 727164528.0, "step": 19057 }, { "epoch": 2.424373489377942, "ewc_loss": 0.07109847664833069, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003545394283719361, "grad_norm": 8.317852020263672, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.872775673866272, "num_tokens": 727202403.0, "step": 19058 }, { "epoch": 2.4245006996565324, "ewc_loss": 0.071128249168396, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000354837131453678, "grad_norm": 8.349879264831543, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8683111667633057, "num_tokens": 727242231.0, "step": 19059 }, { "epoch": 2.424627909935123, "ewc_loss": 0.0711112767457962, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035466745612211525, "grad_norm": 8.392608642578125, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8516216278076172, "num_tokens": 727276371.0, "step": 19060 }, { "epoch": 2.4247551202137134, "ewc_loss": 0.0711309164762497, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003548638487700373, "grad_norm": 8.35626220703125, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8690849542617798, "num_tokens": 727320101.0, "step": 19061 }, { "epoch": 2.424882330492304, "ewc_loss": 0.07107958197593689, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003543505445122719, "grad_norm": 8.350751876831055, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8573743104934692, "num_tokens": 727363687.0, "step": 19062 }, { "epoch": 2.4250095407708945, "ewc_loss": 0.07102210074663162, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003537756856530905, "grad_norm": 8.359999656677246, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8714935779571533, "num_tokens": 727399772.0, "step": 19063 }, { "epoch": 2.425136751049485, "ewc_loss": 0.0711330771446228, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035488547291606665, "grad_norm": 8.404895782470703, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8750684261322021, "num_tokens": 727442944.0, "step": 19064 }, { "epoch": 2.425263961328075, "ewc_loss": 0.07103043049573898, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003538590099196881, "grad_norm": 8.345260620117188, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8749178647994995, "num_tokens": 727483283.0, "step": 19065 }, { "epoch": 2.425391171606666, "ewc_loss": 0.07113470137119293, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003549016546458006, "grad_norm": 8.343805313110352, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8645533919334412, "num_tokens": 727526316.0, "step": 19066 }, { "epoch": 2.425518381885256, "ewc_loss": 0.07104812562465668, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035403590300120413, "grad_norm": 8.374220848083496, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8734202980995178, "num_tokens": 727563218.0, "step": 19067 }, { "epoch": 2.4256455921638467, "ewc_loss": 0.07111059129238129, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003546606167219579, "grad_norm": 8.378886222839355, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8787329196929932, "num_tokens": 727594877.0, "step": 19068 }, { "epoch": 2.425772802442437, "ewc_loss": 0.07104317843914032, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003539864846970886, "grad_norm": 8.387330055236816, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8790165185928345, "num_tokens": 727627561.0, "step": 19069 }, { "epoch": 2.4259000127210277, "ewc_loss": 0.07102631032466888, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003538177697919309, "grad_norm": 8.377214431762695, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8586151003837585, "num_tokens": 727664384.0, "step": 19070 }, { "epoch": 2.4260272229996183, "ewc_loss": 0.07104799151420593, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003540345933288336, "grad_norm": 8.345723152160645, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.862868070602417, "num_tokens": 727700866.0, "step": 19071 }, { "epoch": 2.426154433278209, "ewc_loss": 0.07096202671527863, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003531749825924635, "grad_norm": 8.326536178588867, "learning_rate": 1e-06, "loss": 0.41, "mean_token_accuracy": 0.8846917152404785, "num_tokens": 727741539.0, "step": 19072 }, { "epoch": 2.4262816435567993, "ewc_loss": 0.07108430564403534, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003543976927176118, "grad_norm": 8.467827796936035, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8677100539207458, "num_tokens": 727777828.0, "step": 19073 }, { "epoch": 2.42640885383539, "ewc_loss": 0.07079464197158813, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003515011048875749, "grad_norm": 8.294134140014648, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8753381967544556, "num_tokens": 727818979.0, "step": 19074 }, { "epoch": 2.4265360641139804, "ewc_loss": 0.07116502523422241, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003552049456629902, "grad_norm": 8.500629425048828, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8667217493057251, "num_tokens": 727856440.0, "step": 19075 }, { "epoch": 2.426663274392571, "ewc_loss": 0.07066120207309723, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003501667524687946, "grad_norm": 8.326574325561523, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8766129016876221, "num_tokens": 727895036.0, "step": 19076 }, { "epoch": 2.4267904846711614, "ewc_loss": 0.07117897272109985, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003553444112185389, "grad_norm": 8.465532302856445, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8544667959213257, "num_tokens": 727932292.0, "step": 19077 }, { "epoch": 2.426917694949752, "ewc_loss": 0.07067443430423737, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035029902937822044, "grad_norm": 8.341771125793457, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8675216436386108, "num_tokens": 727967005.0, "step": 19078 }, { "epoch": 2.4270449052283425, "ewc_loss": 0.0711469054222107, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035502371611073613, "grad_norm": 8.424989700317383, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8564305901527405, "num_tokens": 728004262.0, "step": 19079 }, { "epoch": 2.427172115506933, "ewc_loss": 0.07085929811000824, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003521476173773408, "grad_norm": 8.329282760620117, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8753383159637451, "num_tokens": 728041451.0, "step": 19080 }, { "epoch": 2.4272993257855235, "ewc_loss": 0.07109306007623672, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035448529524728656, "grad_norm": 8.426998138427734, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8646067380905151, "num_tokens": 728079217.0, "step": 19081 }, { "epoch": 2.427426536064114, "ewc_loss": 0.07067714631557465, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035032618325203657, "grad_norm": 8.292401313781738, "learning_rate": 1e-06, "loss": 0.3948, "mean_token_accuracy": 0.8832122087478638, "num_tokens": 728116221.0, "step": 19082 }, { "epoch": 2.4275537463427046, "ewc_loss": 0.07115818560123444, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003551365516614169, "grad_norm": 8.379056930541992, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8564469814300537, "num_tokens": 728154861.0, "step": 19083 }, { "epoch": 2.427680956621295, "ewc_loss": 0.07090365886688232, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003525913052726537, "grad_norm": 8.395413398742676, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8557504415512085, "num_tokens": 728188276.0, "step": 19084 }, { "epoch": 2.4278081668998857, "ewc_loss": 0.07107585668563843, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035431323340162635, "grad_norm": 8.390003204345703, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8776142597198486, "num_tokens": 728229313.0, "step": 19085 }, { "epoch": 2.427935377178476, "ewc_loss": 0.07100258767604828, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035358057357370853, "grad_norm": 8.33395767211914, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.8562068939208984, "num_tokens": 728268706.0, "step": 19086 }, { "epoch": 2.4280625874570667, "ewc_loss": 0.07107585668563843, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035431323340162635, "grad_norm": 8.411355018615723, "learning_rate": 1e-06, "loss": 0.4095, "mean_token_accuracy": 0.879668116569519, "num_tokens": 728297490.0, "step": 19087 }, { "epoch": 2.428189797735657, "ewc_loss": 0.07094520330429077, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035300676245242357, "grad_norm": 8.342179298400879, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8601394891738892, "num_tokens": 728336041.0, "step": 19088 }, { "epoch": 2.4283170080142478, "ewc_loss": 0.07122145593166351, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035576929803937674, "grad_norm": 8.407700538635254, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8740028738975525, "num_tokens": 728372701.0, "step": 19089 }, { "epoch": 2.428444218292838, "ewc_loss": 0.07103854417800903, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035394018050283194, "grad_norm": 11.123263359069824, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8703954815864563, "num_tokens": 728408138.0, "step": 19090 }, { "epoch": 2.4285714285714284, "ewc_loss": 0.07223477214574814, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036590240779332817, "grad_norm": 8.263267517089844, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8725028038024902, "num_tokens": 728452649.0, "step": 19091 }, { "epoch": 2.428698638850019, "ewc_loss": 0.07373351603746414, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003808898327406496, "grad_norm": 8.923030853271484, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.856884777545929, "num_tokens": 728490551.0, "step": 19092 }, { "epoch": 2.4288258491286094, "ewc_loss": 0.07057230919599533, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034927777596749365, "grad_norm": 8.215928077697754, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8715685606002808, "num_tokens": 728529558.0, "step": 19093 }, { "epoch": 2.4289530594072, "ewc_loss": 0.07391761243343353, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00038273082464002073, "grad_norm": 8.863045692443848, "learning_rate": 1e-06, "loss": 0.5395, "mean_token_accuracy": 0.8429487347602844, "num_tokens": 728573495.0, "step": 19094 }, { "epoch": 2.4290802696857905, "ewc_loss": 0.07121877372264862, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035574243520386517, "grad_norm": 8.407637596130371, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8665604591369629, "num_tokens": 728609166.0, "step": 19095 }, { "epoch": 2.429207479964381, "ewc_loss": 0.07283259183168411, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003718805965036154, "grad_norm": 8.684768676757812, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8748279213905334, "num_tokens": 728653070.0, "step": 19096 }, { "epoch": 2.4293346902429716, "ewc_loss": 0.07140138000249863, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.000357568496838212, "grad_norm": 8.434730529785156, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8689977526664734, "num_tokens": 728689142.0, "step": 19097 }, { "epoch": 2.429461900521562, "ewc_loss": 0.07209606468677521, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003645152901299298, "grad_norm": 8.613231658935547, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8709376454353333, "num_tokens": 728725615.0, "step": 19098 }, { "epoch": 2.4295891108001526, "ewc_loss": 0.07149398326873779, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035849452251568437, "grad_norm": 8.523088455200195, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8567067384719849, "num_tokens": 728762366.0, "step": 19099 }, { "epoch": 2.429716321078743, "ewc_loss": 0.07159747183322906, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035952942562289536, "grad_norm": 8.460820198059082, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8558661937713623, "num_tokens": 728801951.0, "step": 19100 }, { "epoch": 2.4298435313573337, "ewc_loss": 0.0713510662317276, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000357065349817276, "grad_norm": 8.48376750946045, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8752962350845337, "num_tokens": 728836403.0, "step": 19101 }, { "epoch": 2.429970741635924, "ewc_loss": 0.07126296311616898, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035618431866168976, "grad_norm": 8.417235374450684, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8734248876571655, "num_tokens": 728878485.0, "step": 19102 }, { "epoch": 2.4300979519145147, "ewc_loss": 0.071338951587677, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035694416146725416, "grad_norm": 8.434191703796387, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8762233257293701, "num_tokens": 728915935.0, "step": 19103 }, { "epoch": 2.4302251621931052, "ewc_loss": 0.07105281203985214, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035408278927206993, "grad_norm": 8.373217582702637, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8489701747894287, "num_tokens": 728956847.0, "step": 19104 }, { "epoch": 2.4303523724716958, "ewc_loss": 0.07122422754764557, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003557969175744802, "grad_norm": 8.44780158996582, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8713603615760803, "num_tokens": 728994865.0, "step": 19105 }, { "epoch": 2.4304795827502863, "ewc_loss": 0.07100403308868408, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003535950090736151, "grad_norm": 8.334080696105957, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8587540984153748, "num_tokens": 729033030.0, "step": 19106 }, { "epoch": 2.430606793028877, "ewc_loss": 0.07127834856510162, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035633816150948405, "grad_norm": 8.461346626281738, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8779811263084412, "num_tokens": 729070430.0, "step": 19107 }, { "epoch": 2.4307340033074674, "ewc_loss": 0.07096947729587555, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003532494301907718, "grad_norm": 8.327157020568848, "learning_rate": 1e-06, "loss": 0.4314, "mean_token_accuracy": 0.8773200511932373, "num_tokens": 729102127.0, "step": 19108 }, { "epoch": 2.430861213586058, "ewc_loss": 0.0713503509759903, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003570581611711532, "grad_norm": 8.447945594787598, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8679918050765991, "num_tokens": 729138128.0, "step": 19109 }, { "epoch": 2.4309884238646484, "ewc_loss": 0.07101993262767792, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035375397419556975, "grad_norm": 8.337634086608887, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.863495945930481, "num_tokens": 729179756.0, "step": 19110 }, { "epoch": 2.431115634143239, "ewc_loss": 0.0712975487112999, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003565301885828376, "grad_norm": 8.437247276306152, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8550575971603394, "num_tokens": 729221356.0, "step": 19111 }, { "epoch": 2.4312428444218295, "ewc_loss": 0.07093331962823868, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003528878733050078, "grad_norm": 8.314823150634766, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8694330453872681, "num_tokens": 729260456.0, "step": 19112 }, { "epoch": 2.4313700547004196, "ewc_loss": 0.07144109159708023, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035796561860479414, "grad_norm": 8.447357177734375, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8553245663642883, "num_tokens": 729299654.0, "step": 19113 }, { "epoch": 2.4314972649790105, "ewc_loss": 0.07088513672351837, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035240603028796613, "grad_norm": 8.32994270324707, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8695034980773926, "num_tokens": 729341296.0, "step": 19114 }, { "epoch": 2.4316244752576006, "ewc_loss": 0.07133345305919647, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003568891843315214, "grad_norm": 11.118185997009277, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8701725602149963, "num_tokens": 729379632.0, "step": 19115 }, { "epoch": 2.431751685536191, "ewc_loss": 0.07256808876991272, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036923555308021605, "grad_norm": 8.385686874389648, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8688480854034424, "num_tokens": 729423202.0, "step": 19116 }, { "epoch": 2.4318788958147817, "ewc_loss": 0.07347479462623596, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00037830264773219824, "grad_norm": 8.8406343460083, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.8871514797210693, "num_tokens": 729457108.0, "step": 19117 }, { "epoch": 2.432006106093372, "ewc_loss": 0.07089997828006744, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003525545180309564, "grad_norm": 8.384001731872559, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8633200526237488, "num_tokens": 729494065.0, "step": 19118 }, { "epoch": 2.4321333163719627, "ewc_loss": 0.07317134737968445, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00037526816595345736, "grad_norm": 8.777374267578125, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8713385462760925, "num_tokens": 729536621.0, "step": 19119 }, { "epoch": 2.4322605266505533, "ewc_loss": 0.07114583998918533, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003550130932126194, "grad_norm": 8.412152290344238, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.867538571357727, "num_tokens": 729574423.0, "step": 19120 }, { "epoch": 2.432387736929144, "ewc_loss": 0.07229210436344147, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003664756950456649, "grad_norm": 8.673009872436523, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8612779378890991, "num_tokens": 729614673.0, "step": 19121 }, { "epoch": 2.4325149472077343, "ewc_loss": 0.07109039276838303, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035445860703475773, "grad_norm": 8.38080883026123, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8665117025375366, "num_tokens": 729653049.0, "step": 19122 }, { "epoch": 2.432642157486325, "ewc_loss": 0.07201586663722992, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003637133340816945, "grad_norm": 8.597085952758789, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8653489351272583, "num_tokens": 729689667.0, "step": 19123 }, { "epoch": 2.4327693677649154, "ewc_loss": 0.07125842571258545, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035613891668617725, "grad_norm": 8.452202796936035, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8709518909454346, "num_tokens": 729734747.0, "step": 19124 }, { "epoch": 2.432896578043506, "ewc_loss": 0.07161737978458405, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035972846671938896, "grad_norm": 8.534391403198242, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8614904880523682, "num_tokens": 729770135.0, "step": 19125 }, { "epoch": 2.4330237883220964, "ewc_loss": 0.07109494507312775, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035450412542559206, "grad_norm": 8.455601692199707, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.865447461605072, "num_tokens": 729809203.0, "step": 19126 }, { "epoch": 2.433150998600687, "ewc_loss": 0.07126379013061523, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003561926423572004, "grad_norm": 8.413108825683594, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8721716403961182, "num_tokens": 729850164.0, "step": 19127 }, { "epoch": 2.4332782088792775, "ewc_loss": 0.07126428186893463, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035619756090454757, "grad_norm": 8.504729270935059, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.85637366771698, "num_tokens": 729884337.0, "step": 19128 }, { "epoch": 2.433405419157868, "ewc_loss": 0.0709133893251419, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000352688628481701, "grad_norm": 8.455718040466309, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8797205686569214, "num_tokens": 729924098.0, "step": 19129 }, { "epoch": 2.4335326294364585, "ewc_loss": 0.0712289959192276, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003558446478564292, "grad_norm": 8.411199569702148, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8859464526176453, "num_tokens": 729960671.0, "step": 19130 }, { "epoch": 2.433659839715049, "ewc_loss": 0.07104062288999557, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035396090243011713, "grad_norm": 8.407363891601562, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8683040738105774, "num_tokens": 730000780.0, "step": 19131 }, { "epoch": 2.4337870499936396, "ewc_loss": 0.07113691419363022, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003549238317646086, "grad_norm": 8.384395599365234, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8646319508552551, "num_tokens": 730045198.0, "step": 19132 }, { "epoch": 2.43391426027223, "ewc_loss": 0.07108837366104126, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003544383798725903, "grad_norm": 8.391769409179688, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8612436056137085, "num_tokens": 730085010.0, "step": 19133 }, { "epoch": 2.4340414705508207, "ewc_loss": 0.07126195728778839, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035617424873635173, "grad_norm": 8.442145347595215, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8617642521858215, "num_tokens": 730121512.0, "step": 19134 }, { "epoch": 2.434168680829411, "ewc_loss": 0.07110126316547394, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003545673389453441, "grad_norm": 8.349810600280762, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8658033609390259, "num_tokens": 730164859.0, "step": 19135 }, { "epoch": 2.4342958911080017, "ewc_loss": 0.07141018658876419, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003576565650291741, "grad_norm": 8.51177978515625, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8675965666770935, "num_tokens": 730202273.0, "step": 19136 }, { "epoch": 2.4344231013865922, "ewc_loss": 0.07098594307899475, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000353414099663496, "grad_norm": 8.32992935180664, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8590869903564453, "num_tokens": 730248208.0, "step": 19137 }, { "epoch": 2.4345503116651823, "ewc_loss": 0.07137389481067657, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035729361115954816, "grad_norm": 8.437132835388184, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8750110268592834, "num_tokens": 730291084.0, "step": 19138 }, { "epoch": 2.4346775219437733, "ewc_loss": 0.07106003910303116, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035415508318692446, "grad_norm": 8.383056640625, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8714446425437927, "num_tokens": 730334504.0, "step": 19139 }, { "epoch": 2.4348047322223634, "ewc_loss": 0.07138605415821075, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035741523606702685, "grad_norm": 8.411975860595703, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8713358640670776, "num_tokens": 730373356.0, "step": 19140 }, { "epoch": 2.434931942500954, "ewc_loss": 0.07109010964632034, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035445578396320343, "grad_norm": 8.365290641784668, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8527347445487976, "num_tokens": 730416229.0, "step": 19141 }, { "epoch": 2.4350591527795444, "ewc_loss": 0.0714840292930603, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003583949583116919, "grad_norm": 8.46177864074707, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8804492354393005, "num_tokens": 730458922.0, "step": 19142 }, { "epoch": 2.435186363058135, "ewc_loss": 0.07112482935190201, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035480299266055226, "grad_norm": 8.363183975219727, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8656208515167236, "num_tokens": 730494583.0, "step": 19143 }, { "epoch": 2.4353135733367255, "ewc_loss": 0.0715489611029625, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035904429387301207, "grad_norm": 8.41638469696045, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8675788640975952, "num_tokens": 730529526.0, "step": 19144 }, { "epoch": 2.435440783615316, "ewc_loss": 0.07129311561584473, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000356485863449052, "grad_norm": 8.4612398147583, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8702288269996643, "num_tokens": 730558584.0, "step": 19145 }, { "epoch": 2.4355679938939065, "ewc_loss": 0.07125082612037659, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003560629556886852, "grad_norm": 8.37902545928955, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.866609513759613, "num_tokens": 730598677.0, "step": 19146 }, { "epoch": 2.435695204172497, "ewc_loss": 0.0714825987815857, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003583806683309376, "grad_norm": 8.356958389282227, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.875963568687439, "num_tokens": 730636985.0, "step": 19147 }, { "epoch": 2.4358224144510876, "ewc_loss": 0.07138633728027344, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035741805913858116, "grad_norm": 8.393295288085938, "learning_rate": 1e-06, "loss": 0.5139, "mean_token_accuracy": 0.8495904803276062, "num_tokens": 730675409.0, "step": 19148 }, { "epoch": 2.435949624729678, "ewc_loss": 0.07141266763210297, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035768133238889277, "grad_norm": 8.404449462890625, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8547085523605347, "num_tokens": 730714739.0, "step": 19149 }, { "epoch": 2.4360768350082687, "ewc_loss": 0.07155562937259674, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035911097074858844, "grad_norm": 8.384000778198242, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8645761013031006, "num_tokens": 730757059.0, "step": 19150 }, { "epoch": 2.436204045286859, "ewc_loss": 0.07161689549684525, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035972363548353314, "grad_norm": 8.477258682250977, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8649964332580566, "num_tokens": 730793469.0, "step": 19151 }, { "epoch": 2.4363312555654497, "ewc_loss": 0.07136442512273788, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003571989363990724, "grad_norm": 8.345770835876465, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8719170093536377, "num_tokens": 730829336.0, "step": 19152 }, { "epoch": 2.4364584658440402, "ewc_loss": 0.0718681812286377, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003622365475166589, "grad_norm": 8.463183403015137, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8640211820602417, "num_tokens": 730869304.0, "step": 19153 }, { "epoch": 2.4365856761226308, "ewc_loss": 0.07138766348361969, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035743133048526943, "grad_norm": 8.451626777648926, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8553013801574707, "num_tokens": 730905315.0, "step": 19154 }, { "epoch": 2.4367128864012213, "ewc_loss": 0.07170389592647552, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003605936362873763, "grad_norm": 8.397713661193848, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8573513031005859, "num_tokens": 730947346.0, "step": 19155 }, { "epoch": 2.436840096679812, "ewc_loss": 0.07160584628582001, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035961309913545847, "grad_norm": 8.47031307220459, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.870976448059082, "num_tokens": 730976907.0, "step": 19156 }, { "epoch": 2.4369673069584024, "ewc_loss": 0.07137671113014221, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003573217545635998, "grad_norm": 8.40174388885498, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.863226056098938, "num_tokens": 731015940.0, "step": 19157 }, { "epoch": 2.437094517236993, "ewc_loss": 0.07160889357328415, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003596436290536076, "grad_norm": 8.4097900390625, "learning_rate": 1e-06, "loss": 0.3929, "mean_token_accuracy": 0.8845590353012085, "num_tokens": 731051345.0, "step": 19158 }, { "epoch": 2.4372217275155834, "ewc_loss": 0.0713990330696106, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003575450391508639, "grad_norm": 8.441831588745117, "learning_rate": 1e-06, "loss": 0.548, "mean_token_accuracy": 0.8342330455780029, "num_tokens": 731089642.0, "step": 19159 }, { "epoch": 2.437348937794174, "ewc_loss": 0.07133875787258148, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000356942240614444, "grad_norm": 8.385531425476074, "learning_rate": 1e-06, "loss": 0.4082, "mean_token_accuracy": 0.8848041296005249, "num_tokens": 731133196.0, "step": 19160 }, { "epoch": 2.4374761480727645, "ewc_loss": 0.07146459072828293, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003582006029319018, "grad_norm": 8.479728698730469, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8559051752090454, "num_tokens": 731175510.0, "step": 19161 }, { "epoch": 2.437603358351355, "ewc_loss": 0.07127629965543747, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035631770151667297, "grad_norm": 8.430145263671875, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8744903802871704, "num_tokens": 731217526.0, "step": 19162 }, { "epoch": 2.437730568629945, "ewc_loss": 0.07148484885692596, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003584031655918807, "grad_norm": 8.411508560180664, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8556996583938599, "num_tokens": 731259525.0, "step": 19163 }, { "epoch": 2.437857778908536, "ewc_loss": 0.0713815689086914, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003573704161681235, "grad_norm": 8.449226379394531, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8718479871749878, "num_tokens": 731301480.0, "step": 19164 }, { "epoch": 2.437984989187126, "ewc_loss": 0.07127586007118225, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003563132486306131, "grad_norm": 8.392068862915039, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8752197027206421, "num_tokens": 731347385.0, "step": 19165 }, { "epoch": 2.4381121994657167, "ewc_loss": 0.07150548696517944, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035860956995747983, "grad_norm": 8.46148681640625, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8704825639724731, "num_tokens": 731383666.0, "step": 19166 }, { "epoch": 2.438239409744307, "ewc_loss": 0.07127855718135834, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035634025698527694, "grad_norm": 8.432425498962402, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8672749400138855, "num_tokens": 731420297.0, "step": 19167 }, { "epoch": 2.4383666200228977, "ewc_loss": 0.0714317336678505, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003578720206860453, "grad_norm": 8.436814308166504, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8617713451385498, "num_tokens": 731455083.0, "step": 19168 }, { "epoch": 2.4384938303014883, "ewc_loss": 0.07135835289955139, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035713822580873966, "grad_norm": 8.470011711120605, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8752932548522949, "num_tokens": 731482555.0, "step": 19169 }, { "epoch": 2.438621040580079, "ewc_loss": 0.07125046849250793, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003560593177098781, "grad_norm": 8.44161319732666, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8783648014068604, "num_tokens": 731519459.0, "step": 19170 }, { "epoch": 2.4387482508586693, "ewc_loss": 0.07143040001392365, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003578586911316961, "grad_norm": 8.392087936401367, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8709850907325745, "num_tokens": 731554577.0, "step": 19171 }, { "epoch": 2.43887546113726, "ewc_loss": 0.07148152589797974, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035836995812132955, "grad_norm": 8.519932746887207, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8581352233886719, "num_tokens": 731594993.0, "step": 19172 }, { "epoch": 2.4390026714158504, "ewc_loss": 0.07119515538215637, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035550628672353923, "grad_norm": 8.447335243225098, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8712253570556641, "num_tokens": 731629182.0, "step": 19173 }, { "epoch": 2.439129881694441, "ewc_loss": 0.07146291434764862, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003581838100217283, "grad_norm": 8.475308418273926, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8647364377975464, "num_tokens": 731667661.0, "step": 19174 }, { "epoch": 2.4392570919730314, "ewc_loss": 0.07125246524810791, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003560793702490628, "grad_norm": 8.410353660583496, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8675317764282227, "num_tokens": 731704911.0, "step": 19175 }, { "epoch": 2.439384302251622, "ewc_loss": 0.07130394876003265, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000356594129698351, "grad_norm": 8.445682525634766, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8485747575759888, "num_tokens": 731744990.0, "step": 19176 }, { "epoch": 2.4395115125302125, "ewc_loss": 0.07136541604995728, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035720880259759724, "grad_norm": 8.409364700317383, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8690917491912842, "num_tokens": 731783335.0, "step": 19177 }, { "epoch": 2.439638722808803, "ewc_loss": 0.07145172357559204, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035807196400128305, "grad_norm": 8.485148429870605, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.869020402431488, "num_tokens": 731819206.0, "step": 19178 }, { "epoch": 2.4397659330873935, "ewc_loss": 0.07112918794155121, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035484653199091554, "grad_norm": 8.401812553405762, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8641101121902466, "num_tokens": 731855291.0, "step": 19179 }, { "epoch": 2.439893143365984, "ewc_loss": 0.07147158682346344, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003582705103326589, "grad_norm": 8.437479972839355, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8702377080917358, "num_tokens": 731899392.0, "step": 19180 }, { "epoch": 2.4400203536445746, "ewc_loss": 0.07118906080722809, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035544525599107146, "grad_norm": 8.407194137573242, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8669604063034058, "num_tokens": 731937502.0, "step": 19181 }, { "epoch": 2.440147563923165, "ewc_loss": 0.07128989696502686, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003564537037163973, "grad_norm": 8.384382247924805, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8701561689376831, "num_tokens": 731974946.0, "step": 19182 }, { "epoch": 2.4402747742017556, "ewc_loss": 0.07131217420101166, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003566764644347131, "grad_norm": 8.431771278381348, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8545734882354736, "num_tokens": 732018732.0, "step": 19183 }, { "epoch": 2.440401984480346, "ewc_loss": 0.07130173593759537, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035657203989103436, "grad_norm": 8.418242454528809, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8653947114944458, "num_tokens": 732052036.0, "step": 19184 }, { "epoch": 2.4405291947589367, "ewc_loss": 0.0713030993938446, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035658571869134903, "grad_norm": 8.403230667114258, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8626991510391235, "num_tokens": 732090944.0, "step": 19185 }, { "epoch": 2.440656405037527, "ewc_loss": 0.07135511934757233, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003571058623492718, "grad_norm": 8.425362586975098, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8682726621627808, "num_tokens": 732131406.0, "step": 19186 }, { "epoch": 2.4407836153161178, "ewc_loss": 0.071144238114357, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003549970278982073, "grad_norm": 8.376471519470215, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.86597740650177, "num_tokens": 732171701.0, "step": 19187 }, { "epoch": 2.440910825594708, "ewc_loss": 0.07132451236248016, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003567998355720192, "grad_norm": 8.444828033447266, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8754976987838745, "num_tokens": 732209658.0, "step": 19188 }, { "epoch": 2.4410380358732984, "ewc_loss": 0.07109272480010986, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003544819192029536, "grad_norm": 8.412735939025879, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8551502823829651, "num_tokens": 732251395.0, "step": 19189 }, { "epoch": 2.441165246151889, "ewc_loss": 0.07122404128313065, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035579511313699186, "grad_norm": 8.429712295532227, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8562594652175903, "num_tokens": 732289306.0, "step": 19190 }, { "epoch": 2.4412924564304794, "ewc_loss": 0.0711342990398407, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000354897667421028, "grad_norm": 8.351258277893066, "learning_rate": 1e-06, "loss": 0.4974, "mean_token_accuracy": 0.8516559600830078, "num_tokens": 732337916.0, "step": 19191 }, { "epoch": 2.44141966670907, "ewc_loss": 0.07129454612731934, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003565001825336367, "grad_norm": 8.430830001831055, "learning_rate": 1e-06, "loss": 0.51, "mean_token_accuracy": 0.8535240888595581, "num_tokens": 732380358.0, "step": 19192 }, { "epoch": 2.4415468769876605, "ewc_loss": 0.0712134838104248, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035568952444009483, "grad_norm": 8.388866424560547, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8765343427658081, "num_tokens": 732416534.0, "step": 19193 }, { "epoch": 2.441674087266251, "ewc_loss": 0.07126915454864502, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035624622250907123, "grad_norm": 8.422367095947266, "learning_rate": 1e-06, "loss": 0.5208, "mean_token_accuracy": 0.8457193374633789, "num_tokens": 732458927.0, "step": 19194 }, { "epoch": 2.4418012975448415, "ewc_loss": 0.07119139283895493, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035546859726309776, "grad_norm": 8.369041442871094, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.859175443649292, "num_tokens": 732501261.0, "step": 19195 }, { "epoch": 2.441928507823432, "ewc_loss": 0.07129689306020737, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035652361111715436, "grad_norm": 8.437752723693848, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8743964433670044, "num_tokens": 732541162.0, "step": 19196 }, { "epoch": 2.4420557181020226, "ewc_loss": 0.07121331989765167, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003556878655217588, "grad_norm": 8.474912643432617, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8638895750045776, "num_tokens": 732578237.0, "step": 19197 }, { "epoch": 2.442182928380613, "ewc_loss": 0.07123830914497375, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003559377510100603, "grad_norm": 8.40595817565918, "learning_rate": 1e-06, "loss": 0.4905, "mean_token_accuracy": 0.8556860089302063, "num_tokens": 732618624.0, "step": 19198 }, { "epoch": 2.4423101386592037, "ewc_loss": 0.0712594985961914, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003561497142072767, "grad_norm": 8.448038101196289, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8634127378463745, "num_tokens": 732661747.0, "step": 19199 }, { "epoch": 2.442437348937794, "ewc_loss": 0.07121254503726959, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003556801821105182, "grad_norm": 8.481755256652832, "learning_rate": 1e-06, "loss": 0.5235, "mean_token_accuracy": 0.8451125025749207, "num_tokens": 732703511.0, "step": 19200 }, { "epoch": 2.4425645592163847, "ewc_loss": 0.07109871506690979, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035454181488603354, "grad_norm": 8.418499946594238, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8462503552436829, "num_tokens": 732746749.0, "step": 19201 }, { "epoch": 2.4426917694949752, "ewc_loss": 0.0713617205619812, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035717192804440856, "grad_norm": 8.497756004333496, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8667810559272766, "num_tokens": 732783184.0, "step": 19202 }, { "epoch": 2.4428189797735658, "ewc_loss": 0.07108844071626663, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003544391074683517, "grad_norm": 8.460439682006836, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8595969080924988, "num_tokens": 732818664.0, "step": 19203 }, { "epoch": 2.4429461900521563, "ewc_loss": 0.07132314145565033, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003567861276678741, "grad_norm": 8.470941543579102, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8574089407920837, "num_tokens": 732854286.0, "step": 19204 }, { "epoch": 2.443073400330747, "ewc_loss": 0.07103025913238525, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035385729279369116, "grad_norm": 8.448296546936035, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8721314072608948, "num_tokens": 732893032.0, "step": 19205 }, { "epoch": 2.4432006106093374, "ewc_loss": 0.07129989564418793, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035655367537401617, "grad_norm": 8.514816284179688, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8653554916381836, "num_tokens": 732929942.0, "step": 19206 }, { "epoch": 2.443327820887928, "ewc_loss": 0.07111085951328278, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035466323606669903, "grad_norm": 8.436203956604004, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8608362674713135, "num_tokens": 732968218.0, "step": 19207 }, { "epoch": 2.4434550311665184, "ewc_loss": 0.07117453217506409, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035529996966943145, "grad_norm": 8.485020637512207, "learning_rate": 1e-06, "loss": 0.3851, "mean_token_accuracy": 0.8897297382354736, "num_tokens": 733001464.0, "step": 19208 }, { "epoch": 2.443582241445109, "ewc_loss": 0.07096491754055023, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003532039117999375, "grad_norm": 8.391345024108887, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8576886653900146, "num_tokens": 733036311.0, "step": 19209 }, { "epoch": 2.4437094517236995, "ewc_loss": 0.07128927856683731, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035644747549667954, "grad_norm": 8.513007164001465, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8728514313697815, "num_tokens": 733070487.0, "step": 19210 }, { "epoch": 2.4438366620022896, "ewc_loss": 0.07094607502222061, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003530154353938997, "grad_norm": 8.379847526550293, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8650214076042175, "num_tokens": 733110357.0, "step": 19211 }, { "epoch": 2.4439638722808805, "ewc_loss": 0.07166044414043427, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035771774128079414, "grad_norm": 8.457627296447754, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8588007688522339, "num_tokens": 733149070.0, "step": 19212 }, { "epoch": 2.4440910825594706, "ewc_loss": 0.07108581066131592, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003544127685017884, "grad_norm": 8.423408508300781, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8818650245666504, "num_tokens": 733180848.0, "step": 19213 }, { "epoch": 2.444218292838061, "ewc_loss": 0.07139947265386581, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003575494047254324, "grad_norm": 8.554115295410156, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8666920065879822, "num_tokens": 733219204.0, "step": 19214 }, { "epoch": 2.4443455031166517, "ewc_loss": 0.07099881768226624, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035354288411326706, "grad_norm": 8.383133888244629, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8555718064308167, "num_tokens": 733255237.0, "step": 19215 }, { "epoch": 2.444472713395242, "ewc_loss": 0.07154475152492523, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035900220973417163, "grad_norm": 8.58885669708252, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8771154880523682, "num_tokens": 733293243.0, "step": 19216 }, { "epoch": 2.4445999236738327, "ewc_loss": 0.07097028195858002, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035325749195180833, "grad_norm": 8.37581729888916, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8682547211647034, "num_tokens": 733333530.0, "step": 19217 }, { "epoch": 2.4447271339524232, "ewc_loss": 0.07151143252849579, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003586689999792725, "grad_norm": 8.716251373291016, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8488884568214417, "num_tokens": 733373790.0, "step": 19218 }, { "epoch": 2.4448543442310138, "ewc_loss": 0.07076945900917053, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003512492694426328, "grad_norm": 8.284348487854004, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8682090640068054, "num_tokens": 733416037.0, "step": 19219 }, { "epoch": 2.4449815545096043, "ewc_loss": 0.07179804146289825, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003615350869949907, "grad_norm": 8.822842597961426, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8615562319755554, "num_tokens": 733453014.0, "step": 19220 }, { "epoch": 2.445108764788195, "ewc_loss": 0.070542111992836, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003489758528303355, "grad_norm": 8.322915077209473, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8524402379989624, "num_tokens": 733491197.0, "step": 19221 }, { "epoch": 2.4452359750667854, "ewc_loss": 0.07198629528284073, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036341763916425407, "grad_norm": 8.662705421447754, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.878438413143158, "num_tokens": 733531142.0, "step": 19222 }, { "epoch": 2.445363185345376, "ewc_loss": 0.07062461972236633, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034980083000846207, "grad_norm": 8.355425834655762, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8592652082443237, "num_tokens": 733573916.0, "step": 19223 }, { "epoch": 2.4454903956239664, "ewc_loss": 0.07172730565071106, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036082768929190934, "grad_norm": 8.575745582580566, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8607274293899536, "num_tokens": 733613204.0, "step": 19224 }, { "epoch": 2.445617605902557, "ewc_loss": 0.07145735621452332, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035324538475833833, "grad_norm": 8.613360404968262, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.860836923122406, "num_tokens": 733653618.0, "step": 19225 }, { "epoch": 2.4457448161811475, "ewc_loss": 0.07102707773447037, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003538254532031715, "grad_norm": 8.496281623840332, "learning_rate": 1e-06, "loss": 0.4116, "mean_token_accuracy": 0.8819552659988403, "num_tokens": 733687795.0, "step": 19226 }, { "epoch": 2.445872026459738, "ewc_loss": 0.0711299479007721, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035485412809066474, "grad_norm": 8.61108112335205, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8648655414581299, "num_tokens": 733723574.0, "step": 19227 }, { "epoch": 2.4459992367383285, "ewc_loss": 0.07063977420330048, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034995246096514165, "grad_norm": 8.339881896972656, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.8729520440101624, "num_tokens": 733762051.0, "step": 19228 }, { "epoch": 2.446126447016919, "ewc_loss": 0.07143954932689667, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035795022267848253, "grad_norm": 8.682068824768066, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8828475475311279, "num_tokens": 733800640.0, "step": 19229 }, { "epoch": 2.4462536572955096, "ewc_loss": 0.07050426304340363, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00034859729930758476, "grad_norm": 8.296624183654785, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8549672961235046, "num_tokens": 733841321.0, "step": 19230 }, { "epoch": 2.4463808675741, "ewc_loss": 0.07152577489614487, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035881242365576327, "grad_norm": 8.617369651794434, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8680307865142822, "num_tokens": 733874104.0, "step": 19231 }, { "epoch": 2.4465080778526906, "ewc_loss": 0.0705442726612091, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00034899741876870394, "grad_norm": 8.305041313171387, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8706102967262268, "num_tokens": 733913127.0, "step": 19232 }, { "epoch": 2.446635288131281, "ewc_loss": 0.07154705375432968, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003590252308640629, "grad_norm": 8.555615425109863, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.882611095905304, "num_tokens": 733951900.0, "step": 19233 }, { "epoch": 2.4467624984098717, "ewc_loss": 0.07067295908927917, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003502842737361789, "grad_norm": 8.398880004882812, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.868934154510498, "num_tokens": 733992866.0, "step": 19234 }, { "epoch": 2.4468897086884622, "ewc_loss": 0.07130015641450882, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035655623651109636, "grad_norm": 8.546072006225586, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8607705235481262, "num_tokens": 734029895.0, "step": 19235 }, { "epoch": 2.4470169189670523, "ewc_loss": 0.07083094120025635, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035186405875720084, "grad_norm": 8.418394088745117, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8824863433837891, "num_tokens": 734066954.0, "step": 19236 }, { "epoch": 2.4471441292456433, "ewc_loss": 0.07123570144176483, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035591170308180153, "grad_norm": 8.447486877441406, "learning_rate": 1e-06, "loss": 0.3923, "mean_token_accuracy": 0.887734055519104, "num_tokens": 734102412.0, "step": 19237 }, { "epoch": 2.4472713395242334, "ewc_loss": 0.07098095118999481, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003533642156980932, "grad_norm": 8.394944190979004, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.882895290851593, "num_tokens": 734141400.0, "step": 19238 }, { "epoch": 2.447398549802824, "ewc_loss": 0.07127108424901962, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035626551834866405, "grad_norm": 8.551874160766602, "learning_rate": 1e-06, "loss": 0.5346, "mean_token_accuracy": 0.8446664810180664, "num_tokens": 734176628.0, "step": 19239 }, { "epoch": 2.4475257600814144, "ewc_loss": 0.07096200436353683, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035317474976181984, "grad_norm": 8.350200653076172, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.871924877166748, "num_tokens": 734219756.0, "step": 19240 }, { "epoch": 2.447652970360005, "ewc_loss": 0.07145053893327713, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003580600896384567, "grad_norm": 8.512199401855469, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8652474880218506, "num_tokens": 734250509.0, "step": 19241 }, { "epoch": 2.4477801806385955, "ewc_loss": 0.07105103880167007, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035406509414315224, "grad_norm": 8.397360801696777, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8719079494476318, "num_tokens": 734287887.0, "step": 19242 }, { "epoch": 2.447907390917186, "ewc_loss": 0.07140547037124634, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003576093877200037, "grad_norm": 8.409407615661621, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8719885349273682, "num_tokens": 734330446.0, "step": 19243 }, { "epoch": 2.4480346011957765, "ewc_loss": 0.07132623344659805, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003568170068319887, "grad_norm": 8.437262535095215, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8726906180381775, "num_tokens": 734368427.0, "step": 19244 }, { "epoch": 2.448161811474367, "ewc_loss": 0.07128845155239105, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035643921000882983, "grad_norm": 8.424842834472656, "learning_rate": 1e-06, "loss": 0.399, "mean_token_accuracy": 0.8841207027435303, "num_tokens": 734405103.0, "step": 19245 }, { "epoch": 2.4482890217529576, "ewc_loss": 0.07144421339035034, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003579967888072133, "grad_norm": 8.45478343963623, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8628542423248291, "num_tokens": 734448069.0, "step": 19246 }, { "epoch": 2.448416232031548, "ewc_loss": 0.07119353860616684, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035549007588997483, "grad_norm": 8.452895164489746, "learning_rate": 1e-06, "loss": 0.5022, "mean_token_accuracy": 0.8525561094284058, "num_tokens": 734486016.0, "step": 19247 }, { "epoch": 2.4485434423101387, "ewc_loss": 0.07135070115327835, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003570617118384689, "grad_norm": 8.450906753540039, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8542730808258057, "num_tokens": 734526943.0, "step": 19248 }, { "epoch": 2.448670652588729, "ewc_loss": 0.07130128890275955, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003565675870049745, "grad_norm": 8.378703117370605, "learning_rate": 1e-06, "loss": 0.4235, "mean_token_accuracy": 0.8774442672729492, "num_tokens": 734566137.0, "step": 19249 }, { "epoch": 2.4487978628673197, "ewc_loss": 0.07153061777353287, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035886088153347373, "grad_norm": 8.551226615905762, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8646490573883057, "num_tokens": 734602366.0, "step": 19250 }, { "epoch": 2.4489250731459102, "ewc_loss": 0.0710131824016571, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035368651151657104, "grad_norm": 8.359441757202148, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8743385672569275, "num_tokens": 734644251.0, "step": 19251 }, { "epoch": 2.4490522834245008, "ewc_loss": 0.07151736319065094, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003587283717934042, "grad_norm": 8.447789192199707, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8624899387359619, "num_tokens": 734682938.0, "step": 19252 }, { "epoch": 2.4491794937030913, "ewc_loss": 0.0711725652217865, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003552803536877036, "grad_norm": 8.381168365478516, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8673706650733948, "num_tokens": 734722975.0, "step": 19253 }, { "epoch": 2.449306703981682, "ewc_loss": 0.07153873145580292, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003589419648051262, "grad_norm": 8.459226608276367, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8690969944000244, "num_tokens": 734761361.0, "step": 19254 }, { "epoch": 2.4494339142602723, "ewc_loss": 0.07123707234859467, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003559253818821162, "grad_norm": 8.432597160339355, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8636224269866943, "num_tokens": 734800597.0, "step": 19255 }, { "epoch": 2.449561124538863, "ewc_loss": 0.07157041132450104, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003592588473111391, "grad_norm": 8.539161682128906, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8548153638839722, "num_tokens": 734840575.0, "step": 19256 }, { "epoch": 2.4496883348174534, "ewc_loss": 0.07124659419059753, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003560205805115402, "grad_norm": 8.424570083618164, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8571308851242065, "num_tokens": 734876943.0, "step": 19257 }, { "epoch": 2.449815545096044, "ewc_loss": 0.07169691473245621, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036052384530194104, "grad_norm": 8.497827529907227, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.882908821105957, "num_tokens": 734916436.0, "step": 19258 }, { "epoch": 2.4499427553746345, "ewc_loss": 0.07129144668579102, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003564691578503698, "grad_norm": 8.506163597106934, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8663637638092041, "num_tokens": 734956336.0, "step": 19259 }, { "epoch": 2.450069965653225, "ewc_loss": 0.07141733169555664, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035772795672528446, "grad_norm": 8.46060848236084, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8650552034378052, "num_tokens": 734997132.0, "step": 19260 }, { "epoch": 2.450197175931815, "ewc_loss": 0.07141600549221039, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035771471448242664, "grad_norm": 8.473154067993164, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8542710542678833, "num_tokens": 735036192.0, "step": 19261 }, { "epoch": 2.450324386210406, "ewc_loss": 0.07135052978992462, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003570599656086415, "grad_norm": 8.483855247497559, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.8798931837081909, "num_tokens": 735079368.0, "step": 19262 }, { "epoch": 2.450451596488996, "ewc_loss": 0.07133708149194717, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003569255059119314, "grad_norm": 8.49647045135498, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8716564178466797, "num_tokens": 735109714.0, "step": 19263 }, { "epoch": 2.4505788067675867, "ewc_loss": 0.0713239386677742, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003567940730135888, "grad_norm": 8.477981567382812, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8641109466552734, "num_tokens": 735152712.0, "step": 19264 }, { "epoch": 2.450706017046177, "ewc_loss": 0.07132145762443542, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003567692474462092, "grad_norm": 8.465572357177734, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8615900278091431, "num_tokens": 735193313.0, "step": 19265 }, { "epoch": 2.4508332273247677, "ewc_loss": 0.07132129371166229, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035676767583936453, "grad_norm": 8.40487003326416, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.881308376789093, "num_tokens": 735235413.0, "step": 19266 }, { "epoch": 2.4509604376033582, "ewc_loss": 0.07145625352859497, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003581171913538128, "grad_norm": 8.520614624023438, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8679603338241577, "num_tokens": 735270809.0, "step": 19267 }, { "epoch": 2.4510876478819488, "ewc_loss": 0.07113571465015411, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003549118700902909, "grad_norm": 8.410305976867676, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8584338426589966, "num_tokens": 735309790.0, "step": 19268 }, { "epoch": 2.4512148581605393, "ewc_loss": 0.0716174989938736, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003597297181840986, "grad_norm": 8.536054611206055, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8668521046638489, "num_tokens": 735341084.0, "step": 19269 }, { "epoch": 2.45134206843913, "ewc_loss": 0.0712764710187912, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035631944774650037, "grad_norm": 8.43181324005127, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8817918300628662, "num_tokens": 735376964.0, "step": 19270 }, { "epoch": 2.4514692787177204, "ewc_loss": 0.07171374559402466, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003606921818573028, "grad_norm": 8.48790168762207, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8763055205345154, "num_tokens": 735413296.0, "step": 19271 }, { "epoch": 2.451596488996311, "ewc_loss": 0.07135358452796936, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003570905246306211, "grad_norm": 8.435920715332031, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8713361620903015, "num_tokens": 735454992.0, "step": 19272 }, { "epoch": 2.4517236992749014, "ewc_loss": 0.07152485102415085, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003588031977415085, "grad_norm": 8.522696495056152, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8682416677474976, "num_tokens": 735493315.0, "step": 19273 }, { "epoch": 2.451850909553492, "ewc_loss": 0.07130783796310425, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035663307062350214, "grad_norm": 8.432165145874023, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8766449093818665, "num_tokens": 735532666.0, "step": 19274 }, { "epoch": 2.4519781198320825, "ewc_loss": 0.07154113054275513, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000358965975465253, "grad_norm": 8.477702140808105, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8642536401748657, "num_tokens": 735571987.0, "step": 19275 }, { "epoch": 2.452105330110673, "ewc_loss": 0.07123184204101562, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035587308229878545, "grad_norm": 8.421757698059082, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8649970293045044, "num_tokens": 735609409.0, "step": 19276 }, { "epoch": 2.4522325403892635, "ewc_loss": 0.0715821161866188, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003593758447095752, "grad_norm": 8.538134574890137, "learning_rate": 1e-06, "loss": 0.5466, "mean_token_accuracy": 0.839245080947876, "num_tokens": 735647299.0, "step": 19277 }, { "epoch": 2.452359750667854, "ewc_loss": 0.07122906297445297, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003558453172445297, "grad_norm": 8.5060396194458, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.873975396156311, "num_tokens": 735677757.0, "step": 19278 }, { "epoch": 2.4524869609464446, "ewc_loss": 0.0713338851928711, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035689357900992036, "grad_norm": 8.389762878417969, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8718839287757874, "num_tokens": 735723924.0, "step": 19279 }, { "epoch": 2.452614171225035, "ewc_loss": 0.07152304798364639, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035878518247045577, "grad_norm": 8.556107521057129, "learning_rate": 1e-06, "loss": 0.5011, "mean_token_accuracy": 0.8506165742874146, "num_tokens": 735758105.0, "step": 19280 }, { "epoch": 2.4527413815036256, "ewc_loss": 0.0710405558347702, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035396029124967754, "grad_norm": 8.409360885620117, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8634015321731567, "num_tokens": 735797603.0, "step": 19281 }, { "epoch": 2.452868591782216, "ewc_loss": 0.07163543999195099, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000359909055987373, "grad_norm": 8.481740951538086, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8386043310165405, "num_tokens": 735838958.0, "step": 19282 }, { "epoch": 2.4529958020608067, "ewc_loss": 0.0711783915758133, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003553385613486171, "grad_norm": 8.43232250213623, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8591880798339844, "num_tokens": 735878894.0, "step": 19283 }, { "epoch": 2.453123012339397, "ewc_loss": 0.07145404070615768, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035809510154649615, "grad_norm": 8.493391036987305, "learning_rate": 1e-06, "loss": 0.3668, "mean_token_accuracy": 0.8964167833328247, "num_tokens": 735913222.0, "step": 19284 }, { "epoch": 2.4532502226179878, "ewc_loss": 0.07124406099319458, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035599531838670373, "grad_norm": 8.37732219696045, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8731561899185181, "num_tokens": 735951884.0, "step": 19285 }, { "epoch": 2.453377432896578, "ewc_loss": 0.07154795527458191, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035903428215533495, "grad_norm": 8.563708305358887, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8615566492080688, "num_tokens": 735998955.0, "step": 19286 }, { "epoch": 2.4535046431751684, "ewc_loss": 0.07118459045886993, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003554005816113204, "grad_norm": 8.448454856872559, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8653095960617065, "num_tokens": 736039631.0, "step": 19287 }, { "epoch": 2.453631853453759, "ewc_loss": 0.07162460684776306, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003598007606342435, "grad_norm": 8.556198120117188, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8628888130187988, "num_tokens": 736085100.0, "step": 19288 }, { "epoch": 2.4537590637323494, "ewc_loss": 0.0711156576871872, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003547112864907831, "grad_norm": 8.407598495483398, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8820430040359497, "num_tokens": 736119680.0, "step": 19289 }, { "epoch": 2.45388627401094, "ewc_loss": 0.07149993628263474, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003585540398489684, "grad_norm": 8.41751766204834, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8776843547821045, "num_tokens": 736159769.0, "step": 19290 }, { "epoch": 2.4540134842895305, "ewc_loss": 0.07130227982997894, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003565774823073298, "grad_norm": 8.41594409942627, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8734778165817261, "num_tokens": 736199566.0, "step": 19291 }, { "epoch": 2.454140694568121, "ewc_loss": 0.07153944671154022, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035894912434741855, "grad_norm": 8.457315444946289, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8713657855987549, "num_tokens": 736239238.0, "step": 19292 }, { "epoch": 2.4542679048467115, "ewc_loss": 0.07131597399711609, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035671444493345916, "grad_norm": 8.408576965332031, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8706912398338318, "num_tokens": 736274576.0, "step": 19293 }, { "epoch": 2.454395115125302, "ewc_loss": 0.07144908607006073, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035804559593088925, "grad_norm": 8.483748435974121, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8588657379150391, "num_tokens": 736307509.0, "step": 19294 }, { "epoch": 2.4545223254038926, "ewc_loss": 0.07136890292167664, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035724369809031487, "grad_norm": 8.429348945617676, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8634309768676758, "num_tokens": 736348313.0, "step": 19295 }, { "epoch": 2.454649535682483, "ewc_loss": 0.07150306552648544, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003585853264667094, "grad_norm": 8.499909400939941, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8631132245063782, "num_tokens": 736380716.0, "step": 19296 }, { "epoch": 2.4547767459610736, "ewc_loss": 0.07137683033943176, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035732294782064855, "grad_norm": 8.430196762084961, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8456023931503296, "num_tokens": 736422669.0, "step": 19297 }, { "epoch": 2.454903956239664, "ewc_loss": 0.07157456874847412, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035930037847720087, "grad_norm": 8.52026081085205, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8753068447113037, "num_tokens": 736452929.0, "step": 19298 }, { "epoch": 2.4550311665182547, "ewc_loss": 0.0712895318865776, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003564500075299293, "grad_norm": 8.44935417175293, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8688525557518005, "num_tokens": 736486755.0, "step": 19299 }, { "epoch": 2.4551583767968452, "ewc_loss": 0.07146306335926056, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003581853525247425, "grad_norm": 8.789278030395508, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8747096657752991, "num_tokens": 736521542.0, "step": 19300 }, { "epoch": 2.4552855870754358, "ewc_loss": 0.07082021981477737, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035175689845345914, "grad_norm": 8.33336067199707, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8673771023750305, "num_tokens": 736557312.0, "step": 19301 }, { "epoch": 2.4554127973540263, "ewc_loss": 0.07180570065975189, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036161165917292237, "grad_norm": 8.620924949645996, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8627901077270508, "num_tokens": 736596469.0, "step": 19302 }, { "epoch": 2.455540007632617, "ewc_loss": 0.0707300677895546, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035085537820123136, "grad_norm": 8.310649871826172, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.8510631322860718, "num_tokens": 736636078.0, "step": 19303 }, { "epoch": 2.4556672179112073, "ewc_loss": 0.07180944085121155, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003616491158027202, "grad_norm": 8.576534271240234, "learning_rate": 1e-06, "loss": 0.4083, "mean_token_accuracy": 0.8808624744415283, "num_tokens": 736674117.0, "step": 19304 }, { "epoch": 2.455794428189798, "ewc_loss": 0.0712420865893364, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035353415296413004, "grad_norm": 8.41395378112793, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8620649576187134, "num_tokens": 736709116.0, "step": 19305 }, { "epoch": 2.4559216384683884, "ewc_loss": 0.07148239016532898, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035837863106280565, "grad_norm": 8.42975902557373, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8591763973236084, "num_tokens": 736750383.0, "step": 19306 }, { "epoch": 2.456048848746979, "ewc_loss": 0.07155191898345947, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000356632488546893, "grad_norm": 8.455426216125488, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8553520441055298, "num_tokens": 736790836.0, "step": 19307 }, { "epoch": 2.4561760590255695, "ewc_loss": 0.07134409248828888, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035699561703950167, "grad_norm": 8.529948234558105, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8635609745979309, "num_tokens": 736826085.0, "step": 19308 }, { "epoch": 2.4563032693041595, "ewc_loss": 0.07152073085308075, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003563206410035491, "grad_norm": 8.409391403198242, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8655925393104553, "num_tokens": 736860599.0, "step": 19309 }, { "epoch": 2.4564304795827505, "ewc_loss": 0.07147398591041565, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035829455009661615, "grad_norm": 8.569366455078125, "learning_rate": 1e-06, "loss": 0.3958, "mean_token_accuracy": 0.8839230537414551, "num_tokens": 736893286.0, "step": 19310 }, { "epoch": 2.4565576898613406, "ewc_loss": 0.07099412381649017, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00035349593963474035, "grad_norm": 8.36361312866211, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8762439489364624, "num_tokens": 736931597.0, "step": 19311 }, { "epoch": 2.456684900139931, "ewc_loss": 0.07176685333251953, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003587818646337837, "grad_norm": 8.539689064025879, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.874975323677063, "num_tokens": 736972183.0, "step": 19312 }, { "epoch": 2.4568121104185217, "ewc_loss": 0.07136046141386032, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003547178930602968, "grad_norm": 8.517817497253418, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8732121586799622, "num_tokens": 737004184.0, "step": 19313 }, { "epoch": 2.456939320697112, "ewc_loss": 0.07146333903074265, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035574668436311185, "grad_norm": 8.369405746459961, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8668278455734253, "num_tokens": 737042644.0, "step": 19314 }, { "epoch": 2.4570665309757027, "ewc_loss": 0.07223628461360931, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00035859335912391543, "grad_norm": 36.030364990234375, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8561981916427612, "num_tokens": 737074830.0, "step": 19315 }, { "epoch": 2.4571937412542932, "ewc_loss": 0.109074167907238, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0007318549905903637, "grad_norm": 14.671209335327148, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8744072914123535, "num_tokens": 737107756.0, "step": 19316 }, { "epoch": 2.4573209515328838, "ewc_loss": 0.07132981717586517, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003544114879332483, "grad_norm": 7.3553080558776855, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8750178813934326, "num_tokens": 737143457.0, "step": 19317 }, { "epoch": 2.4574481618114743, "ewc_loss": 0.09607429802417755, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0006018562125973403, "grad_norm": 12.081598281860352, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8618009090423584, "num_tokens": 737177005.0, "step": 19318 }, { "epoch": 2.457575372090065, "ewc_loss": 0.09903688728809357, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0006314821657724679, "grad_norm": 11.694870948791504, "learning_rate": 1e-06, "loss": 0.5244, "mean_token_accuracy": 0.8569976687431335, "num_tokens": 737218265.0, "step": 19319 }, { "epoch": 2.4577025823686554, "ewc_loss": 0.08018618077039719, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000442975084297359, "grad_norm": 8.692002296447754, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8558318018913269, "num_tokens": 737259495.0, "step": 19320 }, { "epoch": 2.457829792647246, "ewc_loss": 0.08351124823093414, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00047622580314055085, "grad_norm": 10.44799518585205, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8587572574615479, "num_tokens": 737300073.0, "step": 19321 }, { "epoch": 2.4579570029258364, "ewc_loss": 0.08483413606882095, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0004894546582363546, "grad_norm": 9.645694732666016, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8673691749572754, "num_tokens": 737334716.0, "step": 19322 }, { "epoch": 2.458084213204427, "ewc_loss": 0.07813255488872528, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00042243883945047855, "grad_norm": 9.111550331115723, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8728200197219849, "num_tokens": 737377086.0, "step": 19323 }, { "epoch": 2.4582114234830175, "ewc_loss": 0.07886378467082977, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00043219258077442646, "grad_norm": 9.436750411987305, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8560136556625366, "num_tokens": 737418259.0, "step": 19324 }, { "epoch": 2.458338633761608, "ewc_loss": 0.07718241214752197, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0004153787740506232, "grad_norm": 8.969619750976562, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8562920689582825, "num_tokens": 737453150.0, "step": 19325 }, { "epoch": 2.4584658440401985, "ewc_loss": 0.0765402615070343, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0004089573340024799, "grad_norm": 9.200250625610352, "learning_rate": 1e-06, "loss": 0.4177, "mean_token_accuracy": 0.8815407752990723, "num_tokens": 737495552.0, "step": 19326 }, { "epoch": 2.458593054318789, "ewc_loss": 0.0753244161605835, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003967988886870444, "grad_norm": 8.86167049407959, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8714239597320557, "num_tokens": 737537518.0, "step": 19327 }, { "epoch": 2.4587202645973796, "ewc_loss": 0.0751418024301529, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003949727106373757, "grad_norm": 8.840611457824707, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8632919788360596, "num_tokens": 737577472.0, "step": 19328 }, { "epoch": 2.45884747487597, "ewc_loss": 0.0739789679646492, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00038334436248987913, "grad_norm": 8.748533248901367, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8588146567344666, "num_tokens": 737617250.0, "step": 19329 }, { "epoch": 2.4589746851545606, "ewc_loss": 0.0741373747587204, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003849283966701478, "grad_norm": 8.726335525512695, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.869702935218811, "num_tokens": 737655544.0, "step": 19330 }, { "epoch": 2.459101895433151, "ewc_loss": 0.07326819002628326, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003762365668080747, "grad_norm": 8.650522232055664, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8703739643096924, "num_tokens": 737688644.0, "step": 19331 }, { "epoch": 2.4592291057117417, "ewc_loss": 0.073299340903759, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003765480942092836, "grad_norm": 8.643437385559082, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8696964979171753, "num_tokens": 737730264.0, "step": 19332 }, { "epoch": 2.459356315990332, "ewc_loss": 0.0727347582578659, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003709022421389818, "grad_norm": 8.589010238647461, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8613908886909485, "num_tokens": 737763578.0, "step": 19333 }, { "epoch": 2.4594835262689223, "ewc_loss": 0.07272857427597046, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00037084039649926126, "grad_norm": 8.581061363220215, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8687978982925415, "num_tokens": 737804062.0, "step": 19334 }, { "epoch": 2.4596107365475133, "ewc_loss": 0.07245109975337982, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003680656664073467, "grad_norm": 8.495979309082031, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8808279633522034, "num_tokens": 737845608.0, "step": 19335 }, { "epoch": 2.4597379468261034, "ewc_loss": 0.07247468084096909, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036830149474553764, "grad_norm": 8.575031280517578, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.8488743901252747, "num_tokens": 737881366.0, "step": 19336 }, { "epoch": 2.459865157104694, "ewc_loss": 0.07209447026252747, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036449942854233086, "grad_norm": 8.412375450134277, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8593906164169312, "num_tokens": 737922595.0, "step": 19337 }, { "epoch": 2.4599923673832844, "ewc_loss": 0.07237318158149719, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003672865277621895, "grad_norm": 8.551767349243164, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8743402361869812, "num_tokens": 737956368.0, "step": 19338 }, { "epoch": 2.460119577661875, "ewc_loss": 0.07189976423978806, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036255232407711446, "grad_norm": 8.383612632751465, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8716247081756592, "num_tokens": 738000167.0, "step": 19339 }, { "epoch": 2.4602467879404655, "ewc_loss": 0.07235123962163925, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036706708488054574, "grad_norm": 8.587333679199219, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8596633672714233, "num_tokens": 738040850.0, "step": 19340 }, { "epoch": 2.460373998219056, "ewc_loss": 0.07178524136543274, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036140705924481153, "grad_norm": 8.46619987487793, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.865054190158844, "num_tokens": 738080370.0, "step": 19341 }, { "epoch": 2.4605012084976465, "ewc_loss": 0.07218394428491592, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036539413849823177, "grad_norm": 8.527626037597656, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8707777261734009, "num_tokens": 738116751.0, "step": 19342 }, { "epoch": 2.460628418776237, "ewc_loss": 0.07176738977432251, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003612286236602813, "grad_norm": 8.387758255004883, "learning_rate": 1e-06, "loss": 0.3864, "mean_token_accuracy": 0.8853390216827393, "num_tokens": 738154597.0, "step": 19343 }, { "epoch": 2.4607556290548276, "ewc_loss": 0.07208316773176193, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036438636016100645, "grad_norm": 8.571061134338379, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.873847246170044, "num_tokens": 738186484.0, "step": 19344 }, { "epoch": 2.460882839333418, "ewc_loss": 0.07164743542671204, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036002908018417656, "grad_norm": 8.41781997680664, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8716548681259155, "num_tokens": 738217722.0, "step": 19345 }, { "epoch": 2.4610100496120086, "ewc_loss": 0.07208353281021118, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036438999813981354, "grad_norm": 8.48320198059082, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8799689412117004, "num_tokens": 738255304.0, "step": 19346 }, { "epoch": 2.461137259890599, "ewc_loss": 0.07168936729431152, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036044834996573627, "grad_norm": 8.505228996276855, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8624329566955566, "num_tokens": 738292162.0, "step": 19347 }, { "epoch": 2.4612644701691897, "ewc_loss": 0.07174853980541229, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003610400890465826, "grad_norm": 8.403016090393066, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8793084025382996, "num_tokens": 738332851.0, "step": 19348 }, { "epoch": 2.4613916804477802, "ewc_loss": 0.07193870842456818, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036294182064011693, "grad_norm": 8.485307693481445, "learning_rate": 1e-06, "loss": 0.4248, "mean_token_accuracy": 0.8769721984863281, "num_tokens": 738375120.0, "step": 19349 }, { "epoch": 2.4615188907263708, "ewc_loss": 0.07163530588150024, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000359907717211172, "grad_norm": 8.479998588562012, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8576935529708862, "num_tokens": 738413857.0, "step": 19350 }, { "epoch": 2.4616461010049613, "ewc_loss": 0.07171784341335297, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003607331309467554, "grad_norm": 8.48326301574707, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8608231544494629, "num_tokens": 738450097.0, "step": 19351 }, { "epoch": 2.461773311283552, "ewc_loss": 0.07161597907543182, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003597145259846002, "grad_norm": 8.492919921875, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.8495250940322876, "num_tokens": 738483784.0, "step": 19352 }, { "epoch": 2.4619005215621423, "ewc_loss": 0.07164229452610016, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003599776537157595, "grad_norm": 8.490279197692871, "learning_rate": 1e-06, "loss": 0.5276, "mean_token_accuracy": 0.8474618196487427, "num_tokens": 738526820.0, "step": 19353 }, { "epoch": 2.462027731840733, "ewc_loss": 0.07159729301929474, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035952756297774613, "grad_norm": 8.391424179077148, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8520966172218323, "num_tokens": 738569049.0, "step": 19354 }, { "epoch": 2.4621549421193234, "ewc_loss": 0.07174131274223328, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036096779513172805, "grad_norm": 8.485871315002441, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.864144504070282, "num_tokens": 738608320.0, "step": 19355 }, { "epoch": 2.462282152397914, "ewc_loss": 0.0714617669582367, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035817240132018924, "grad_norm": 8.356985092163086, "learning_rate": 1e-06, "loss": 0.4154, "mean_token_accuracy": 0.8806767463684082, "num_tokens": 738645566.0, "step": 19356 }, { "epoch": 2.4624093626765045, "ewc_loss": 0.0718742311000824, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000362297025276348, "grad_norm": 8.479279518127441, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.843756914138794, "num_tokens": 738686184.0, "step": 19357 }, { "epoch": 2.462536572955095, "ewc_loss": 0.07153572142124176, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035891192965209484, "grad_norm": 8.460041046142578, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.8526334762573242, "num_tokens": 738728820.0, "step": 19358 }, { "epoch": 2.462663783233685, "ewc_loss": 0.07151903212070465, "ewc_loss_diag": 3.528594970703125e-05, "ewc_loss_parallel": 0.0003611863648984581, "grad_norm": 8.549034118652344, "learning_rate": 1e-06, "loss": 0.5541, "mean_token_accuracy": 0.835978627204895, "num_tokens": 738760180.0, "step": 19359 }, { "epoch": 2.462790993512276, "ewc_loss": 0.07156994938850403, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003592541324906051, "grad_norm": 8.469280242919922, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8741052150726318, "num_tokens": 738795501.0, "step": 19360 }, { "epoch": 2.462918203790866, "ewc_loss": 0.07171829789876938, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036073767114430666, "grad_norm": 8.504910469055176, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8562910556793213, "num_tokens": 738831027.0, "step": 19361 }, { "epoch": 2.4630454140694567, "ewc_loss": 0.07153794914484024, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003589341649785638, "grad_norm": 8.409459114074707, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8479037284851074, "num_tokens": 738870383.0, "step": 19362 }, { "epoch": 2.463172624348047, "ewc_loss": 0.07166366279125214, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036019133403897285, "grad_norm": 8.403213500976562, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8718159198760986, "num_tokens": 738908231.0, "step": 19363 }, { "epoch": 2.4632998346266377, "ewc_loss": 0.0716816857457161, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036037154495716095, "grad_norm": 8.468893051147461, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8575261831283569, "num_tokens": 738950927.0, "step": 19364 }, { "epoch": 2.4634270449052282, "ewc_loss": 0.07154490053653717, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003590037231333554, "grad_norm": 8.451767921447754, "learning_rate": 1e-06, "loss": 0.4278, "mean_token_accuracy": 0.8774428963661194, "num_tokens": 738989857.0, "step": 19365 }, { "epoch": 2.4635542551838188, "ewc_loss": 0.07168345153331757, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003603892109822482, "grad_norm": 8.571619987487793, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8799794912338257, "num_tokens": 739021281.0, "step": 19366 }, { "epoch": 2.4636814654624093, "ewc_loss": 0.07151006907224655, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035621397546492517, "grad_norm": 8.429413795471191, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8733842968940735, "num_tokens": 739063475.0, "step": 19367 }, { "epoch": 2.463808675741, "ewc_loss": 0.07166896760463715, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000360244361218065, "grad_norm": 8.528428077697754, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8620399236679077, "num_tokens": 739107083.0, "step": 19368 }, { "epoch": 2.4639358860195903, "ewc_loss": 0.0711795911192894, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035535061033442616, "grad_norm": 8.408890724182129, "learning_rate": 1e-06, "loss": 0.5245, "mean_token_accuracy": 0.844606876373291, "num_tokens": 739152002.0, "step": 19369 }, { "epoch": 2.464063096298181, "ewc_loss": 0.07162860035896301, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003598407201934606, "grad_norm": 8.506452560424805, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.863393247127533, "num_tokens": 739191781.0, "step": 19370 }, { "epoch": 2.4641903065767714, "ewc_loss": 0.07129170745611191, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003564717771951109, "grad_norm": 8.376080513000488, "learning_rate": 1e-06, "loss": 0.3966, "mean_token_accuracy": 0.887773871421814, "num_tokens": 739230374.0, "step": 19371 }, { "epoch": 2.464317516855362, "ewc_loss": 0.0716547891497612, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003601025673560798, "grad_norm": 8.527853965759277, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8612805604934692, "num_tokens": 739267310.0, "step": 19372 }, { "epoch": 2.4644447271339525, "ewc_loss": 0.07137808203697205, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035733546246774495, "grad_norm": 8.40274429321289, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.870008111000061, "num_tokens": 739305074.0, "step": 19373 }, { "epoch": 2.464571937412543, "ewc_loss": 0.07163937389850616, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003599484625738114, "grad_norm": 8.506260871887207, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8581575155258179, "num_tokens": 739343028.0, "step": 19374 }, { "epoch": 2.4646991476911335, "ewc_loss": 0.07137498259544373, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035730452509596944, "grad_norm": 8.461660385131836, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8650972247123718, "num_tokens": 739384364.0, "step": 19375 }, { "epoch": 2.464826357969724, "ewc_loss": 0.0716504231095314, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003600589116103947, "grad_norm": 8.49089527130127, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8814982175827026, "num_tokens": 739423564.0, "step": 19376 }, { "epoch": 2.4649535682483146, "ewc_loss": 0.07139821350574493, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035753686097450554, "grad_norm": 8.422903060913086, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8751259446144104, "num_tokens": 739457609.0, "step": 19377 }, { "epoch": 2.465080778526905, "ewc_loss": 0.07161400467157364, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003596947353798896, "grad_norm": 8.644881248474121, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8668379783630371, "num_tokens": 739497204.0, "step": 19378 }, { "epoch": 2.4652079888054956, "ewc_loss": 0.07119225710630417, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003554772411007434, "grad_norm": 8.64434814453125, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8614503145217896, "num_tokens": 739543688.0, "step": 19379 }, { "epoch": 2.465335199084086, "ewc_loss": 0.0713621973991394, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000357176671968773, "grad_norm": 8.736449241638184, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8771346807479858, "num_tokens": 739578221.0, "step": 19380 }, { "epoch": 2.4654624093626767, "ewc_loss": 0.07074693590402603, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003510240640025586, "grad_norm": 8.31643295288086, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8719364404678345, "num_tokens": 739615892.0, "step": 19381 }, { "epoch": 2.4655896196412668, "ewc_loss": 0.07158856093883514, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003594403387978673, "grad_norm": 8.620013236999512, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8527183532714844, "num_tokens": 739657697.0, "step": 19382 }, { "epoch": 2.4657168299198577, "ewc_loss": 0.07073554396629333, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035091009340249, "grad_norm": 8.383096694946289, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8762943744659424, "num_tokens": 739694173.0, "step": 19383 }, { "epoch": 2.465844040198448, "ewc_loss": 0.07184123992919922, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003619671333581209, "grad_norm": 8.541362762451172, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8633366823196411, "num_tokens": 739734111.0, "step": 19384 }, { "epoch": 2.4659712504770384, "ewc_loss": 0.07085960358381271, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035215073148719966, "grad_norm": 8.346545219421387, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8670224547386169, "num_tokens": 739771409.0, "step": 19385 }, { "epoch": 2.466098460755629, "ewc_loss": 0.07170550525188446, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003606097016017884, "grad_norm": 8.551190376281738, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.852469265460968, "num_tokens": 739807185.0, "step": 19386 }, { "epoch": 2.4662256710342194, "ewc_loss": 0.07117939740419388, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035534866037778556, "grad_norm": 8.33923053741455, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8771770000457764, "num_tokens": 739837750.0, "step": 19387 }, { "epoch": 2.46635288131281, "ewc_loss": 0.07178260385990143, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003613807784859091, "grad_norm": 8.471848487854004, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8657074570655823, "num_tokens": 739879516.0, "step": 19388 }, { "epoch": 2.4664800915914005, "ewc_loss": 0.07129804790019989, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003565351362340152, "grad_norm": 8.38682746887207, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8729773759841919, "num_tokens": 739918688.0, "step": 19389 }, { "epoch": 2.466607301869991, "ewc_loss": 0.07190020382404327, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003625567478593439, "grad_norm": 8.454643249511719, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8709774613380432, "num_tokens": 739958978.0, "step": 19390 }, { "epoch": 2.4667345121485815, "ewc_loss": 0.07148374617099762, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003583921934477985, "grad_norm": 8.330949783325195, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8704750537872314, "num_tokens": 740002428.0, "step": 19391 }, { "epoch": 2.466861722427172, "ewc_loss": 0.07200095057487488, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036356423515826464, "grad_norm": 8.450642585754395, "learning_rate": 1e-06, "loss": 0.4017, "mean_token_accuracy": 0.8872537612915039, "num_tokens": 740045856.0, "step": 19392 }, { "epoch": 2.4669889327057626, "ewc_loss": 0.07162594050168991, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035981409018859267, "grad_norm": 8.432838439941406, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8640190362930298, "num_tokens": 740084512.0, "step": 19393 }, { "epoch": 2.467116142984353, "ewc_loss": 0.07188864052295685, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003624411183409393, "grad_norm": 8.442728996276855, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8776335120201111, "num_tokens": 740129624.0, "step": 19394 }, { "epoch": 2.4672433532629436, "ewc_loss": 0.07173436880111694, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003608984115999192, "grad_norm": 8.450252532958984, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8498764634132385, "num_tokens": 740168368.0, "step": 19395 }, { "epoch": 2.467370563541534, "ewc_loss": 0.07169002294540405, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003604548692237586, "grad_norm": 8.504071235656738, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8591508865356445, "num_tokens": 740206670.0, "step": 19396 }, { "epoch": 2.4674977738201247, "ewc_loss": 0.07170213013887405, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036057597026228905, "grad_norm": 8.41466999053955, "learning_rate": 1e-06, "loss": 0.5248, "mean_token_accuracy": 0.8442906141281128, "num_tokens": 740246422.0, "step": 19397 }, { "epoch": 2.4676249840987152, "ewc_loss": 0.0719088464975357, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003626431862358004, "grad_norm": 8.484159469604492, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8641457557678223, "num_tokens": 740284450.0, "step": 19398 }, { "epoch": 2.4677521943773058, "ewc_loss": 0.07162915170192719, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003598462208174169, "grad_norm": 8.45702075958252, "learning_rate": 1e-06, "loss": 0.5247, "mean_token_accuracy": 0.8430373668670654, "num_tokens": 740319062.0, "step": 19399 }, { "epoch": 2.4678794046558963, "ewc_loss": 0.07192718982696533, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003628265403676778, "grad_norm": 8.511103630065918, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8669897317886353, "num_tokens": 740353996.0, "step": 19400 }, { "epoch": 2.468006614934487, "ewc_loss": 0.07172944396734238, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003608491097111255, "grad_norm": 8.46908950805664, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8652545213699341, "num_tokens": 740394694.0, "step": 19401 }, { "epoch": 2.4681338252130773, "ewc_loss": 0.07184554636478424, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036201017792336643, "grad_norm": 8.465692520141602, "learning_rate": 1e-06, "loss": 0.398, "mean_token_accuracy": 0.887832522392273, "num_tokens": 740436567.0, "step": 19402 }, { "epoch": 2.468261035491668, "ewc_loss": 0.07171247899532318, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003606795216910541, "grad_norm": 8.422381401062012, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8575608730316162, "num_tokens": 740476458.0, "step": 19403 }, { "epoch": 2.4683882457702584, "ewc_loss": 0.07177157700061798, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036127050407230854, "grad_norm": 8.438850402832031, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8761090040206909, "num_tokens": 740514703.0, "step": 19404 }, { "epoch": 2.468515456048849, "ewc_loss": 0.07180562615394592, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036161093157716095, "grad_norm": 8.556029319763184, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8762767314910889, "num_tokens": 740552328.0, "step": 19405 }, { "epoch": 2.4686426663274394, "ewc_loss": 0.07169169187545776, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036047163303010166, "grad_norm": 8.426040649414062, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.8492366075515747, "num_tokens": 740593141.0, "step": 19406 }, { "epoch": 2.4687698766060295, "ewc_loss": 0.07201368361711502, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003636915353126824, "grad_norm": 8.529980659484863, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8756742477416992, "num_tokens": 740634097.0, "step": 19407 }, { "epoch": 2.4688970868846205, "ewc_loss": 0.07162037491798401, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003597584436647594, "grad_norm": 8.446736335754395, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8725223541259766, "num_tokens": 740672917.0, "step": 19408 }, { "epoch": 2.4690242971632106, "ewc_loss": 0.07189355790615082, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003624902165029198, "grad_norm": 8.483059883117676, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8610700368881226, "num_tokens": 740709351.0, "step": 19409 }, { "epoch": 2.469151507441801, "ewc_loss": 0.07168246805667877, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003603793156798929, "grad_norm": 8.483731269836426, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8569294810295105, "num_tokens": 740744049.0, "step": 19410 }, { "epoch": 2.4692787177203916, "ewc_loss": 0.07177575677633286, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036131226806901395, "grad_norm": 8.482070922851562, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8692576289176941, "num_tokens": 740784765.0, "step": 19411 }, { "epoch": 2.469405927998982, "ewc_loss": 0.07166345417499542, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036018918035551906, "grad_norm": 8.507883071899414, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8593252897262573, "num_tokens": 740815859.0, "step": 19412 }, { "epoch": 2.4695331382775727, "ewc_loss": 0.07167293131351471, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003602840006351471, "grad_norm": 8.460715293884277, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8849050998687744, "num_tokens": 740854300.0, "step": 19413 }, { "epoch": 2.4696603485561632, "ewc_loss": 0.07182133197784424, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003617679758463055, "grad_norm": 8.485268592834473, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8500702381134033, "num_tokens": 740891836.0, "step": 19414 }, { "epoch": 2.4697875588347538, "ewc_loss": 0.07170690596103668, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035818235483020544, "grad_norm": 8.373156547546387, "learning_rate": 1e-06, "loss": 0.4321, "mean_token_accuracy": 0.8749276995658875, "num_tokens": 740927434.0, "step": 19415 }, { "epoch": 2.4699147691133443, "ewc_loss": 0.07191996276378632, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036275433376431465, "grad_norm": 8.439952850341797, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8688399195671082, "num_tokens": 740973707.0, "step": 19416 }, { "epoch": 2.470041979391935, "ewc_loss": 0.07170715928077698, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036062629078514874, "grad_norm": 8.393882751464844, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8734243512153625, "num_tokens": 741015953.0, "step": 19417 }, { "epoch": 2.4701691896705253, "ewc_loss": 0.07182653993368149, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036182007170282304, "grad_norm": 8.45417594909668, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8736551403999329, "num_tokens": 741049634.0, "step": 19418 }, { "epoch": 2.470296399949116, "ewc_loss": 0.07191920280456543, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036030527553521097, "grad_norm": 8.405158996582031, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.875177800655365, "num_tokens": 741084267.0, "step": 19419 }, { "epoch": 2.4704236102277064, "ewc_loss": 0.0720171332359314, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003637259651441127, "grad_norm": 8.450112342834473, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8730995655059814, "num_tokens": 741122237.0, "step": 19420 }, { "epoch": 2.470550820506297, "ewc_loss": 0.0718117207288742, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003616719041019678, "grad_norm": 8.453091621398926, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8534615635871887, "num_tokens": 741163593.0, "step": 19421 }, { "epoch": 2.4706780307848875, "ewc_loss": 0.07186821848154068, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003622368676587939, "grad_norm": 8.416287422180176, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8568331003189087, "num_tokens": 741202328.0, "step": 19422 }, { "epoch": 2.470805241063478, "ewc_loss": 0.07189841568470001, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.000362538849003613, "grad_norm": 8.512547492980957, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8856338858604431, "num_tokens": 741240095.0, "step": 19423 }, { "epoch": 2.4709324513420685, "ewc_loss": 0.07175129652023315, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036106762127019465, "grad_norm": 8.416985511779785, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8685940504074097, "num_tokens": 741278509.0, "step": 19424 }, { "epoch": 2.471059661620659, "ewc_loss": 0.07212474942207336, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036480213748291135, "grad_norm": 8.496576309204102, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8606985211372375, "num_tokens": 741321446.0, "step": 19425 }, { "epoch": 2.4711868718992496, "ewc_loss": 0.0717894434928894, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036144917248748243, "grad_norm": 8.479228973388672, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8634206652641296, "num_tokens": 741360469.0, "step": 19426 }, { "epoch": 2.47131408217784, "ewc_loss": 0.07197625935077667, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036331723094917834, "grad_norm": 8.503447532653809, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8554661870002747, "num_tokens": 741393833.0, "step": 19427 }, { "epoch": 2.4714412924564306, "ewc_loss": 0.07175938785076141, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036114852991886437, "grad_norm": 8.461549758911133, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8699787855148315, "num_tokens": 741426956.0, "step": 19428 }, { "epoch": 2.471568502735021, "ewc_loss": 0.07184038311243057, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003619585186243057, "grad_norm": 8.46982192993164, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8662482500076294, "num_tokens": 741464820.0, "step": 19429 }, { "epoch": 2.4716957130136117, "ewc_loss": 0.07177398353815079, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036129451473243535, "grad_norm": 8.46102523803711, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.874149739742279, "num_tokens": 741499519.0, "step": 19430 }, { "epoch": 2.471822923292202, "ewc_loss": 0.07178087532520294, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003613634326029569, "grad_norm": 8.46796703338623, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8731018304824829, "num_tokens": 741533443.0, "step": 19431 }, { "epoch": 2.4719501335707923, "ewc_loss": 0.07199534773826599, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003610667772591114, "grad_norm": 8.406076431274414, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8640182018280029, "num_tokens": 741576849.0, "step": 19432 }, { "epoch": 2.4720773438493833, "ewc_loss": 0.07212918251752853, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036240508779883385, "grad_norm": 8.445345878601074, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8715336322784424, "num_tokens": 741614653.0, "step": 19433 }, { "epoch": 2.4722045541279734, "ewc_loss": 0.07200057804584503, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036111901863478124, "grad_norm": 8.419453620910645, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8617813587188721, "num_tokens": 741649354.0, "step": 19434 }, { "epoch": 2.472331764406564, "ewc_loss": 0.07207125425338745, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036182577605359256, "grad_norm": 8.459622383117676, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8610470294952393, "num_tokens": 741692257.0, "step": 19435 }, { "epoch": 2.4724589746851544, "ewc_loss": 0.07201120257377625, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003612252767197788, "grad_norm": 8.398691177368164, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8636687994003296, "num_tokens": 741729739.0, "step": 19436 }, { "epoch": 2.472586184963745, "ewc_loss": 0.07223479449748993, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036346123670227826, "grad_norm": 8.437030792236328, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8590192794799805, "num_tokens": 741763409.0, "step": 19437 }, { "epoch": 2.4727133952423355, "ewc_loss": 0.07197603583335876, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003608735860325396, "grad_norm": 8.405665397644043, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8638206720352173, "num_tokens": 741798746.0, "step": 19438 }, { "epoch": 2.472840605520926, "ewc_loss": 0.07221827656030655, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036329604336060584, "grad_norm": 8.410257339477539, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8756690621376038, "num_tokens": 741840488.0, "step": 19439 }, { "epoch": 2.4729678157995165, "ewc_loss": 0.07215295732021332, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036264286609366536, "grad_norm": 8.415989875793457, "learning_rate": 1e-06, "loss": 0.4054, "mean_token_accuracy": 0.8839409351348877, "num_tokens": 741880839.0, "step": 19440 }, { "epoch": 2.473095026078107, "ewc_loss": 0.07208190858364105, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003619323833845556, "grad_norm": 8.492249488830566, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8601056337356567, "num_tokens": 741919317.0, "step": 19441 }, { "epoch": 2.4732222363566976, "ewc_loss": 0.07193167507648468, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036043007276020944, "grad_norm": 8.426812171936035, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8448231220245361, "num_tokens": 741955782.0, "step": 19442 }, { "epoch": 2.473349446635288, "ewc_loss": 0.0720507949590683, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003616212052293122, "grad_norm": 8.443665504455566, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8601606488227844, "num_tokens": 741990410.0, "step": 19443 }, { "epoch": 2.4734766569138786, "ewc_loss": 0.07194210588932037, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003605343517847359, "grad_norm": 8.407089233398438, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8603442311286926, "num_tokens": 742031536.0, "step": 19444 }, { "epoch": 2.473603867192469, "ewc_loss": 0.07204660028219223, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003615792957134545, "grad_norm": 8.427807807922363, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8545848727226257, "num_tokens": 742067222.0, "step": 19445 }, { "epoch": 2.4737310774710597, "ewc_loss": 0.07194405794143677, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003605538804549724, "grad_norm": 8.354788780212402, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8778784275054932, "num_tokens": 742114740.0, "step": 19446 }, { "epoch": 2.47385828774965, "ewc_loss": 0.07211075723171234, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036222083144821227, "grad_norm": 8.429658889770508, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8723651170730591, "num_tokens": 742149344.0, "step": 19447 }, { "epoch": 2.4739854980282407, "ewc_loss": 0.0719052255153656, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003601655480451882, "grad_norm": 8.41413688659668, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8675777316093445, "num_tokens": 742191089.0, "step": 19448 }, { "epoch": 2.4741127083068313, "ewc_loss": 0.07192414999008179, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003603547520469874, "grad_norm": 8.424088478088379, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8666298985481262, "num_tokens": 742224944.0, "step": 19449 }, { "epoch": 2.474239918585422, "ewc_loss": 0.07191528379917145, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036026607267558575, "grad_norm": 8.450301170349121, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8565875887870789, "num_tokens": 742263674.0, "step": 19450 }, { "epoch": 2.4743671288640123, "ewc_loss": 0.07185153663158417, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003596286696847528, "grad_norm": 8.429902076721191, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8621020913124084, "num_tokens": 742300424.0, "step": 19451 }, { "epoch": 2.474494339142603, "ewc_loss": 0.07199341058731079, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003610474232118577, "grad_norm": 8.422542572021484, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8760332465171814, "num_tokens": 742333854.0, "step": 19452 }, { "epoch": 2.4746215494211934, "ewc_loss": 0.0719752386212349, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003608656697906554, "grad_norm": 8.44207763671875, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8748198747634888, "num_tokens": 742377126.0, "step": 19453 }, { "epoch": 2.474748759699784, "ewc_loss": 0.07197026908397675, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003608159313444048, "grad_norm": 8.400643348693848, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8642994165420532, "num_tokens": 742421284.0, "step": 19454 }, { "epoch": 2.4748759699783744, "ewc_loss": 0.07203324139118195, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003614456800278276, "grad_norm": 8.467280387878418, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.848405122756958, "num_tokens": 742459124.0, "step": 19455 }, { "epoch": 2.475003180256965, "ewc_loss": 0.07186552882194519, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003597685426939279, "grad_norm": 8.394187927246094, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8586679697036743, "num_tokens": 742493137.0, "step": 19456 }, { "epoch": 2.475130390535555, "ewc_loss": 0.0721435397863388, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036254871520213783, "grad_norm": 8.480828285217285, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8682504892349243, "num_tokens": 742529805.0, "step": 19457 }, { "epoch": 2.475257600814146, "ewc_loss": 0.07186029851436615, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003597162722144276, "grad_norm": 8.406561851501465, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8794772028923035, "num_tokens": 742560859.0, "step": 19458 }, { "epoch": 2.475384811092736, "ewc_loss": 0.07219332456588745, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003630465071182698, "grad_norm": 8.39332389831543, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.878250241279602, "num_tokens": 742601465.0, "step": 19459 }, { "epoch": 2.4755120213713266, "ewc_loss": 0.07203526794910431, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003614659362938255, "grad_norm": 8.422261238098145, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8595770597457886, "num_tokens": 742639242.0, "step": 19460 }, { "epoch": 2.475639231649917, "ewc_loss": 0.07214054465293884, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003625187382567674, "grad_norm": 8.488207817077637, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8662320375442505, "num_tokens": 742676574.0, "step": 19461 }, { "epoch": 2.4757664419285077, "ewc_loss": 0.0719820037484169, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003609333361964673, "grad_norm": 8.385149002075195, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8560395836830139, "num_tokens": 742713326.0, "step": 19462 }, { "epoch": 2.4758936522070982, "ewc_loss": 0.07216528058052063, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036276609171181917, "grad_norm": 8.44270133972168, "learning_rate": 1e-06, "loss": 0.5255, "mean_token_accuracy": 0.8431369066238403, "num_tokens": 742756883.0, "step": 19463 }, { "epoch": 2.4760208624856888, "ewc_loss": 0.07197681069374084, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036088141496293247, "grad_norm": 8.467720031738281, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8616229295730591, "num_tokens": 742794301.0, "step": 19464 }, { "epoch": 2.4761480727642793, "ewc_loss": 0.07207326591014862, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036184588680043817, "grad_norm": 8.433154106140137, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8689318299293518, "num_tokens": 742832966.0, "step": 19465 }, { "epoch": 2.47627528304287, "ewc_loss": 0.07206553965806961, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036176867433823645, "grad_norm": 8.42788028717041, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8624095320701599, "num_tokens": 742872926.0, "step": 19466 }, { "epoch": 2.4764024933214603, "ewc_loss": 0.0719836950302124, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036095018731430173, "grad_norm": 8.4345703125, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.872229814529419, "num_tokens": 742910838.0, "step": 19467 }, { "epoch": 2.476529703600051, "ewc_loss": 0.0720733106136322, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003618464106693864, "grad_norm": 8.428366661071777, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.874139130115509, "num_tokens": 742950098.0, "step": 19468 }, { "epoch": 2.4766569138786414, "ewc_loss": 0.07206451147794724, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003617584006860852, "grad_norm": 8.441757202148438, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8775194883346558, "num_tokens": 742983872.0, "step": 19469 }, { "epoch": 2.476784124157232, "ewc_loss": 0.07192839682102203, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036039724363945425, "grad_norm": 8.414692878723145, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.855292797088623, "num_tokens": 743029816.0, "step": 19470 }, { "epoch": 2.4769113344358225, "ewc_loss": 0.07194359600543976, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003605492820497602, "grad_norm": 8.426284790039062, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8579695224761963, "num_tokens": 743068565.0, "step": 19471 }, { "epoch": 2.477038544714413, "ewc_loss": 0.07201912999153137, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036130461376160383, "grad_norm": 8.42317008972168, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8686524033546448, "num_tokens": 743112137.0, "step": 19472 }, { "epoch": 2.4771657549930035, "ewc_loss": 0.0720207691192627, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000361320999218151, "grad_norm": 8.463645935058594, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8798011541366577, "num_tokens": 743145067.0, "step": 19473 }, { "epoch": 2.477292965271594, "ewc_loss": 0.0720691829919815, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036180514143779874, "grad_norm": 8.466354370117188, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8713964223861694, "num_tokens": 743177017.0, "step": 19474 }, { "epoch": 2.4774201755501846, "ewc_loss": 0.07198767364025116, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000360990030458197, "grad_norm": 8.451693534851074, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8665958046913147, "num_tokens": 743218121.0, "step": 19475 }, { "epoch": 2.477547385828775, "ewc_loss": 0.07204460352659225, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003615593013819307, "grad_norm": 8.47272777557373, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8712195158004761, "num_tokens": 743256635.0, "step": 19476 }, { "epoch": 2.4776745961073656, "ewc_loss": 0.07194672524929047, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003605805104598403, "grad_norm": 8.462303161621094, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8634061813354492, "num_tokens": 743293801.0, "step": 19477 }, { "epoch": 2.477801806385956, "ewc_loss": 0.0719211995601654, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036032532807439566, "grad_norm": 8.48996353149414, "learning_rate": 1e-06, "loss": 0.4179, "mean_token_accuracy": 0.8813591003417969, "num_tokens": 743331582.0, "step": 19478 }, { "epoch": 2.4779290166645467, "ewc_loss": 0.07194923609495163, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003606056561693549, "grad_norm": 8.463808059692383, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8685572147369385, "num_tokens": 743369901.0, "step": 19479 }, { "epoch": 2.4780562269431368, "ewc_loss": 0.07187193632125854, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003598326875362545, "grad_norm": 8.50209903717041, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8605761528015137, "num_tokens": 743405808.0, "step": 19480 }, { "epoch": 2.4781834372217277, "ewc_loss": 0.0715835765004158, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035939045483246446, "grad_norm": 8.416489601135254, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8680391311645508, "num_tokens": 743447236.0, "step": 19481 }, { "epoch": 2.478310647500318, "ewc_loss": 0.07159911096096039, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035954584018327296, "grad_norm": 8.44093132019043, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8726801872253418, "num_tokens": 743486494.0, "step": 19482 }, { "epoch": 2.4784378577789083, "ewc_loss": 0.07147697359323502, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003583244397304952, "grad_norm": 8.482196807861328, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8630968332290649, "num_tokens": 743523941.0, "step": 19483 }, { "epoch": 2.478565068057499, "ewc_loss": 0.07151004672050476, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003586551465559751, "grad_norm": 8.496000289916992, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8688176870346069, "num_tokens": 743554169.0, "step": 19484 }, { "epoch": 2.4786922783360894, "ewc_loss": 0.07175588607788086, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035867217229679227, "grad_norm": 8.454574584960938, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8504523038864136, "num_tokens": 743594656.0, "step": 19485 }, { "epoch": 2.47881948861468, "ewc_loss": 0.0714845210313797, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003583998477552086, "grad_norm": 8.443685531616211, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8735367059707642, "num_tokens": 743628638.0, "step": 19486 }, { "epoch": 2.4789466988932705, "ewc_loss": 0.0717267394065857, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035838072653859854, "grad_norm": 8.547395706176758, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8584665060043335, "num_tokens": 743665382.0, "step": 19487 }, { "epoch": 2.479073909171861, "ewc_loss": 0.07127971947193146, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003563518403097987, "grad_norm": 8.353718757629395, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8803142309188843, "num_tokens": 743701712.0, "step": 19488 }, { "epoch": 2.4792011194504515, "ewc_loss": 0.07204428315162659, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003615561290644109, "grad_norm": 8.497889518737793, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8656898140907288, "num_tokens": 743740347.0, "step": 19489 }, { "epoch": 2.479328329729042, "ewc_loss": 0.07145890593528748, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003557023883331567, "grad_norm": 8.429563522338867, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8677055835723877, "num_tokens": 743775434.0, "step": 19490 }, { "epoch": 2.4794555400076326, "ewc_loss": 0.07198571413755417, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036097041447646916, "grad_norm": 8.454237937927246, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8651179075241089, "num_tokens": 743812475.0, "step": 19491 }, { "epoch": 2.479582750286223, "ewc_loss": 0.07173220813274384, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003584353835321963, "grad_norm": 8.484161376953125, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8575311899185181, "num_tokens": 743851350.0, "step": 19492 }, { "epoch": 2.4797099605648136, "ewc_loss": 0.07176318764686584, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003587451938074082, "grad_norm": 8.479467391967773, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.86192387342453, "num_tokens": 743885086.0, "step": 19493 }, { "epoch": 2.479837170843404, "ewc_loss": 0.0717812180519104, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003589254629332572, "grad_norm": 8.387303352355957, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8695416450500488, "num_tokens": 743931747.0, "step": 19494 }, { "epoch": 2.4799643811219947, "ewc_loss": 0.07193085551261902, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036042180727235973, "grad_norm": 8.464903831481934, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8792576789855957, "num_tokens": 743971806.0, "step": 19495 }, { "epoch": 2.480091591400585, "ewc_loss": 0.07174518704414368, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003585651866160333, "grad_norm": 8.463165283203125, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8765449523925781, "num_tokens": 744005304.0, "step": 19496 }, { "epoch": 2.4802188016791757, "ewc_loss": 0.07192426174879074, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003603559162002057, "grad_norm": 8.428585052490234, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.8584750890731812, "num_tokens": 744051159.0, "step": 19497 }, { "epoch": 2.4803460119577663, "ewc_loss": 0.07182365655899048, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035934988409280777, "grad_norm": 8.533160209655762, "learning_rate": 1e-06, "loss": 0.5105, "mean_token_accuracy": 0.8490390181541443, "num_tokens": 744089856.0, "step": 19498 }, { "epoch": 2.480473222236357, "ewc_loss": 0.07169121503829956, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003580253978725523, "grad_norm": 8.347254753112793, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8459576368331909, "num_tokens": 744136355.0, "step": 19499 }, { "epoch": 2.4806004325149473, "ewc_loss": 0.07213877141475677, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003625009849201888, "grad_norm": 8.54610538482666, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8556068539619446, "num_tokens": 744177682.0, "step": 19500 }, { "epoch": 2.480727642793538, "ewc_loss": 0.07156646251678467, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035677789128385484, "grad_norm": 8.368120193481445, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8675297498703003, "num_tokens": 744214209.0, "step": 19501 }, { "epoch": 2.4808548530721284, "ewc_loss": 0.07203420996665955, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003638968337327242, "grad_norm": 8.538806915283203, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8493630290031433, "num_tokens": 744257923.0, "step": 19502 }, { "epoch": 2.480982063350719, "ewc_loss": 0.07143084704875946, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003578632022254169, "grad_norm": 8.43990707397461, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8717515468597412, "num_tokens": 744293370.0, "step": 19503 }, { "epoch": 2.4811092736293094, "ewc_loss": 0.0721079409122467, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036219265894033015, "grad_norm": 8.595648765563965, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8736239671707153, "num_tokens": 744328572.0, "step": 19504 }, { "epoch": 2.4812364839078995, "ewc_loss": 0.07150706648826599, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035618391120806336, "grad_norm": 8.409672737121582, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.850304901599884, "num_tokens": 744364124.0, "step": 19505 }, { "epoch": 2.4813636941864905, "ewc_loss": 0.07224631309509277, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003635763714555651, "grad_norm": 8.603874206542969, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8698195815086365, "num_tokens": 744403679.0, "step": 19506 }, { "epoch": 2.4814909044650806, "ewc_loss": 0.07153559476137161, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003564692160580307, "grad_norm": 8.468588829040527, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.865939736366272, "num_tokens": 744440922.0, "step": 19507 }, { "epoch": 2.481618114743671, "ewc_loss": 0.07201679050922394, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036128118517808616, "grad_norm": 8.585789680480957, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8626096248626709, "num_tokens": 744479025.0, "step": 19508 }, { "epoch": 2.4817453250222616, "ewc_loss": 0.07155530154705048, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035666627809405327, "grad_norm": 8.440074920654297, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8673321604728699, "num_tokens": 744514165.0, "step": 19509 }, { "epoch": 2.481872535300852, "ewc_loss": 0.07188374549150467, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035995073267258704, "grad_norm": 8.58385181427002, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8619807958602905, "num_tokens": 744547210.0, "step": 19510 }, { "epoch": 2.4819997455794427, "ewc_loss": 0.07152409851551056, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035635431413538754, "grad_norm": 8.497693061828613, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8658009171485901, "num_tokens": 744585132.0, "step": 19511 }, { "epoch": 2.4821269558580332, "ewc_loss": 0.07170768082141876, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035819009644910693, "grad_norm": 8.481307029724121, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8772493600845337, "num_tokens": 744617921.0, "step": 19512 }, { "epoch": 2.4822541661366238, "ewc_loss": 0.07167968153953552, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003579101467039436, "grad_norm": 8.524574279785156, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8624253869056702, "num_tokens": 744653095.0, "step": 19513 }, { "epoch": 2.4823813764152143, "ewc_loss": 0.07156555354595184, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035676881088875234, "grad_norm": 8.536853790283203, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8716963529586792, "num_tokens": 744688258.0, "step": 19514 }, { "epoch": 2.482508586693805, "ewc_loss": 0.07170853018760681, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003581985947676003, "grad_norm": 8.558307647705078, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8656558990478516, "num_tokens": 744723500.0, "step": 19515 }, { "epoch": 2.4826357969723953, "ewc_loss": 0.07142731547355652, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035538640804588795, "grad_norm": 8.430228233337402, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8670099973678589, "num_tokens": 744757133.0, "step": 19516 }, { "epoch": 2.482763007250986, "ewc_loss": 0.07171005010604858, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003582137287594378, "grad_norm": 8.487637519836426, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8776751160621643, "num_tokens": 744793814.0, "step": 19517 }, { "epoch": 2.4828902175295764, "ewc_loss": 0.07150617241859436, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035617497633211315, "grad_norm": 8.4917573928833, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8701232075691223, "num_tokens": 744828680.0, "step": 19518 }, { "epoch": 2.483017427808167, "ewc_loss": 0.07165417820215225, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003576550807338208, "grad_norm": 8.527449607849121, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.861850380897522, "num_tokens": 744861642.0, "step": 19519 }, { "epoch": 2.4831446380867574, "ewc_loss": 0.07160088419914246, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003571221313904971, "grad_norm": 8.49631118774414, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8774653673171997, "num_tokens": 744900345.0, "step": 19520 }, { "epoch": 2.483271848365348, "ewc_loss": 0.071610227227211, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035721558379009366, "grad_norm": 8.44223403930664, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8733616471290588, "num_tokens": 744938589.0, "step": 19521 }, { "epoch": 2.4833990586439385, "ewc_loss": 0.07157108187675476, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035682410816662014, "grad_norm": 8.444117546081543, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8563479781150818, "num_tokens": 744975385.0, "step": 19522 }, { "epoch": 2.483526268922529, "ewc_loss": 0.07146257907152176, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035818046308122575, "grad_norm": 8.541512489318848, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.85560142993927, "num_tokens": 745010860.0, "step": 19523 }, { "epoch": 2.4836534792011196, "ewc_loss": 0.07152539491653442, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003563671780284494, "grad_norm": 8.447431564331055, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8576257228851318, "num_tokens": 745047183.0, "step": 19524 }, { "epoch": 2.48378068947971, "ewc_loss": 0.07176485657691956, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003587618994060904, "grad_norm": 8.493273735046387, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8635367155075073, "num_tokens": 745081906.0, "step": 19525 }, { "epoch": 2.4839078997583006, "ewc_loss": 0.0716429352760315, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000357542623532936, "grad_norm": 8.458171844482422, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8711773157119751, "num_tokens": 745127285.0, "step": 19526 }, { "epoch": 2.484035110036891, "ewc_loss": 0.07172207534313202, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003583340148907155, "grad_norm": 8.42423152923584, "learning_rate": 1e-06, "loss": 0.4119, "mean_token_accuracy": 0.882224977016449, "num_tokens": 745171564.0, "step": 19527 }, { "epoch": 2.4841623203154817, "ewc_loss": 0.0717516615986824, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035862988443113863, "grad_norm": 8.488943099975586, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8783640265464783, "num_tokens": 745214252.0, "step": 19528 }, { "epoch": 2.484289530594072, "ewc_loss": 0.07138083875179291, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035736305289901793, "grad_norm": 8.430627822875977, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.8687329292297363, "num_tokens": 745251983.0, "step": 19529 }, { "epoch": 2.4844167408726623, "ewc_loss": 0.07182785868644714, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003593918227124959, "grad_norm": 8.515804290771484, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8678032755851746, "num_tokens": 745287037.0, "step": 19530 }, { "epoch": 2.4845439511512533, "ewc_loss": 0.07154597342014313, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000356572971213609, "grad_norm": 8.434117317199707, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8633955717086792, "num_tokens": 745327854.0, "step": 19531 }, { "epoch": 2.4846711614298433, "ewc_loss": 0.0719127506017685, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003602408105507493, "grad_norm": 8.50474739074707, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8542990684509277, "num_tokens": 745370227.0, "step": 19532 }, { "epoch": 2.484798371708434, "ewc_loss": 0.07157252728939056, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035683857277035713, "grad_norm": 8.425251007080078, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8646105527877808, "num_tokens": 745409416.0, "step": 19533 }, { "epoch": 2.4849255819870244, "ewc_loss": 0.07195161283016205, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003606294631026685, "grad_norm": 8.527106285095215, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8474249839782715, "num_tokens": 745449908.0, "step": 19534 }, { "epoch": 2.485052792265615, "ewc_loss": 0.07161447405815125, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035725804627873003, "grad_norm": 8.47240161895752, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8557693958282471, "num_tokens": 745486384.0, "step": 19535 }, { "epoch": 2.4851800025442055, "ewc_loss": 0.07184499502182007, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035956327337771654, "grad_norm": 8.516326904296875, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.856171727180481, "num_tokens": 745523058.0, "step": 19536 }, { "epoch": 2.485307212822796, "ewc_loss": 0.07172814756631851, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003583947545848787, "grad_norm": 8.43435287475586, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8480437994003296, "num_tokens": 745559276.0, "step": 19537 }, { "epoch": 2.4854344231013865, "ewc_loss": 0.07160016894340515, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035955640487372875, "grad_norm": 8.478974342346191, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8647865653038025, "num_tokens": 745592726.0, "step": 19538 }, { "epoch": 2.485561633379977, "ewc_loss": 0.0715956911444664, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003570701810531318, "grad_norm": 8.425585746765137, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8682459592819214, "num_tokens": 745630117.0, "step": 19539 }, { "epoch": 2.4856888436585676, "ewc_loss": 0.07185570150613785, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003621116920839995, "grad_norm": 8.542440414428711, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8522122502326965, "num_tokens": 745664968.0, "step": 19540 }, { "epoch": 2.485816053937158, "ewc_loss": 0.07127387821674347, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035629348712973297, "grad_norm": 8.389240264892578, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8669857978820801, "num_tokens": 745701893.0, "step": 19541 }, { "epoch": 2.4859432642157486, "ewc_loss": 0.07198429107666016, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036095624091103673, "grad_norm": 8.485709190368652, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8610367178916931, "num_tokens": 745736884.0, "step": 19542 }, { "epoch": 2.486070474494339, "ewc_loss": 0.07168431580066681, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035795639269053936, "grad_norm": 8.414327621459961, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8771629333496094, "num_tokens": 745782381.0, "step": 19543 }, { "epoch": 2.4861976847729297, "ewc_loss": 0.07199367135763168, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036104998434893787, "grad_norm": 8.510432243347168, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.879962682723999, "num_tokens": 745827630.0, "step": 19544 }, { "epoch": 2.48632489505152, "ewc_loss": 0.07162081450223923, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003573214344214648, "grad_norm": 8.447033882141113, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8739029169082642, "num_tokens": 745862223.0, "step": 19545 }, { "epoch": 2.4864521053301107, "ewc_loss": 0.0719854086637497, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003609674167819321, "grad_norm": 8.495567321777344, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8823084831237793, "num_tokens": 745903077.0, "step": 19546 }, { "epoch": 2.4865793156087013, "ewc_loss": 0.07156737148761749, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003567870007827878, "grad_norm": 8.394185066223145, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8664113283157349, "num_tokens": 745945534.0, "step": 19547 }, { "epoch": 2.486706525887292, "ewc_loss": 0.07201121747493744, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036122542223893106, "grad_norm": 8.575876235961914, "learning_rate": 1e-06, "loss": 0.5101, "mean_token_accuracy": 0.8492547273635864, "num_tokens": 745986308.0, "step": 19548 }, { "epoch": 2.4868337361658823, "ewc_loss": 0.07160242646932602, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003571375273168087, "grad_norm": 8.459156036376953, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8565607070922852, "num_tokens": 746022206.0, "step": 19549 }, { "epoch": 2.486960946444473, "ewc_loss": 0.07206611335277557, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003617744368966669, "grad_norm": 8.488907814025879, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8733419179916382, "num_tokens": 746059854.0, "step": 19550 }, { "epoch": 2.4870881567230634, "ewc_loss": 0.07175806164741516, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003586939419619739, "grad_norm": 8.45496940612793, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8736931085586548, "num_tokens": 746103994.0, "step": 19551 }, { "epoch": 2.487215367001654, "ewc_loss": 0.07193122804164886, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003604255907703191, "grad_norm": 8.500483512878418, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8559908866882324, "num_tokens": 746138491.0, "step": 19552 }, { "epoch": 2.4873425772802444, "ewc_loss": 0.07182343304157257, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035934761399403214, "grad_norm": 8.522421836853027, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8643038272857666, "num_tokens": 746172598.0, "step": 19553 }, { "epoch": 2.487469787558835, "ewc_loss": 0.0718277096748352, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000359390425728634, "grad_norm": 8.406087875366211, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8742226362228394, "num_tokens": 746210385.0, "step": 19554 }, { "epoch": 2.487596997837425, "ewc_loss": 0.07205513119697571, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036166454083286226, "grad_norm": 8.54161548614502, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8682605028152466, "num_tokens": 746252563.0, "step": 19555 }, { "epoch": 2.487724208116016, "ewc_loss": 0.07173050940036774, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003584183577913791, "grad_norm": 8.405888557434082, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8714591264724731, "num_tokens": 746292356.0, "step": 19556 }, { "epoch": 2.487851418394606, "ewc_loss": 0.07209806144237518, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036209390964359045, "grad_norm": 8.548989295959473, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8549180030822754, "num_tokens": 746325467.0, "step": 19557 }, { "epoch": 2.4879786286731966, "ewc_loss": 0.07154585421085358, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035657177795656025, "grad_norm": 8.39097785949707, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8607590198516846, "num_tokens": 746362285.0, "step": 19558 }, { "epoch": 2.488105838951787, "ewc_loss": 0.07214301824569702, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003625434765126556, "grad_norm": 8.524162292480469, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8741956353187561, "num_tokens": 746401540.0, "step": 19559 }, { "epoch": 2.4882330492303777, "ewc_loss": 0.07126976549625397, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003562523634172976, "grad_norm": 8.392260551452637, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.867892861366272, "num_tokens": 746439282.0, "step": 19560 }, { "epoch": 2.488360259508968, "ewc_loss": 0.07208593934774399, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036197269218973815, "grad_norm": 8.522536277770996, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8775765895843506, "num_tokens": 746480993.0, "step": 19561 }, { "epoch": 2.4884874697875587, "ewc_loss": 0.07166394591331482, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035775272408500314, "grad_norm": 8.456768989562988, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8635236620903015, "num_tokens": 746519318.0, "step": 19562 }, { "epoch": 2.4886146800661493, "ewc_loss": 0.07203491032123566, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036146241473034024, "grad_norm": 8.540609359741211, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8519349098205566, "num_tokens": 746559761.0, "step": 19563 }, { "epoch": 2.48874189034474, "ewc_loss": 0.07168596237897873, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035797289456240833, "grad_norm": 8.509982109069824, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8737962245941162, "num_tokens": 746591622.0, "step": 19564 }, { "epoch": 2.4888691006233303, "ewc_loss": 0.07195137441158295, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036062707658857107, "grad_norm": 8.571568489074707, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8635683655738831, "num_tokens": 746627709.0, "step": 19565 }, { "epoch": 2.488996310901921, "ewc_loss": 0.07163900136947632, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035750330425798893, "grad_norm": 8.44735050201416, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8640086650848389, "num_tokens": 746663445.0, "step": 19566 }, { "epoch": 2.4891235211805114, "ewc_loss": 0.07192641496658325, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036037745303474367, "grad_norm": 8.524462699890137, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.8539561629295349, "num_tokens": 746706869.0, "step": 19567 }, { "epoch": 2.489250731459102, "ewc_loss": 0.0717214047908783, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035832737921737134, "grad_norm": 8.513378143310547, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8777973651885986, "num_tokens": 746749502.0, "step": 19568 }, { "epoch": 2.4893779417376924, "ewc_loss": 0.07177090644836426, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000358822348061949, "grad_norm": 8.472387313842773, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8708847165107727, "num_tokens": 746786510.0, "step": 19569 }, { "epoch": 2.489505152016283, "ewc_loss": 0.07190360128879547, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003601492499001324, "grad_norm": 8.670807838439941, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8711068630218506, "num_tokens": 746819430.0, "step": 19570 }, { "epoch": 2.4896323622948735, "ewc_loss": 0.07151515036821365, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003562647907529026, "grad_norm": 8.353949546813965, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8695130348205566, "num_tokens": 746856018.0, "step": 19571 }, { "epoch": 2.489759572573464, "ewc_loss": 0.07232087850570679, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003643220697995275, "grad_norm": 8.652647018432617, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8600979447364807, "num_tokens": 746897350.0, "step": 19572 }, { "epoch": 2.4898867828520546, "ewc_loss": 0.07135967910289764, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035471003502607346, "grad_norm": 8.378514289855957, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8586245179176331, "num_tokens": 746937163.0, "step": 19573 }, { "epoch": 2.490013993130645, "ewc_loss": 0.07225745916366577, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003636878391262144, "grad_norm": 8.546248435974121, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8757802248001099, "num_tokens": 746976140.0, "step": 19574 }, { "epoch": 2.4901412034092356, "ewc_loss": 0.0715101808309555, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035621505230665207, "grad_norm": 8.403407096862793, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8659517168998718, "num_tokens": 747012576.0, "step": 19575 }, { "epoch": 2.490268413687826, "ewc_loss": 0.07213373482227325, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003624506061896682, "grad_norm": 8.599027633666992, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.86528080701828, "num_tokens": 747048179.0, "step": 19576 }, { "epoch": 2.4903956239664167, "ewc_loss": 0.07153688371181488, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003564821381587535, "grad_norm": 8.424616813659668, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8485844731330872, "num_tokens": 747085291.0, "step": 19577 }, { "epoch": 2.4905228342450068, "ewc_loss": 0.07204219698905945, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003615352907218039, "grad_norm": 8.543526649475098, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8730159997940063, "num_tokens": 747119085.0, "step": 19578 }, { "epoch": 2.4906500445235977, "ewc_loss": 0.071523517370224, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003563484060578048, "grad_norm": 8.472284317016602, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8658930063247681, "num_tokens": 747153463.0, "step": 19579 }, { "epoch": 2.490777254802188, "ewc_loss": 0.07181638479232788, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003592771536204964, "grad_norm": 8.495675086975098, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8410547971725464, "num_tokens": 747194781.0, "step": 19580 }, { "epoch": 2.4909044650807783, "ewc_loss": 0.07172522693872452, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003583655634429306, "grad_norm": 8.476127624511719, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8678674101829529, "num_tokens": 747235328.0, "step": 19581 }, { "epoch": 2.491031675359369, "ewc_loss": 0.07168978452682495, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000358011107891798, "grad_norm": 9.070984840393066, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8626258373260498, "num_tokens": 747278481.0, "step": 19582 }, { "epoch": 2.4911588856379594, "ewc_loss": 0.07092395424842834, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003503528714645654, "grad_norm": 8.288555145263672, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.8814889788627625, "num_tokens": 747313077.0, "step": 19583 }, { "epoch": 2.49128609591655, "ewc_loss": 0.07262727618217468, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003673860337585211, "grad_norm": 8.65343189239502, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8720747232437134, "num_tokens": 747352693.0, "step": 19584 }, { "epoch": 2.4914133061951405, "ewc_loss": 0.07100832462310791, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035119650419801474, "grad_norm": 8.26225757598877, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8573324680328369, "num_tokens": 747397717.0, "step": 19585 }, { "epoch": 2.491540516473731, "ewc_loss": 0.07265457510948181, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036765902768820524, "grad_norm": 8.669052124023438, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8692160248756409, "num_tokens": 747435457.0, "step": 19586 }, { "epoch": 2.4916677267523215, "ewc_loss": 0.07138107717037201, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035492400638759136, "grad_norm": 8.350861549377441, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8698962926864624, "num_tokens": 747479017.0, "step": 19587 }, { "epoch": 2.491794937030912, "ewc_loss": 0.07250619679689407, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036617525620386004, "grad_norm": 8.609389305114746, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8643490076065063, "num_tokens": 747520247.0, "step": 19588 }, { "epoch": 2.4919221473095026, "ewc_loss": 0.07142705470323563, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003578252508305013, "grad_norm": 8.41500186920166, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8521252274513245, "num_tokens": 747559983.0, "step": 19589 }, { "epoch": 2.492049357588093, "ewc_loss": 0.07199238985776901, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036347858258523047, "grad_norm": 8.563034057617188, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8619668483734131, "num_tokens": 747595746.0, "step": 19590 }, { "epoch": 2.4921765678666836, "ewc_loss": 0.07148375362157822, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035839222255162895, "grad_norm": 8.403103828430176, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8628455400466919, "num_tokens": 747639261.0, "step": 19591 }, { "epoch": 2.492303778145274, "ewc_loss": 0.07211502641439438, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003622635267674923, "grad_norm": 8.591338157653809, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8473268747329712, "num_tokens": 747680665.0, "step": 19592 }, { "epoch": 2.4924309884238647, "ewc_loss": 0.0714750587940216, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00035830523120239377, "grad_norm": 8.469910621643066, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8701087236404419, "num_tokens": 747713320.0, "step": 19593 }, { "epoch": 2.492558198702455, "ewc_loss": 0.07180260121822357, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003615807509049773, "grad_norm": 9.164987564086914, "learning_rate": 1e-06, "loss": 0.5576, "mean_token_accuracy": 0.8401491641998291, "num_tokens": 747752167.0, "step": 19594 }, { "epoch": 2.4926854089810457, "ewc_loss": 0.07115684449672699, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003526817017700523, "grad_norm": 8.321096420288086, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8709924817085266, "num_tokens": 747787274.0, "step": 19595 }, { "epoch": 2.4928126192596363, "ewc_loss": 0.07286880910396576, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036980138975195587, "grad_norm": 8.668405532836914, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8599352836608887, "num_tokens": 747826647.0, "step": 19596 }, { "epoch": 2.492939829538227, "ewc_loss": 0.07122724503278732, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003533857234288007, "grad_norm": 8.314704895019531, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8616892099380493, "num_tokens": 747866797.0, "step": 19597 }, { "epoch": 2.4930670398168173, "ewc_loss": 0.0728895291686058, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00037000857992097735, "grad_norm": 8.718395233154297, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8667181134223938, "num_tokens": 747907589.0, "step": 19598 }, { "epoch": 2.493194250095408, "ewc_loss": 0.07150526344776154, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003561659250408411, "grad_norm": 8.376004219055176, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8582440614700317, "num_tokens": 747939865.0, "step": 19599 }, { "epoch": 2.4933214603739984, "ewc_loss": 0.07243570685386658, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003679117653518915, "grad_norm": 8.691100120544434, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8630623817443848, "num_tokens": 747976238.0, "step": 19600 }, { "epoch": 2.493448670652589, "ewc_loss": 0.07163846492767334, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000357497890945524, "grad_norm": 8.433537483215332, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8637503385543823, "num_tokens": 748012076.0, "step": 19601 }, { "epoch": 2.4935758809311794, "ewc_loss": 0.07244653254747391, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036557859857566655, "grad_norm": 8.704255104064941, "learning_rate": 1e-06, "loss": 0.4864, "mean_token_accuracy": 0.8588796257972717, "num_tokens": 748042518.0, "step": 19602 }, { "epoch": 2.4937030912097695, "ewc_loss": 0.07157567143440247, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003568700049072504, "grad_norm": 8.455155372619629, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8730765581130981, "num_tokens": 748077509.0, "step": 19603 }, { "epoch": 2.4938303014883605, "ewc_loss": 0.07204742729663849, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036158753209747374, "grad_norm": 8.564327239990234, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8708503842353821, "num_tokens": 748118164.0, "step": 19604 }, { "epoch": 2.4939575117669506, "ewc_loss": 0.07171779870986938, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035829131957143545, "grad_norm": 8.496964454650879, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.857587456703186, "num_tokens": 748153609.0, "step": 19605 }, { "epoch": 2.494084722045541, "ewc_loss": 0.0719003677368164, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036011694464832544, "grad_norm": 8.519817352294922, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.863936185836792, "num_tokens": 748189573.0, "step": 19606 }, { "epoch": 2.4942119323241316, "ewc_loss": 0.07188308238983154, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035994412610307336, "grad_norm": 8.44148063659668, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.860680103302002, "num_tokens": 748231121.0, "step": 19607 }, { "epoch": 2.494339142602722, "ewc_loss": 0.0718856006860733, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003599692427087575, "grad_norm": 8.455955505371094, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8755604028701782, "num_tokens": 748268744.0, "step": 19608 }, { "epoch": 2.4944663528813127, "ewc_loss": 0.07190104573965073, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003601237549446523, "grad_norm": 8.495302200317383, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8697920441627502, "num_tokens": 748307441.0, "step": 19609 }, { "epoch": 2.494593563159903, "ewc_loss": 0.07197864353656769, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036089966306462884, "grad_norm": 8.478128433227539, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8651113510131836, "num_tokens": 748343506.0, "step": 19610 }, { "epoch": 2.4947207734384937, "ewc_loss": 0.07198905944824219, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003610038256738335, "grad_norm": 8.461931228637695, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8671817779541016, "num_tokens": 748382209.0, "step": 19611 }, { "epoch": 2.4948479837170843, "ewc_loss": 0.07183457911014557, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003594591107685119, "grad_norm": 8.485569953918457, "learning_rate": 1e-06, "loss": 0.3918, "mean_token_accuracy": 0.8866062164306641, "num_tokens": 748413660.0, "step": 19612 }, { "epoch": 2.494975193995675, "ewc_loss": 0.07193780690431595, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003604913654271513, "grad_norm": 8.478804588317871, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8665804862976074, "num_tokens": 748457037.0, "step": 19613 }, { "epoch": 2.4951024042742653, "ewc_loss": 0.07208310067653656, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003619442868512124, "grad_norm": 8.530547142028809, "learning_rate": 1e-06, "loss": 0.5364, "mean_token_accuracy": 0.8435971736907959, "num_tokens": 748497227.0, "step": 19614 }, { "epoch": 2.495229614552856, "ewc_loss": 0.07192246615886688, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036033798824064434, "grad_norm": 8.537772178649902, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8628625869750977, "num_tokens": 748529901.0, "step": 19615 }, { "epoch": 2.4953568248314464, "ewc_loss": 0.07194149494171143, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036052821087650955, "grad_norm": 8.495903968811035, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8528379201889038, "num_tokens": 748568238.0, "step": 19616 }, { "epoch": 2.495484035110037, "ewc_loss": 0.07183124870061874, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035942578688263893, "grad_norm": 8.46386432647705, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8542956113815308, "num_tokens": 748608153.0, "step": 19617 }, { "epoch": 2.4956112453886274, "ewc_loss": 0.07198193669319153, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003609326668083668, "grad_norm": 8.490331649780273, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8605990409851074, "num_tokens": 748646977.0, "step": 19618 }, { "epoch": 2.495738455667218, "ewc_loss": 0.07180102914571762, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003591235727071762, "grad_norm": 8.472455978393555, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8562372922897339, "num_tokens": 748689586.0, "step": 19619 }, { "epoch": 2.4958656659458085, "ewc_loss": 0.0718604177236557, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003597174654714763, "grad_norm": 8.470110893249512, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8641928434371948, "num_tokens": 748730915.0, "step": 19620 }, { "epoch": 2.495992876224399, "ewc_loss": 0.07203906774520874, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036150391679257154, "grad_norm": 8.567378997802734, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8612650632858276, "num_tokens": 748766996.0, "step": 19621 }, { "epoch": 2.4961200865029896, "ewc_loss": 0.07186666131019592, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035977992229163647, "grad_norm": 8.484167098999023, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.857759952545166, "num_tokens": 748801225.0, "step": 19622 }, { "epoch": 2.49624729678158, "ewc_loss": 0.07189540565013885, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003600673808250576, "grad_norm": 8.4672212600708, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8599393367767334, "num_tokens": 748841355.0, "step": 19623 }, { "epoch": 2.4963745070601706, "ewc_loss": 0.07189314067363739, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036004470894113183, "grad_norm": 8.455336570739746, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8639598488807678, "num_tokens": 748879636.0, "step": 19624 }, { "epoch": 2.496501717338761, "ewc_loss": 0.07185974717140198, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003597108006943017, "grad_norm": 8.455803871154785, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8612117171287537, "num_tokens": 748917795.0, "step": 19625 }, { "epoch": 2.4966289276173517, "ewc_loss": 0.07186713069677353, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035978457890450954, "grad_norm": 8.466769218444824, "learning_rate": 1e-06, "loss": 0.556, "mean_token_accuracy": 0.8365433216094971, "num_tokens": 748956512.0, "step": 19626 }, { "epoch": 2.496756137895942, "ewc_loss": 0.0719294399023056, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036040766281075776, "grad_norm": 8.497058868408203, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8683231472969055, "num_tokens": 748995577.0, "step": 19627 }, { "epoch": 2.4968833481745323, "ewc_loss": 0.07195977121591568, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003607110120356083, "grad_norm": 8.542061805725098, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8603677153587341, "num_tokens": 749027672.0, "step": 19628 }, { "epoch": 2.4970105584531233, "ewc_loss": 0.07178579270839691, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003589712141547352, "grad_norm": 8.442008972167969, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8752729892730713, "num_tokens": 749059101.0, "step": 19629 }, { "epoch": 2.4971377687317133, "ewc_loss": 0.07204755395650864, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036158881266601384, "grad_norm": 8.598638534545898, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8502676486968994, "num_tokens": 749097034.0, "step": 19630 }, { "epoch": 2.497264979010304, "ewc_loss": 0.07164883613586426, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003576016752049327, "grad_norm": 8.368377685546875, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8625907897949219, "num_tokens": 749137536.0, "step": 19631 }, { "epoch": 2.4973921892888944, "ewc_loss": 0.07226279377937317, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003637412446551025, "grad_norm": 8.549589157104492, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8646771907806396, "num_tokens": 749171487.0, "step": 19632 }, { "epoch": 2.497519399567485, "ewc_loss": 0.07162262499332428, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003573395370040089, "grad_norm": 8.460765838623047, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8574166893959045, "num_tokens": 749213234.0, "step": 19633 }, { "epoch": 2.4976466098460754, "ewc_loss": 0.0720280259847641, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036139358417131007, "grad_norm": 8.574665069580078, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8536357879638672, "num_tokens": 749256184.0, "step": 19634 }, { "epoch": 2.497773820124666, "ewc_loss": 0.07167254388332367, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035783875500783324, "grad_norm": 8.487455368041992, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8619688749313354, "num_tokens": 749292909.0, "step": 19635 }, { "epoch": 2.4979010304032565, "ewc_loss": 0.07183410227298737, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035945430863648653, "grad_norm": 8.516050338745117, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.8517177104949951, "num_tokens": 749327947.0, "step": 19636 }, { "epoch": 2.498028240681847, "ewc_loss": 0.07173440605401993, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035845732782036066, "grad_norm": 8.558627128601074, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8723872900009155, "num_tokens": 749364107.0, "step": 19637 }, { "epoch": 2.4981554509604376, "ewc_loss": 0.0716443583369255, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003575568553060293, "grad_norm": 8.447871208190918, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.879196286201477, "num_tokens": 749398055.0, "step": 19638 }, { "epoch": 2.498282661239028, "ewc_loss": 0.07186547666788101, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003597680479288101, "grad_norm": 8.49583625793457, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8641775846481323, "num_tokens": 749434626.0, "step": 19639 }, { "epoch": 2.4984098715176186, "ewc_loss": 0.0717017650604248, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003581309283617884, "grad_norm": 8.446455001831055, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8578575253486633, "num_tokens": 749470616.0, "step": 19640 }, { "epoch": 2.498537081796209, "ewc_loss": 0.07179967314004898, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035911001032218337, "grad_norm": 8.522235870361328, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.8678957223892212, "num_tokens": 749513072.0, "step": 19641 }, { "epoch": 2.4986642920747997, "ewc_loss": 0.07164108753204346, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035752414260059595, "grad_norm": 8.460575103759766, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8773249387741089, "num_tokens": 749546988.0, "step": 19642 }, { "epoch": 2.49879150235339, "ewc_loss": 0.07190239429473877, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036013720091432333, "grad_norm": 8.488471031188965, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.856548547744751, "num_tokens": 749585851.0, "step": 19643 }, { "epoch": 2.4989187126319807, "ewc_loss": 0.07173872739076614, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003585005470085889, "grad_norm": 8.462905883789062, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8624305129051208, "num_tokens": 749619510.0, "step": 19644 }, { "epoch": 2.4990459229105713, "ewc_loss": 0.0718126893043518, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035924019175581634, "grad_norm": 8.496859550476074, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8709264397621155, "num_tokens": 749653660.0, "step": 19645 }, { "epoch": 2.499173133189162, "ewc_loss": 0.07180696725845337, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003591829154174775, "grad_norm": 8.441725730895996, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8646430969238281, "num_tokens": 749691321.0, "step": 19646 }, { "epoch": 2.4993003434677523, "ewc_loss": 0.07191447913646698, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036025812732987106, "grad_norm": 8.478776931762695, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8649818897247314, "num_tokens": 749729305.0, "step": 19647 }, { "epoch": 2.499427553746343, "ewc_loss": 0.0717138797044754, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003582520585041493, "grad_norm": 8.43451976776123, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8705794811248779, "num_tokens": 749766134.0, "step": 19648 }, { "epoch": 2.4995547640249334, "ewc_loss": 0.07200319319963455, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003611452120821923, "grad_norm": 8.489156723022461, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8611811399459839, "num_tokens": 749807715.0, "step": 19649 }, { "epoch": 2.499681974303524, "ewc_loss": 0.07168959826231003, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035800927435047925, "grad_norm": 8.441349029541016, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8650895357131958, "num_tokens": 749838799.0, "step": 19650 }, { "epoch": 2.499809184582114, "ewc_loss": 0.0720498263835907, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036161154275760055, "grad_norm": 8.500861167907715, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8765988349914551, "num_tokens": 749874283.0, "step": 19651 }, { "epoch": 2.499936394860705, "ewc_loss": 0.07182101160287857, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035932339960709214, "grad_norm": 8.435510635375977, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8629100322723389, "num_tokens": 749915912.0, "step": 19652 }, { "epoch": 2.500063605139295, "ewc_loss": 0.07200908660888672, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036120411823503673, "grad_norm": 8.491418838500977, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8710711598396301, "num_tokens": 749959133.0, "step": 19653 }, { "epoch": 2.500190815417886, "ewc_loss": 0.07185488939285278, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035966222640126944, "grad_norm": 8.432806968688965, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8620947003364563, "num_tokens": 749997846.0, "step": 19654 }, { "epoch": 2.500318025696476, "ewc_loss": 0.07212992757558823, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036241253837943077, "grad_norm": 8.449462890625, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8689932823181152, "num_tokens": 750040973.0, "step": 19655 }, { "epoch": 2.5004452359750666, "ewc_loss": 0.07180298864841461, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035914313048124313, "grad_norm": 8.422113418579102, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8609066009521484, "num_tokens": 750082171.0, "step": 19656 }, { "epoch": 2.500572446253657, "ewc_loss": 0.07207131385803223, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036182644544169307, "grad_norm": 8.516345977783203, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8757628798484802, "num_tokens": 750123083.0, "step": 19657 }, { "epoch": 2.5006996565322477, "ewc_loss": 0.07188259065151215, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035993914934806526, "grad_norm": 8.44985580444336, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8507645726203918, "num_tokens": 750162235.0, "step": 19658 }, { "epoch": 2.500826866810838, "ewc_loss": 0.07203761488199234, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003614894230850041, "grad_norm": 8.50310230255127, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8766301870346069, "num_tokens": 750196501.0, "step": 19659 }, { "epoch": 2.5009540770894287, "ewc_loss": 0.07180476933717728, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003591609711293131, "grad_norm": 8.497170448303223, "learning_rate": 1e-06, "loss": 0.5662, "mean_token_accuracy": 0.8366950154304504, "num_tokens": 750241178.0, "step": 19660 }, { "epoch": 2.5010812873680193, "ewc_loss": 0.0719236508011818, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036034980439580977, "grad_norm": 8.47894287109375, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8692170977592468, "num_tokens": 750284104.0, "step": 19661 }, { "epoch": 2.50120849764661, "ewc_loss": 0.07207439839839935, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003618572955019772, "grad_norm": 8.481782913208008, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8710017204284668, "num_tokens": 750318525.0, "step": 19662 }, { "epoch": 2.5013357079252003, "ewc_loss": 0.0719200000166893, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036031327908858657, "grad_norm": 8.462579727172852, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.873116135597229, "num_tokens": 750358892.0, "step": 19663 }, { "epoch": 2.501462918203791, "ewc_loss": 0.07206429541110992, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003617562470026314, "grad_norm": 8.532363891601562, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8513774871826172, "num_tokens": 750399325.0, "step": 19664 }, { "epoch": 2.5015901284823814, "ewc_loss": 0.07190048694610596, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036011813790537417, "grad_norm": 8.507231712341309, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8592175245285034, "num_tokens": 750437493.0, "step": 19665 }, { "epoch": 2.501717338760972, "ewc_loss": 0.07196317613124847, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003607450344134122, "grad_norm": 8.56972599029541, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8720818758010864, "num_tokens": 750472381.0, "step": 19666 }, { "epoch": 2.5018445490395624, "ewc_loss": 0.07168629765510559, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003579762123990804, "grad_norm": 8.492084503173828, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8461716771125793, "num_tokens": 750511786.0, "step": 19667 }, { "epoch": 2.501971759318153, "ewc_loss": 0.07193654775619507, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000360478792572394, "grad_norm": 8.55549430847168, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8711446523666382, "num_tokens": 750550596.0, "step": 19668 }, { "epoch": 2.5020989695967435, "ewc_loss": 0.07162860780954361, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035739937447942793, "grad_norm": 8.361557006835938, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8682867884635925, "num_tokens": 750594790.0, "step": 19669 }, { "epoch": 2.502226179875334, "ewc_loss": 0.07224856317043304, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003635989560279995, "grad_norm": 8.553976058959961, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8754901885986328, "num_tokens": 750636738.0, "step": 19670 }, { "epoch": 2.5023533901539246, "ewc_loss": 0.07162278890609741, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035734116681851447, "grad_norm": 8.427216529846191, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8769248127937317, "num_tokens": 750672338.0, "step": 19671 }, { "epoch": 2.502480600432515, "ewc_loss": 0.07224838435649872, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036359712248668075, "grad_norm": 8.638379096984863, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8580437898635864, "num_tokens": 750709149.0, "step": 19672 }, { "epoch": 2.5026078107111056, "ewc_loss": 0.07136128842830658, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.0003571675915736705, "grad_norm": 8.412970542907715, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8765360116958618, "num_tokens": 750741727.0, "step": 19673 }, { "epoch": 2.5027350209896957, "ewc_loss": 0.07218877971172333, "ewc_loss_diag": 3.552436828613281e-05, "ewc_loss_parallel": 0.00036544250906445086, "grad_norm": 8.627190589904785, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8530112504959106, "num_tokens": 750775903.0, "step": 19674 }, { "epoch": 2.5028622312682867, "ewc_loss": 0.07168559730052948, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003579692274797708, "grad_norm": 8.467158317565918, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8598829507827759, "num_tokens": 750810502.0, "step": 19675 }, { "epoch": 2.5029894415468767, "ewc_loss": 0.07240602374076843, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003651735605672002, "grad_norm": 8.615137100219727, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8725416660308838, "num_tokens": 750847762.0, "step": 19676 }, { "epoch": 2.5031166518254677, "ewc_loss": 0.07173517346382141, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003584649821277708, "grad_norm": 8.488725662231445, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8658499717712402, "num_tokens": 750887201.0, "step": 19677 }, { "epoch": 2.503243862104058, "ewc_loss": 0.07224653661251068, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003635786706581712, "grad_norm": 8.590394973754883, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8636433482170105, "num_tokens": 750927323.0, "step": 19678 }, { "epoch": 2.5033710723826488, "ewc_loss": 0.07167297601699829, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003578430041670799, "grad_norm": 8.406710624694824, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.85555100440979, "num_tokens": 750972501.0, "step": 19679 }, { "epoch": 2.503498282661239, "ewc_loss": 0.07221667468547821, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003632799780461937, "grad_norm": 8.569727897644043, "learning_rate": 1e-06, "loss": 0.4316, "mean_token_accuracy": 0.8741847276687622, "num_tokens": 751010743.0, "step": 19680 }, { "epoch": 2.5036254929398294, "ewc_loss": 0.07170476019382477, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035816090530715883, "grad_norm": 8.399234771728516, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.860123872756958, "num_tokens": 751050440.0, "step": 19681 }, { "epoch": 2.50375270321842, "ewc_loss": 0.07233646512031555, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036447792081162333, "grad_norm": 8.587453842163086, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8681960701942444, "num_tokens": 751088319.0, "step": 19682 }, { "epoch": 2.5038799134970104, "ewc_loss": 0.07175508141517639, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035866411053575575, "grad_norm": 8.430208206176758, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8637009859085083, "num_tokens": 751123766.0, "step": 19683 }, { "epoch": 2.504007123775601, "ewc_loss": 0.07221352308988571, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036324851680547, "grad_norm": 8.607502937316895, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8814129829406738, "num_tokens": 751163375.0, "step": 19684 }, { "epoch": 2.5041343340541915, "ewc_loss": 0.07170049846172333, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003581182681955397, "grad_norm": 8.413493156433105, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8599966764450073, "num_tokens": 751204526.0, "step": 19685 }, { "epoch": 2.504261544332782, "ewc_loss": 0.07241016626358032, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036521494621410966, "grad_norm": 8.62718677520752, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8695607781410217, "num_tokens": 751245133.0, "step": 19686 }, { "epoch": 2.5043887546113726, "ewc_loss": 0.07175997644662857, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000358713063178584, "grad_norm": 8.479029655456543, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8537529706954956, "num_tokens": 751284578.0, "step": 19687 }, { "epoch": 2.504515964889963, "ewc_loss": 0.07230319827795029, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036414526402950287, "grad_norm": 8.602526664733887, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8573178052902222, "num_tokens": 751325794.0, "step": 19688 }, { "epoch": 2.5046431751685536, "ewc_loss": 0.07177640497684479, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003588773834053427, "grad_norm": 8.442056655883789, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8705976009368896, "num_tokens": 751363777.0, "step": 19689 }, { "epoch": 2.504770385447144, "ewc_loss": 0.07226568460464478, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003637700865510851, "grad_norm": 8.545855522155762, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8560222387313843, "num_tokens": 751400499.0, "step": 19690 }, { "epoch": 2.5048975957257347, "ewc_loss": 0.07190226018428802, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003601358912419528, "grad_norm": 8.536117553710938, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.861431896686554, "num_tokens": 751434895.0, "step": 19691 }, { "epoch": 2.505024806004325, "ewc_loss": 0.07216900587081909, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003628033446148038, "grad_norm": 8.564847946166992, "learning_rate": 1e-06, "loss": 0.4254, "mean_token_accuracy": 0.8747310638427734, "num_tokens": 751473984.0, "step": 19692 }, { "epoch": 2.5051520162829157, "ewc_loss": 0.07187192142009735, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003598325129132718, "grad_norm": 8.486702919006348, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8646767139434814, "num_tokens": 751515030.0, "step": 19693 }, { "epoch": 2.5052792265615063, "ewc_loss": 0.07221244275569916, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036323766107670963, "grad_norm": 8.539913177490234, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8616517186164856, "num_tokens": 751550023.0, "step": 19694 }, { "epoch": 2.505406436840097, "ewc_loss": 0.07194057106971741, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036051898496225476, "grad_norm": 8.513145446777344, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8662573099136353, "num_tokens": 751584793.0, "step": 19695 }, { "epoch": 2.5055336471186873, "ewc_loss": 0.07211573421955109, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036227062810212374, "grad_norm": 8.563078880310059, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8454622626304626, "num_tokens": 751619817.0, "step": 19696 }, { "epoch": 2.505660857397278, "ewc_loss": 0.07195067405700684, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036062003346160054, "grad_norm": 8.4767427444458, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8710927367210388, "num_tokens": 751657097.0, "step": 19697 }, { "epoch": 2.5057880676758684, "ewc_loss": 0.07221128046512604, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003632261068560183, "grad_norm": 8.595510482788086, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8542699813842773, "num_tokens": 751687042.0, "step": 19698 }, { "epoch": 2.5059152779544585, "ewc_loss": 0.07185256481170654, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035963894333690405, "grad_norm": 8.43177604675293, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8737232685089111, "num_tokens": 751724281.0, "step": 19699 }, { "epoch": 2.5060424882330494, "ewc_loss": 0.07240414619445801, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003651547012850642, "grad_norm": 8.66828441619873, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8459993600845337, "num_tokens": 751765144.0, "step": 19700 }, { "epoch": 2.5061696985116395, "ewc_loss": 0.07167688012123108, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035788206150755286, "grad_norm": 8.40051555633545, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8725116848945618, "num_tokens": 751799602.0, "step": 19701 }, { "epoch": 2.5062969087902305, "ewc_loss": 0.07243192195892334, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003654324682429433, "grad_norm": 8.579269409179688, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8734723925590515, "num_tokens": 751836972.0, "step": 19702 }, { "epoch": 2.5064241190688206, "ewc_loss": 0.07159772515296936, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035709055373445153, "grad_norm": 8.463372230529785, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.866126298904419, "num_tokens": 751869627.0, "step": 19703 }, { "epoch": 2.5065513293474115, "ewc_loss": 0.07219778001308441, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036309112329036, "grad_norm": 8.545361518859863, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8584892749786377, "num_tokens": 751911103.0, "step": 19704 }, { "epoch": 2.5066785396260016, "ewc_loss": 0.0718030035495758, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035914333420805633, "grad_norm": 8.486157417297363, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8591009378433228, "num_tokens": 751948956.0, "step": 19705 }, { "epoch": 2.506805749904592, "ewc_loss": 0.07203790545463562, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003614923043642193, "grad_norm": 8.5701265335083, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8681600093841553, "num_tokens": 751992185.0, "step": 19706 }, { "epoch": 2.5069329601831827, "ewc_loss": 0.07171225547790527, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000358235789462924, "grad_norm": 8.420571327209473, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8605579733848572, "num_tokens": 752032661.0, "step": 19707 }, { "epoch": 2.507060170461773, "ewc_loss": 0.07220505177974701, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003631638246588409, "grad_norm": 8.622384071350098, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8697922825813293, "num_tokens": 752073946.0, "step": 19708 }, { "epoch": 2.5071873807403637, "ewc_loss": 0.07153262943029404, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035643958835862577, "grad_norm": 8.32557487487793, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8485938906669617, "num_tokens": 752118766.0, "step": 19709 }, { "epoch": 2.5073145910189543, "ewc_loss": 0.07233290374279022, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036444232682697475, "grad_norm": 8.54326343536377, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.86887127161026, "num_tokens": 752159669.0, "step": 19710 }, { "epoch": 2.507441801297545, "ewc_loss": 0.07159674912691116, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035708077484741807, "grad_norm": 8.432342529296875, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.868018627166748, "num_tokens": 752198139.0, "step": 19711 }, { "epoch": 2.5075690115761353, "ewc_loss": 0.07211755216121674, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003622888179961592, "grad_norm": 8.481380462646484, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.8671410083770752, "num_tokens": 752239686.0, "step": 19712 }, { "epoch": 2.507696221854726, "ewc_loss": 0.07188842445611954, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035999753163196146, "grad_norm": 8.461965560913086, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8469021916389465, "num_tokens": 752275740.0, "step": 19713 }, { "epoch": 2.5078234321333164, "ewc_loss": 0.0721045583486557, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036215889849700034, "grad_norm": 8.464386940002441, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8509207367897034, "num_tokens": 752310554.0, "step": 19714 }, { "epoch": 2.507950642411907, "ewc_loss": 0.07213246077299118, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003624378878157586, "grad_norm": 8.47108268737793, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8706028461456299, "num_tokens": 752347761.0, "step": 19715 }, { "epoch": 2.5080778526904974, "ewc_loss": 0.07203435897827148, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036145682679489255, "grad_norm": 8.457489967346191, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8532649874687195, "num_tokens": 752387128.0, "step": 19716 }, { "epoch": 2.508205062969088, "ewc_loss": 0.07218492776155472, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003629625425674021, "grad_norm": 8.457086563110352, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8625330924987793, "num_tokens": 752428722.0, "step": 19717 }, { "epoch": 2.5083322732476785, "ewc_loss": 0.07196663320064545, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003607796679716557, "grad_norm": 8.410204887390137, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8633089065551758, "num_tokens": 752468920.0, "step": 19718 }, { "epoch": 2.508459483526269, "ewc_loss": 0.07225914299488068, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036370469024404883, "grad_norm": 8.551312446594238, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8553138971328735, "num_tokens": 752505226.0, "step": 19719 }, { "epoch": 2.5085866938048595, "ewc_loss": 0.07187052071094513, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003598185139708221, "grad_norm": 8.40492057800293, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8663280606269836, "num_tokens": 752543693.0, "step": 19720 }, { "epoch": 2.50871390408345, "ewc_loss": 0.07242302596569061, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036534349783323705, "grad_norm": 8.579614639282227, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8486896753311157, "num_tokens": 752578758.0, "step": 19721 }, { "epoch": 2.5088411143620406, "ewc_loss": 0.07185864448547363, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003596997121348977, "grad_norm": 8.414056777954102, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.876162588596344, "num_tokens": 752612115.0, "step": 19722 }, { "epoch": 2.508968324640631, "ewc_loss": 0.07238832116127014, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003649964928627014, "grad_norm": 8.627106666564941, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8704444169998169, "num_tokens": 752644601.0, "step": 19723 }, { "epoch": 2.509095534919221, "ewc_loss": 0.0718715712428093, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003598289913497865, "grad_norm": 8.404569625854492, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8693702220916748, "num_tokens": 752682333.0, "step": 19724 }, { "epoch": 2.509222745197812, "ewc_loss": 0.07243287563323975, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036544204340316355, "grad_norm": 8.54518985748291, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.859457790851593, "num_tokens": 752721802.0, "step": 19725 }, { "epoch": 2.5093499554764023, "ewc_loss": 0.07184434682130814, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035955675411969423, "grad_norm": 8.42236042022705, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8792366981506348, "num_tokens": 752755673.0, "step": 19726 }, { "epoch": 2.5094771657549932, "ewc_loss": 0.07234534621238708, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003645667457021773, "grad_norm": 8.546477317810059, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8753109574317932, "num_tokens": 752795575.0, "step": 19727 }, { "epoch": 2.5096043760335833, "ewc_loss": 0.07195860147476196, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036069925408810377, "grad_norm": 8.464208602905273, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8809783458709717, "num_tokens": 752833172.0, "step": 19728 }, { "epoch": 2.5097315863121743, "ewc_loss": 0.07216450572013855, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036275837919674814, "grad_norm": 8.52956485748291, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8501694798469543, "num_tokens": 752872857.0, "step": 19729 }, { "epoch": 2.5098587965907644, "ewc_loss": 0.07202942669391632, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003614075540099293, "grad_norm": 8.512290954589844, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8734805583953857, "num_tokens": 752908553.0, "step": 19730 }, { "epoch": 2.509986006869355, "ewc_loss": 0.0721278190612793, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000362391525413841, "grad_norm": 8.534928321838379, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8702314496040344, "num_tokens": 752937762.0, "step": 19731 }, { "epoch": 2.5101132171479454, "ewc_loss": 0.0720655620098114, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003617688489612192, "grad_norm": 8.604604721069336, "learning_rate": 1e-06, "loss": 0.5008, "mean_token_accuracy": 0.8503909111022949, "num_tokens": 752977184.0, "step": 19732 }, { "epoch": 2.510240427426536, "ewc_loss": 0.07174843549728394, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003585976082831621, "grad_norm": 8.418249130249023, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8593397736549377, "num_tokens": 753021071.0, "step": 19733 }, { "epoch": 2.5103676377051265, "ewc_loss": 0.07234767824411392, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036459005787037313, "grad_norm": 8.61801815032959, "learning_rate": 1e-06, "loss": 0.5503, "mean_token_accuracy": 0.8375126719474792, "num_tokens": 753056597.0, "step": 19734 }, { "epoch": 2.510494847983717, "ewc_loss": 0.07167249917984009, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003578383184503764, "grad_norm": 8.41835880279541, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8671466708183289, "num_tokens": 753096178.0, "step": 19735 }, { "epoch": 2.5106220582623076, "ewc_loss": 0.07266117632389069, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365283660357818, "grad_norm": 8.655829429626465, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8674135804176331, "num_tokens": 753130388.0, "step": 19736 }, { "epoch": 2.510749268540898, "ewc_loss": 0.07163608074188232, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035747408401221037, "grad_norm": 8.394491195678711, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8730250597000122, "num_tokens": 753166585.0, "step": 19737 }, { "epoch": 2.5108764788194886, "ewc_loss": 0.07239660620689392, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003650793805718422, "grad_norm": 8.635651588439941, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8740777373313904, "num_tokens": 753203403.0, "step": 19738 }, { "epoch": 2.511003689098079, "ewc_loss": 0.07158291339874268, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003569423861335963, "grad_norm": 8.410991668701172, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8707353472709656, "num_tokens": 753241632.0, "step": 19739 }, { "epoch": 2.5111308993766697, "ewc_loss": 0.07241915166378021, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003653048479463905, "grad_norm": 8.638992309570312, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8786506652832031, "num_tokens": 753279292.0, "step": 19740 }, { "epoch": 2.51125810965526, "ewc_loss": 0.07158014178276062, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003569146792870015, "grad_norm": 8.406688690185547, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8540650010108948, "num_tokens": 753319121.0, "step": 19741 }, { "epoch": 2.5113853199338507, "ewc_loss": 0.07230421900749207, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003641554503701627, "grad_norm": 8.596700668334961, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8500640988349915, "num_tokens": 753353074.0, "step": 19742 }, { "epoch": 2.5115125302124413, "ewc_loss": 0.07170248031616211, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003581381170079112, "grad_norm": 8.445770263671875, "learning_rate": 1e-06, "loss": 0.4074, "mean_token_accuracy": 0.8823031187057495, "num_tokens": 753388994.0, "step": 19743 }, { "epoch": 2.511639740491032, "ewc_loss": 0.07221949100494385, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036330815055407584, "grad_norm": 8.5065336227417, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8710058927536011, "num_tokens": 753430271.0, "step": 19744 }, { "epoch": 2.5117669507696223, "ewc_loss": 0.07178018987178802, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035891516017727554, "grad_norm": 8.420276641845703, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8570722341537476, "num_tokens": 753473820.0, "step": 19745 }, { "epoch": 2.511894161048213, "ewc_loss": 0.07221884280443192, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003633017186075449, "grad_norm": 8.534902572631836, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8604741096496582, "num_tokens": 753516397.0, "step": 19746 }, { "epoch": 2.5120213713268034, "ewc_loss": 0.07188253104686737, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003599385963752866, "grad_norm": 8.456100463867188, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8755605220794678, "num_tokens": 753557589.0, "step": 19747 }, { "epoch": 2.512148581605394, "ewc_loss": 0.07219916582107544, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000363104889402166, "grad_norm": 8.510631561279297, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8736730813980103, "num_tokens": 753593764.0, "step": 19748 }, { "epoch": 2.512275791883984, "ewc_loss": 0.07193443179130554, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036045763408765197, "grad_norm": 8.486931800842285, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8515424728393555, "num_tokens": 753635967.0, "step": 19749 }, { "epoch": 2.512403002162575, "ewc_loss": 0.07193584740161896, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003604717494454235, "grad_norm": 8.532841682434082, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8720471858978271, "num_tokens": 753674595.0, "step": 19750 }, { "epoch": 2.512530212441165, "ewc_loss": 0.07206324487924576, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036174574051983654, "grad_norm": 8.52301025390625, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8489485383033752, "num_tokens": 753714118.0, "step": 19751 }, { "epoch": 2.512657422719756, "ewc_loss": 0.07202434539794922, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036135673872195184, "grad_norm": 8.499969482421875, "learning_rate": 1e-06, "loss": 0.5172, "mean_token_accuracy": 0.8504286408424377, "num_tokens": 753758139.0, "step": 19752 }, { "epoch": 2.512784632998346, "ewc_loss": 0.07199229300022125, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036103621823713183, "grad_norm": 8.49716567993164, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8539350032806396, "num_tokens": 753800373.0, "step": 19753 }, { "epoch": 2.5129118432769366, "ewc_loss": 0.07218091189861298, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003629224083852023, "grad_norm": 8.60850715637207, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8605116009712219, "num_tokens": 753832366.0, "step": 19754 }, { "epoch": 2.513039053555527, "ewc_loss": 0.07192297279834747, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003603429940994829, "grad_norm": 8.511731147766113, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8838339447975159, "num_tokens": 753866176.0, "step": 19755 }, { "epoch": 2.5131662638341177, "ewc_loss": 0.07174673676490784, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003610220446716994, "grad_norm": 8.491004943847656, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.864056408405304, "num_tokens": 753911470.0, "step": 19756 }, { "epoch": 2.513293474112708, "ewc_loss": 0.07201999425888062, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036131319939158857, "grad_norm": 8.491352081298828, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8626137375831604, "num_tokens": 753953027.0, "step": 19757 }, { "epoch": 2.5134206843912987, "ewc_loss": 0.07206755876541138, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036178884329274297, "grad_norm": 8.52664566040039, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8597373962402344, "num_tokens": 753987992.0, "step": 19758 }, { "epoch": 2.5135478946698893, "ewc_loss": 0.07198008894920349, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003609142149798572, "grad_norm": 8.507990837097168, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8692520260810852, "num_tokens": 754026671.0, "step": 19759 }, { "epoch": 2.51367510494848, "ewc_loss": 0.07197943329811096, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036090760841034353, "grad_norm": 8.499344825744629, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.864920973777771, "num_tokens": 754063526.0, "step": 19760 }, { "epoch": 2.5138023152270703, "ewc_loss": 0.07207217067480087, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036183500196784735, "grad_norm": 8.519577980041504, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8490015268325806, "num_tokens": 754103361.0, "step": 19761 }, { "epoch": 2.513929525505661, "ewc_loss": 0.07195612788200378, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036067457403987646, "grad_norm": 8.410837173461914, "learning_rate": 1e-06, "loss": 0.4541, "mean_token_accuracy": 0.8667309284210205, "num_tokens": 754141969.0, "step": 19762 }, { "epoch": 2.5140567357842514, "ewc_loss": 0.07225073128938675, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003636205801740289, "grad_norm": 8.560823440551758, "learning_rate": 1e-06, "loss": 0.415, "mean_token_accuracy": 0.8805958032608032, "num_tokens": 754182691.0, "step": 19763 }, { "epoch": 2.514183946062842, "ewc_loss": 0.07183300703763962, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003594433655962348, "grad_norm": 8.395637512207031, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8644130825996399, "num_tokens": 754219956.0, "step": 19764 }, { "epoch": 2.5143111563414324, "ewc_loss": 0.07236798107624054, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036479305708780885, "grad_norm": 8.546004295349121, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8593233823776245, "num_tokens": 754259467.0, "step": 19765 }, { "epoch": 2.514438366620023, "ewc_loss": 0.07198481261730194, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036096139228902757, "grad_norm": 8.459425926208496, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8680208921432495, "num_tokens": 754300344.0, "step": 19766 }, { "epoch": 2.5145655768986135, "ewc_loss": 0.07266200333833694, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036529189674183726, "grad_norm": 8.54834270477295, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8659477233886719, "num_tokens": 754342606.0, "step": 19767 }, { "epoch": 2.514692787177204, "ewc_loss": 0.07180829346179962, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003616375906858593, "grad_norm": 8.485872268676758, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8612710237503052, "num_tokens": 754379946.0, "step": 19768 }, { "epoch": 2.5148199974557945, "ewc_loss": 0.0722322016954422, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036343533429317176, "grad_norm": 8.504571914672852, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.8792829513549805, "num_tokens": 754414353.0, "step": 19769 }, { "epoch": 2.514947207734385, "ewc_loss": 0.07199731469154358, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003635278553701937, "grad_norm": 8.540897369384766, "learning_rate": 1e-06, "loss": 0.5142, "mean_token_accuracy": 0.8511557579040527, "num_tokens": 754455506.0, "step": 19770 }, { "epoch": 2.5150744180129756, "ewc_loss": 0.07191582769155502, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036271297722123563, "grad_norm": 8.554301261901855, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8484240770339966, "num_tokens": 754498316.0, "step": 19771 }, { "epoch": 2.5152016282915657, "ewc_loss": 0.07194726169109344, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036302735679782927, "grad_norm": 8.490148544311523, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8523573875427246, "num_tokens": 754534943.0, "step": 19772 }, { "epoch": 2.5153288385701567, "ewc_loss": 0.07193265855312347, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036288125556893647, "grad_norm": 8.52786636352539, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.861984372138977, "num_tokens": 754572775.0, "step": 19773 }, { "epoch": 2.5154560488487467, "ewc_loss": 0.07218795269727707, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036299281055107713, "grad_norm": 8.488249778747559, "learning_rate": 1e-06, "loss": 0.4199, "mean_token_accuracy": 0.8746316432952881, "num_tokens": 754610473.0, "step": 19774 }, { "epoch": 2.5155832591273377, "ewc_loss": 0.0720963180065155, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003620765055529773, "grad_norm": 8.528264999389648, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.865634560585022, "num_tokens": 754649527.0, "step": 19775 }, { "epoch": 2.515710469405928, "ewc_loss": 0.07220296561717987, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036314292810857296, "grad_norm": 8.474350929260254, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8666518926620483, "num_tokens": 754691513.0, "step": 19776 }, { "epoch": 2.5158376796845188, "ewc_loss": 0.07223193347454071, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003634326276369393, "grad_norm": 8.518878936767578, "learning_rate": 1e-06, "loss": 0.5106, "mean_token_accuracy": 0.8517710566520691, "num_tokens": 754727932.0, "step": 19777 }, { "epoch": 2.515964889963109, "ewc_loss": 0.07207858562469482, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036189917591400445, "grad_norm": 8.443673133850098, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8704423308372498, "num_tokens": 754764571.0, "step": 19778 }, { "epoch": 2.5160921002416994, "ewc_loss": 0.07227648794651031, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003638781199697405, "grad_norm": 8.492471694946289, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8788594007492065, "num_tokens": 754805846.0, "step": 19779 }, { "epoch": 2.51621931052029, "ewc_loss": 0.0721990168094635, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036310343421064317, "grad_norm": 8.496615409851074, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8738592863082886, "num_tokens": 754844016.0, "step": 19780 }, { "epoch": 2.5163465207988804, "ewc_loss": 0.07216769456863403, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003627902187872678, "grad_norm": 8.45200252532959, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8642624616622925, "num_tokens": 754886493.0, "step": 19781 }, { "epoch": 2.516473731077471, "ewc_loss": 0.07238219678401947, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036493520019575953, "grad_norm": 8.465520858764648, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8672407269477844, "num_tokens": 754928195.0, "step": 19782 }, { "epoch": 2.5166009413560615, "ewc_loss": 0.07233233749866486, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036443662247620523, "grad_norm": 8.564900398254395, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8744304180145264, "num_tokens": 754958399.0, "step": 19783 }, { "epoch": 2.516728151634652, "ewc_loss": 0.07224829494953156, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036359619116410613, "grad_norm": 8.500632286071777, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8694261312484741, "num_tokens": 754997926.0, "step": 19784 }, { "epoch": 2.5168553619132426, "ewc_loss": 0.0724276602268219, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036538983113132417, "grad_norm": 8.50058364868164, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8720055818557739, "num_tokens": 755040577.0, "step": 19785 }, { "epoch": 2.516982572191833, "ewc_loss": 0.07217251509428024, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003628384438343346, "grad_norm": 8.56733512878418, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8619412779808044, "num_tokens": 755070458.0, "step": 19786 }, { "epoch": 2.5171097824704236, "ewc_loss": 0.07215416431427002, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003626549441833049, "grad_norm": 8.446614265441895, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8446024656295776, "num_tokens": 755117205.0, "step": 19787 }, { "epoch": 2.517236992749014, "ewc_loss": 0.07244148850440979, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036552813253365457, "grad_norm": 8.51743221282959, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8683454990386963, "num_tokens": 755162098.0, "step": 19788 }, { "epoch": 2.5173642030276047, "ewc_loss": 0.07209210842847824, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000362034363206476, "grad_norm": 8.475369453430176, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.862369179725647, "num_tokens": 755199655.0, "step": 19789 }, { "epoch": 2.517491413306195, "ewc_loss": 0.07231636345386505, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003642769588623196, "grad_norm": 8.554091453552246, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.878261148929596, "num_tokens": 755234165.0, "step": 19790 }, { "epoch": 2.5176186235847857, "ewc_loss": 0.0721435621380806, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000362548918928951, "grad_norm": 8.544588088989258, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8659083843231201, "num_tokens": 755271933.0, "step": 19791 }, { "epoch": 2.5177458338633762, "ewc_loss": 0.07225161790847778, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036362948594614863, "grad_norm": 8.529229164123535, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8741467595100403, "num_tokens": 755306903.0, "step": 19792 }, { "epoch": 2.5178730441419668, "ewc_loss": 0.07215052843093872, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000362618564395234, "grad_norm": 8.45116901397705, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8732956647872925, "num_tokens": 755347613.0, "step": 19793 }, { "epoch": 2.5180002544205573, "ewc_loss": 0.07240048795938492, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036511814687401056, "grad_norm": 8.559460639953613, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8848756551742554, "num_tokens": 755380541.0, "step": 19794 }, { "epoch": 2.518127464699148, "ewc_loss": 0.07215836644172668, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036269697011448443, "grad_norm": 8.421151161193848, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8639068603515625, "num_tokens": 755422502.0, "step": 19795 }, { "epoch": 2.5182546749777384, "ewc_loss": 0.07283994555473328, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036707130493596196, "grad_norm": 8.610784530639648, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8657963275909424, "num_tokens": 755463975.0, "step": 19796 }, { "epoch": 2.5183818852563284, "ewc_loss": 0.07203274965286255, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036144076148048043, "grad_norm": 8.436821937561035, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8658135533332825, "num_tokens": 755504873.0, "step": 19797 }, { "epoch": 2.5185090955349194, "ewc_loss": 0.07241041213274002, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000365217390935868, "grad_norm": 8.590505599975586, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8402929306030273, "num_tokens": 755545646.0, "step": 19798 }, { "epoch": 2.5186363058135095, "ewc_loss": 0.07171118259429932, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003606664831750095, "grad_norm": 8.500754356384277, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8523366451263428, "num_tokens": 755577228.0, "step": 19799 }, { "epoch": 2.5187635160921005, "ewc_loss": 0.0719609409570694, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003631641448009759, "grad_norm": 8.477315902709961, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8744482398033142, "num_tokens": 755621437.0, "step": 19800 }, { "epoch": 2.5188907263706906, "ewc_loss": 0.07187353819608688, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036229006946086884, "grad_norm": 8.507390975952148, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8645736575126648, "num_tokens": 755658266.0, "step": 19801 }, { "epoch": 2.5190179366492815, "ewc_loss": 0.07192829996347427, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036283768713474274, "grad_norm": 8.486759185791016, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8531690835952759, "num_tokens": 755696755.0, "step": 19802 }, { "epoch": 2.5191451469278716, "ewc_loss": 0.07190629839897156, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036261766217648983, "grad_norm": 8.45458698272705, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8793171644210815, "num_tokens": 755732434.0, "step": 19803 }, { "epoch": 2.519272357206462, "ewc_loss": 0.07245548069477081, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036322668893262744, "grad_norm": 8.556196212768555, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8695183396339417, "num_tokens": 755768488.0, "step": 19804 }, { "epoch": 2.5193995674850527, "ewc_loss": 0.07212117314338684, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003623249940574169, "grad_norm": 8.495492935180664, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8705755472183228, "num_tokens": 755799637.0, "step": 19805 }, { "epoch": 2.519526777763643, "ewc_loss": 0.07225102186203003, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003636234614532441, "grad_norm": 8.516738891601562, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8503139019012451, "num_tokens": 755834229.0, "step": 19806 }, { "epoch": 2.5196539880422337, "ewc_loss": 0.07206279039382935, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003617411421146244, "grad_norm": 8.465065956115723, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8691661953926086, "num_tokens": 755871776.0, "step": 19807 }, { "epoch": 2.5197811983208243, "ewc_loss": 0.07239282131195068, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000365041516488418, "grad_norm": 8.528372764587402, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8758397102355957, "num_tokens": 755911125.0, "step": 19808 }, { "epoch": 2.519908408599415, "ewc_loss": 0.07206301391124725, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036174344131723046, "grad_norm": 8.421415328979492, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.873481810092926, "num_tokens": 755952204.0, "step": 19809 }, { "epoch": 2.5200356188780053, "ewc_loss": 0.0724439024925232, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003655523178167641, "grad_norm": 8.56318473815918, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8555715084075928, "num_tokens": 755996702.0, "step": 19810 }, { "epoch": 2.520162829156596, "ewc_loss": 0.07199729979038239, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003610862768255174, "grad_norm": 8.475069046020508, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8829867839813232, "num_tokens": 756033285.0, "step": 19811 }, { "epoch": 2.5202900394351864, "ewc_loss": 0.07241131365299225, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003652264131233096, "grad_norm": 8.570252418518066, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.865047812461853, "num_tokens": 756068851.0, "step": 19812 }, { "epoch": 2.520417249713777, "ewc_loss": 0.07207789272069931, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003618922200985253, "grad_norm": 8.445286750793457, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8580436706542969, "num_tokens": 756111676.0, "step": 19813 }, { "epoch": 2.5205444599923674, "ewc_loss": 0.07242114841938019, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003653247549664229, "grad_norm": 8.620078086853027, "learning_rate": 1e-06, "loss": 0.5279, "mean_token_accuracy": 0.8484334349632263, "num_tokens": 756146575.0, "step": 19814 }, { "epoch": 2.520671670270958, "ewc_loss": 0.07197773456573486, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003608906117733568, "grad_norm": 8.451791763305664, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.854575514793396, "num_tokens": 756190006.0, "step": 19815 }, { "epoch": 2.5207988805495485, "ewc_loss": 0.07271268218755722, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003657987108454108, "grad_norm": 8.576735496520996, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8767245411872864, "num_tokens": 756232828.0, "step": 19816 }, { "epoch": 2.520926090828139, "ewc_loss": 0.07196421921253204, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003607554826885462, "grad_norm": 8.484907150268555, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8714480400085449, "num_tokens": 756273077.0, "step": 19817 }, { "epoch": 2.5210533011067295, "ewc_loss": 0.07240334153175354, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003651467268355191, "grad_norm": 8.545808792114258, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8494514226913452, "num_tokens": 756313421.0, "step": 19818 }, { "epoch": 2.52118051138532, "ewc_loss": 0.07202248275279999, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036133814137429, "grad_norm": 8.440728187561035, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8608278632164001, "num_tokens": 756356461.0, "step": 19819 }, { "epoch": 2.5213077216639106, "ewc_loss": 0.07247845828533173, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003658978675957769, "grad_norm": 8.66129207611084, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8613200187683105, "num_tokens": 756388309.0, "step": 19820 }, { "epoch": 2.521434931942501, "ewc_loss": 0.07206203043460846, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003617336042225361, "grad_norm": 8.46619701385498, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.876814603805542, "num_tokens": 756426655.0, "step": 19821 }, { "epoch": 2.521562142221091, "ewc_loss": 0.0724579319357872, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036569259827956557, "grad_norm": 8.580229759216309, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8611969947814941, "num_tokens": 756464961.0, "step": 19822 }, { "epoch": 2.521689352499682, "ewc_loss": 0.07202206552028656, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003613338922150433, "grad_norm": 8.523591041564941, "learning_rate": 1e-06, "loss": 0.4081, "mean_token_accuracy": 0.8790785074234009, "num_tokens": 756497338.0, "step": 19823 }, { "epoch": 2.5218165627782723, "ewc_loss": 0.07243633270263672, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036547656054608524, "grad_norm": 8.738898277282715, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8718875646591187, "num_tokens": 756535324.0, "step": 19824 }, { "epoch": 2.5219437730568632, "ewc_loss": 0.07174286991357803, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035854196175932884, "grad_norm": 8.510743141174316, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8752155900001526, "num_tokens": 756574138.0, "step": 19825 }, { "epoch": 2.5220709833354533, "ewc_loss": 0.0722535252571106, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036364851985126734, "grad_norm": 8.537137985229492, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8757888078689575, "num_tokens": 756608967.0, "step": 19826 }, { "epoch": 2.522198193614044, "ewc_loss": 0.07176108658313751, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035872418084181845, "grad_norm": 8.387335777282715, "learning_rate": 1e-06, "loss": 0.5381, "mean_token_accuracy": 0.8445837497711182, "num_tokens": 756653882.0, "step": 19827 }, { "epoch": 2.5223254038926344, "ewc_loss": 0.07235683500766754, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036468167672865093, "grad_norm": 8.521451950073242, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8593823909759521, "num_tokens": 756695638.0, "step": 19828 }, { "epoch": 2.522452614171225, "ewc_loss": 0.0720008835196495, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003611221327446401, "grad_norm": 8.459076881408691, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8569366931915283, "num_tokens": 756734964.0, "step": 19829 }, { "epoch": 2.5225798244498154, "ewc_loss": 0.07240448892116547, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036515819374471903, "grad_norm": 8.570405960083008, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.869218111038208, "num_tokens": 756768454.0, "step": 19830 }, { "epoch": 2.522707034728406, "ewc_loss": 0.07194056361913681, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036051892675459385, "grad_norm": 8.4478759765625, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8539643287658691, "num_tokens": 756801696.0, "step": 19831 }, { "epoch": 2.5228342450069965, "ewc_loss": 0.07243886590003967, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036550190998241305, "grad_norm": 8.573397636413574, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8671121597290039, "num_tokens": 756839874.0, "step": 19832 }, { "epoch": 2.522961455285587, "ewc_loss": 0.07227620482444763, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036143389297649264, "grad_norm": 8.41069507598877, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.855201244354248, "num_tokens": 756884179.0, "step": 19833 }, { "epoch": 2.5230886655641775, "ewc_loss": 0.07262110710144043, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036732430453412235, "grad_norm": 8.840673446655273, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8666777610778809, "num_tokens": 756922725.0, "step": 19834 }, { "epoch": 2.523215875842768, "ewc_loss": 0.0718245580792427, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035691747325472534, "grad_norm": 8.635660171508789, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8622947931289673, "num_tokens": 756956080.0, "step": 19835 }, { "epoch": 2.5233430861213586, "ewc_loss": 0.07243002951145172, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003629721177276224, "grad_norm": 8.46675968170166, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8559557795524597, "num_tokens": 756995137.0, "step": 19836 }, { "epoch": 2.523470296399949, "ewc_loss": 0.07218752056360245, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003605470701586455, "grad_norm": 8.484819412231445, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8693541288375854, "num_tokens": 757035300.0, "step": 19837 }, { "epoch": 2.5235975066785397, "ewc_loss": 0.07198762148618698, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003609894774854183, "grad_norm": 8.623151779174805, "learning_rate": 1e-06, "loss": 0.5441, "mean_token_accuracy": 0.8397094011306763, "num_tokens": 757068789.0, "step": 19838 }, { "epoch": 2.52372471695713, "ewc_loss": 0.07182545959949493, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035936792846769094, "grad_norm": 8.42357063293457, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8583556413650513, "num_tokens": 757102913.0, "step": 19839 }, { "epoch": 2.5238519272357207, "ewc_loss": 0.0724220722913742, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003653339808806777, "grad_norm": 8.800469398498535, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8674283027648926, "num_tokens": 757138932.0, "step": 19840 }, { "epoch": 2.5239791375143112, "ewc_loss": 0.07153435051441193, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035645681782625616, "grad_norm": 8.286211967468262, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8677812814712524, "num_tokens": 757179181.0, "step": 19841 }, { "epoch": 2.5241063477929018, "ewc_loss": 0.07335416972637177, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037221357342787087, "grad_norm": 9.186634063720703, "learning_rate": 1e-06, "loss": 0.5604, "mean_token_accuracy": 0.834221363067627, "num_tokens": 757224209.0, "step": 19842 }, { "epoch": 2.5242335580714923, "ewc_loss": 0.07151910662651062, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035386293893679976, "grad_norm": 8.276958465576172, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8785789012908936, "num_tokens": 757262398.0, "step": 19843 }, { "epoch": 2.524360768350083, "ewc_loss": 0.0738161951303482, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00037927518133074045, "grad_norm": 9.227455139160156, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8554736375808716, "num_tokens": 757301744.0, "step": 19844 }, { "epoch": 2.5244879786286734, "ewc_loss": 0.07129606604576111, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003540739417076111, "grad_norm": 8.304137229919434, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8686031103134155, "num_tokens": 757338273.0, "step": 19845 }, { "epoch": 2.524615188907264, "ewc_loss": 0.07388949394226074, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00038000819040462375, "grad_norm": 8.973800659179688, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8571902513504028, "num_tokens": 757383169.0, "step": 19846 }, { "epoch": 2.524742399185854, "ewc_loss": 0.07181994616985321, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035931277670897543, "grad_norm": 8.667221069335938, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8540199995040894, "num_tokens": 757420998.0, "step": 19847 }, { "epoch": 2.524869609464445, "ewc_loss": 0.07279016077518463, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036901485873386264, "grad_norm": 8.656805992126465, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8687801361083984, "num_tokens": 757463281.0, "step": 19848 }, { "epoch": 2.524996819743035, "ewc_loss": 0.07241401821374893, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003628120757639408, "grad_norm": 8.733972549438477, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8592886924743652, "num_tokens": 757495022.0, "step": 19849 }, { "epoch": 2.525124030021626, "ewc_loss": 0.07202936708927155, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003614069428294897, "grad_norm": 8.486312866210938, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8700146675109863, "num_tokens": 757536801.0, "step": 19850 }, { "epoch": 2.525251240300216, "ewc_loss": 0.07243318855762482, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036544512840919197, "grad_norm": 8.57036018371582, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.8803579807281494, "num_tokens": 757573419.0, "step": 19851 }, { "epoch": 2.5253784505788066, "ewc_loss": 0.07198034226894379, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003609167179092765, "grad_norm": 8.581853866577148, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8575801849365234, "num_tokens": 757610773.0, "step": 19852 }, { "epoch": 2.525505660857397, "ewc_loss": 0.07219840586185455, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036309732240624726, "grad_norm": 8.585001945495605, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8737567663192749, "num_tokens": 757648067.0, "step": 19853 }, { "epoch": 2.5256328711359877, "ewc_loss": 0.07201729714870453, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003612862783484161, "grad_norm": 8.534616470336914, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8708088397979736, "num_tokens": 757681598.0, "step": 19854 }, { "epoch": 2.525760081414578, "ewc_loss": 0.07214395701885223, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036255287704989314, "grad_norm": 8.574138641357422, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.8457201719284058, "num_tokens": 757724752.0, "step": 19855 }, { "epoch": 2.5258872916931687, "ewc_loss": 0.07204359024763107, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036154917324893177, "grad_norm": 8.566062927246094, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8753305673599243, "num_tokens": 757763801.0, "step": 19856 }, { "epoch": 2.5260145019717593, "ewc_loss": 0.07206152379512787, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003617284819483757, "grad_norm": 8.591397285461426, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8620032668113708, "num_tokens": 757799235.0, "step": 19857 }, { "epoch": 2.52614171225035, "ewc_loss": 0.0719902366399765, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036101561272516847, "grad_norm": 8.473258018493652, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.868938684463501, "num_tokens": 757839695.0, "step": 19858 }, { "epoch": 2.5262689225289403, "ewc_loss": 0.07229684293270111, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003640817594714463, "grad_norm": 8.564494132995605, "learning_rate": 1e-06, "loss": 0.5169, "mean_token_accuracy": 0.852820634841919, "num_tokens": 757877152.0, "step": 19859 }, { "epoch": 2.526396132807531, "ewc_loss": 0.07193179428577423, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036043120780959725, "grad_norm": 8.478201866149902, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.863635778427124, "num_tokens": 757916173.0, "step": 19860 }, { "epoch": 2.5265233430861214, "ewc_loss": 0.07234101742506027, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036452343920245767, "grad_norm": 8.63156795501709, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8704407215118408, "num_tokens": 757957271.0, "step": 19861 }, { "epoch": 2.526650553364712, "ewc_loss": 0.07204285264015198, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003591003769543022, "grad_norm": 8.461095809936523, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.858324408531189, "num_tokens": 757995553.0, "step": 19862 }, { "epoch": 2.5267777636433024, "ewc_loss": 0.07266609370708466, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365332787623629, "grad_norm": 8.614195823669434, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8538210391998291, "num_tokens": 758035095.0, "step": 19863 }, { "epoch": 2.526904973921893, "ewc_loss": 0.07185027003288269, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00035961600951850414, "grad_norm": 8.493782043457031, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8631361722946167, "num_tokens": 758071185.0, "step": 19864 }, { "epoch": 2.5270321842004835, "ewc_loss": 0.07231183350086212, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036423158599063754, "grad_norm": 8.574617385864258, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8666384220123291, "num_tokens": 758110251.0, "step": 19865 }, { "epoch": 2.527159394479074, "ewc_loss": 0.07189486175775528, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003600618802011013, "grad_norm": 8.489006042480469, "learning_rate": 1e-06, "loss": 0.4932, "mean_token_accuracy": 0.8513282537460327, "num_tokens": 758144385.0, "step": 19866 }, { "epoch": 2.5272866047576645, "ewc_loss": 0.0722971260547638, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003640845825430006, "grad_norm": 8.537837028503418, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8766661882400513, "num_tokens": 758185935.0, "step": 19867 }, { "epoch": 2.527413815036255, "ewc_loss": 0.07210490107536316, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036216233274899423, "grad_norm": 8.465633392333984, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.868392825126648, "num_tokens": 758226688.0, "step": 19868 }, { "epoch": 2.5275410253148456, "ewc_loss": 0.07234537601470947, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003645670076366514, "grad_norm": 8.544353485107422, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8706218004226685, "num_tokens": 758263866.0, "step": 19869 }, { "epoch": 2.5276682355934357, "ewc_loss": 0.07215414196252823, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036265471135266125, "grad_norm": 8.471904754638672, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8713976144790649, "num_tokens": 758303607.0, "step": 19870 }, { "epoch": 2.5277954458720266, "ewc_loss": 0.07271522283554077, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036582414759323, "grad_norm": 8.4941987991333, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8675392866134644, "num_tokens": 758340120.0, "step": 19871 }, { "epoch": 2.5279226561506167, "ewc_loss": 0.07233884185552597, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003645016986411065, "grad_norm": 8.588314056396484, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8631688952445984, "num_tokens": 758373792.0, "step": 19872 }, { "epoch": 2.5280498664292077, "ewc_loss": 0.07230344414710999, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003641476796474308, "grad_norm": 8.515579223632812, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8527286052703857, "num_tokens": 758418157.0, "step": 19873 }, { "epoch": 2.528177076707798, "ewc_loss": 0.07253460586071014, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003664593677967787, "grad_norm": 8.547246932983398, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8567491173744202, "num_tokens": 758454042.0, "step": 19874 }, { "epoch": 2.5283042869863888, "ewc_loss": 0.07227060198783875, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000363819272024557, "grad_norm": 8.538804054260254, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.867374062538147, "num_tokens": 758495305.0, "step": 19875 }, { "epoch": 2.528431497264979, "ewc_loss": 0.0724027156829834, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036514041130430996, "grad_norm": 8.532374382019043, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8711150288581848, "num_tokens": 758532646.0, "step": 19876 }, { "epoch": 2.5285587075435694, "ewc_loss": 0.07240766286849976, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003651898878160864, "grad_norm": 8.670512199401855, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8590495586395264, "num_tokens": 758568329.0, "step": 19877 }, { "epoch": 2.52868591782216, "ewc_loss": 0.07212202996015549, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036233357968740165, "grad_norm": 8.56800365447998, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8688109517097473, "num_tokens": 758608246.0, "step": 19878 }, { "epoch": 2.5288131281007504, "ewc_loss": 0.072173111140728, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036284441011957824, "grad_norm": 8.526029586791992, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8741309642791748, "num_tokens": 758645356.0, "step": 19879 }, { "epoch": 2.528940338379341, "ewc_loss": 0.07202401757240295, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036135344998911023, "grad_norm": 8.445744514465332, "learning_rate": 1e-06, "loss": 0.4127, "mean_token_accuracy": 0.8805189728736877, "num_tokens": 758685657.0, "step": 19880 }, { "epoch": 2.5290675486579315, "ewc_loss": 0.07237207889556885, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036483403528109193, "grad_norm": 8.588998794555664, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8693118095397949, "num_tokens": 758718843.0, "step": 19881 }, { "epoch": 2.529194758936522, "ewc_loss": 0.07215417176485062, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003626550023909658, "grad_norm": 8.533764839172363, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8693835735321045, "num_tokens": 758757946.0, "step": 19882 }, { "epoch": 2.5293219692151125, "ewc_loss": 0.07258323580026627, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003645042306743562, "grad_norm": 8.574579238891602, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8444523811340332, "num_tokens": 758792832.0, "step": 19883 }, { "epoch": 2.529449179493703, "ewc_loss": 0.07207818329334259, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036189513048157096, "grad_norm": 8.515021324157715, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8555805683135986, "num_tokens": 758826062.0, "step": 19884 }, { "epoch": 2.5295763897722936, "ewc_loss": 0.07230514287948608, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003641646762844175, "grad_norm": 8.544204711914062, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8743872046470642, "num_tokens": 758860940.0, "step": 19885 }, { "epoch": 2.529703600050884, "ewc_loss": 0.07226470112800598, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003637603367678821, "grad_norm": 8.52652359008789, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.863770604133606, "num_tokens": 758900740.0, "step": 19886 }, { "epoch": 2.5298308103294747, "ewc_loss": 0.07249392569065094, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003636110923253, "grad_norm": 8.5632963180542, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8612579703330994, "num_tokens": 758942786.0, "step": 19887 }, { "epoch": 2.529958020608065, "ewc_loss": 0.07217517495155334, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036286498652771115, "grad_norm": 8.513650894165039, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8740079998970032, "num_tokens": 758983376.0, "step": 19888 }, { "epoch": 2.5300852308866557, "ewc_loss": 0.0723189115524292, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036430242471396923, "grad_norm": 8.582756042480469, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8536295294761658, "num_tokens": 759024007.0, "step": 19889 }, { "epoch": 2.5302124411652462, "ewc_loss": 0.0723206102848053, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003618780174292624, "grad_norm": 8.528909683227539, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8729745149612427, "num_tokens": 759061611.0, "step": 19890 }, { "epoch": 2.5303396514438368, "ewc_loss": 0.07266278564929962, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003652997547760606, "grad_norm": 8.66771411895752, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8476660847663879, "num_tokens": 759101942.0, "step": 19891 }, { "epoch": 2.5304668617224273, "ewc_loss": 0.07195604592561722, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036067375913262367, "grad_norm": 8.514155387878418, "learning_rate": 1e-06, "loss": 0.4106, "mean_token_accuracy": 0.8811565637588501, "num_tokens": 759140281.0, "step": 19892 }, { "epoch": 2.530594072001018, "ewc_loss": 0.07238613069057465, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036497454857453704, "grad_norm": 8.597736358642578, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8647318482398987, "num_tokens": 759174319.0, "step": 19893 }, { "epoch": 2.5307212822796084, "ewc_loss": 0.07188327610492706, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003599459887482226, "grad_norm": 8.562499046325684, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.87166428565979, "num_tokens": 759208981.0, "step": 19894 }, { "epoch": 2.5308484925581984, "ewc_loss": 0.07221047580242157, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036321807419881225, "grad_norm": 8.662172317504883, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.866438627243042, "num_tokens": 759240559.0, "step": 19895 }, { "epoch": 2.5309757028367894, "ewc_loss": 0.07190828025341034, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036019604885950685, "grad_norm": 8.501815795898438, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.876511812210083, "num_tokens": 759280550.0, "step": 19896 }, { "epoch": 2.5311029131153795, "ewc_loss": 0.07211336493492126, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036224693758413196, "grad_norm": 8.680517196655273, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8591256737709045, "num_tokens": 759321298.0, "step": 19897 }, { "epoch": 2.5312301233939705, "ewc_loss": 0.07205231487751007, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003591950226109475, "grad_norm": 8.418450355529785, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.8767771124839783, "num_tokens": 759354627.0, "step": 19898 }, { "epoch": 2.5313573336725606, "ewc_loss": 0.07265628129243851, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003652346786111593, "grad_norm": 8.677200317382812, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.859795868396759, "num_tokens": 759388129.0, "step": 19899 }, { "epoch": 2.5314845439511515, "ewc_loss": 0.07198257744312286, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003584976075217128, "grad_norm": 8.530938148498535, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.874923825263977, "num_tokens": 759423667.0, "step": 19900 }, { "epoch": 2.5316117542297416, "ewc_loss": 0.07247419655323029, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036341382656246424, "grad_norm": 8.506999015808105, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8561638593673706, "num_tokens": 759466586.0, "step": 19901 }, { "epoch": 2.531738964508332, "ewc_loss": 0.07228745520114899, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036154643748886883, "grad_norm": 8.506518363952637, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8599357604980469, "num_tokens": 759504214.0, "step": 19902 }, { "epoch": 2.5318661747869227, "ewc_loss": 0.07227323949337006, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036140429438091815, "grad_norm": 8.521989822387695, "learning_rate": 1e-06, "loss": 0.5222, "mean_token_accuracy": 0.8499272465705872, "num_tokens": 759543865.0, "step": 19903 }, { "epoch": 2.531993385065513, "ewc_loss": 0.0723070278763771, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036174216074869037, "grad_norm": 8.456489562988281, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8601325154304504, "num_tokens": 759581917.0, "step": 19904 }, { "epoch": 2.5321205953441037, "ewc_loss": 0.07255398482084274, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003642117080744356, "grad_norm": 8.580904006958008, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8673925399780273, "num_tokens": 759617641.0, "step": 19905 }, { "epoch": 2.5322478056226942, "ewc_loss": 0.0721115693449974, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003597875765990466, "grad_norm": 8.49819278717041, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.8759534358978271, "num_tokens": 759649097.0, "step": 19906 }, { "epoch": 2.5323750159012848, "ewc_loss": 0.07261493802070618, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003648212878033519, "grad_norm": 8.536286354064941, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8595393896102905, "num_tokens": 759688802.0, "step": 19907 }, { "epoch": 2.5325022261798753, "ewc_loss": 0.0723409429192543, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003620813076850027, "grad_norm": 8.513274192810059, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8737860918045044, "num_tokens": 759724243.0, "step": 19908 }, { "epoch": 2.532629436458466, "ewc_loss": 0.07244337350130081, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036310561699792743, "grad_norm": 8.583949089050293, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8582486510276794, "num_tokens": 759764646.0, "step": 19909 }, { "epoch": 2.5327566467370564, "ewc_loss": 0.0720629096031189, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003617424226831645, "grad_norm": 8.501360893249512, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8686505556106567, "num_tokens": 759806939.0, "step": 19910 }, { "epoch": 2.532883857015647, "ewc_loss": 0.0721750557422638, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003628638223744929, "grad_norm": 8.473332405090332, "learning_rate": 1e-06, "loss": 0.4586, "mean_token_accuracy": 0.868639349937439, "num_tokens": 759848138.0, "step": 19911 }, { "epoch": 2.5330110672942374, "ewc_loss": 0.072547547519207, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036414735950529575, "grad_norm": 8.581210136413574, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8621813058853149, "num_tokens": 759881877.0, "step": 19912 }, { "epoch": 2.533138277572828, "ewc_loss": 0.07219228148460388, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036059465492144227, "grad_norm": 8.47857666015625, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8760586977005005, "num_tokens": 759922625.0, "step": 19913 }, { "epoch": 2.5332654878514185, "ewc_loss": 0.07260890305042267, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036476089735515416, "grad_norm": 8.609623908996582, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8772847652435303, "num_tokens": 759952429.0, "step": 19914 }, { "epoch": 2.533392698130009, "ewc_loss": 0.07189231365919113, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036003641434945166, "grad_norm": 8.529241561889648, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8674821257591248, "num_tokens": 759987678.0, "step": 19915 }, { "epoch": 2.5335199084085995, "ewc_loss": 0.07236143946647644, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000362286256859079, "grad_norm": 8.544818878173828, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8511662483215332, "num_tokens": 760023943.0, "step": 19916 }, { "epoch": 2.53364711868719, "ewc_loss": 0.07230404019355774, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036171230021864176, "grad_norm": 8.567154884338379, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8688254356384277, "num_tokens": 760068229.0, "step": 19917 }, { "epoch": 2.5337743289657806, "ewc_loss": 0.07213708758354187, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036004272988066077, "grad_norm": 8.542482376098633, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8528828620910645, "num_tokens": 760109440.0, "step": 19918 }, { "epoch": 2.533901539244371, "ewc_loss": 0.07226137816905975, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036128569627180696, "grad_norm": 8.539080619812012, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8648965954780579, "num_tokens": 760146616.0, "step": 19919 }, { "epoch": 2.534028749522961, "ewc_loss": 0.07219958305358887, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036066773463971913, "grad_norm": 8.498245239257812, "learning_rate": 1e-06, "loss": 0.5192, "mean_token_accuracy": 0.8459032773971558, "num_tokens": 760188708.0, "step": 19920 }, { "epoch": 2.534155959801552, "ewc_loss": 0.0724032074213028, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036270395503379405, "grad_norm": 8.490577697753906, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.880764365196228, "num_tokens": 760231355.0, "step": 19921 }, { "epoch": 2.5342831700801423, "ewc_loss": 0.0721927136182785, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003605989913921803, "grad_norm": 8.51261043548584, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8518351316452026, "num_tokens": 760268316.0, "step": 19922 }, { "epoch": 2.5344103803587332, "ewc_loss": 0.07232530415058136, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003619249037001282, "grad_norm": 8.549714088439941, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8560605645179749, "num_tokens": 760305420.0, "step": 19923 }, { "epoch": 2.5345375906373233, "ewc_loss": 0.07224389165639877, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000361110782250762, "grad_norm": 8.464969635009766, "learning_rate": 1e-06, "loss": 0.4028, "mean_token_accuracy": 0.8841835260391235, "num_tokens": 760342195.0, "step": 19924 }, { "epoch": 2.534664800915914, "ewc_loss": 0.0724216178059578, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036288806586526334, "grad_norm": 8.515019416809082, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.879439115524292, "num_tokens": 760375394.0, "step": 19925 }, { "epoch": 2.5347920111945044, "ewc_loss": 0.07223785668611526, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003610504500102252, "grad_norm": 8.527984619140625, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8538969159126282, "num_tokens": 760411629.0, "step": 19926 }, { "epoch": 2.534919221473095, "ewc_loss": 0.07236665487289429, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003623384691309184, "grad_norm": 8.496024131774902, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8673384189605713, "num_tokens": 760456212.0, "step": 19927 }, { "epoch": 2.5350464317516854, "ewc_loss": 0.07233430445194244, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003620148927439004, "grad_norm": 8.553110122680664, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8570859432220459, "num_tokens": 760496385.0, "step": 19928 }, { "epoch": 2.535173642030276, "ewc_loss": 0.07219772040843964, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003606491081882268, "grad_norm": 8.543991088867188, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.86407071352005, "num_tokens": 760530426.0, "step": 19929 }, { "epoch": 2.5353008523088665, "ewc_loss": 0.07214654237031937, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003625787212513387, "grad_norm": 8.579791069030762, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8758604526519775, "num_tokens": 760568174.0, "step": 19930 }, { "epoch": 2.535428062587457, "ewc_loss": 0.07205513119697571, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003592231951188296, "grad_norm": 8.46438217163086, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8835810422897339, "num_tokens": 760608097.0, "step": 19931 }, { "epoch": 2.5355552728660475, "ewc_loss": 0.07228738069534302, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.000363987113814801, "grad_norm": 8.62345027923584, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8655306696891785, "num_tokens": 760643018.0, "step": 19932 }, { "epoch": 2.535682483144638, "ewc_loss": 0.07200595736503601, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003587314859032631, "grad_norm": 8.453814506530762, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8719381093978882, "num_tokens": 760676553.0, "step": 19933 }, { "epoch": 2.5358096934232286, "ewc_loss": 0.07243677973747253, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000363039638614282, "grad_norm": 8.546186447143555, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8805731534957886, "num_tokens": 760708850.0, "step": 19934 }, { "epoch": 2.535936903701819, "ewc_loss": 0.07194923609495163, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003606056561693549, "grad_norm": 8.461833000183105, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8606202602386475, "num_tokens": 760746088.0, "step": 19935 }, { "epoch": 2.5360641139804097, "ewc_loss": 0.07235641777515411, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036223605275154114, "grad_norm": 8.514164924621582, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8712354302406311, "num_tokens": 760785716.0, "step": 19936 }, { "epoch": 2.536191324259, "ewc_loss": 0.07228334993124008, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003615053719840944, "grad_norm": 8.442530632019043, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8739067912101746, "num_tokens": 760821301.0, "step": 19937 }, { "epoch": 2.5363185345375907, "ewc_loss": 0.07245191931724548, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036319109494797885, "grad_norm": 8.534074783325195, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8561327457427979, "num_tokens": 760861200.0, "step": 19938 }, { "epoch": 2.5364457448161812, "ewc_loss": 0.07230126112699509, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003616844769567251, "grad_norm": 8.492514610290527, "learning_rate": 1e-06, "loss": 0.4712, "mean_token_accuracy": 0.8624275922775269, "num_tokens": 760896398.0, "step": 19939 }, { "epoch": 2.5365729550947718, "ewc_loss": 0.07260650396347046, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003647369157988578, "grad_norm": 8.476470947265625, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8446581363677979, "num_tokens": 760937300.0, "step": 19940 }, { "epoch": 2.5367001653733623, "ewc_loss": 0.07250475883483887, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003637194458860904, "grad_norm": 8.53416633605957, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8588730096817017, "num_tokens": 760973726.0, "step": 19941 }, { "epoch": 2.536827375651953, "ewc_loss": 0.07248865813016891, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003635584725998342, "grad_norm": 8.532230377197266, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8629735708236694, "num_tokens": 761009048.0, "step": 19942 }, { "epoch": 2.5369545859305433, "ewc_loss": 0.0726255252957344, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036492710933089256, "grad_norm": 8.527870178222656, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8630247712135315, "num_tokens": 761050836.0, "step": 19943 }, { "epoch": 2.537081796209134, "ewc_loss": 0.07239052653312683, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003625771205406636, "grad_norm": 8.494091987609863, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8561790585517883, "num_tokens": 761087517.0, "step": 19944 }, { "epoch": 2.537209006487724, "ewc_loss": 0.07255092263221741, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003641810908447951, "grad_norm": 8.549093246459961, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.8786149024963379, "num_tokens": 761121401.0, "step": 19945 }, { "epoch": 2.537336216766315, "ewc_loss": 0.07238931953907013, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036256510065868497, "grad_norm": 8.533004760742188, "learning_rate": 1e-06, "loss": 0.4959, "mean_token_accuracy": 0.8542845249176025, "num_tokens": 761154973.0, "step": 19946 }, { "epoch": 2.537463427044905, "ewc_loss": 0.07247781753540039, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003634500317275524, "grad_norm": 8.506085395812988, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8732426166534424, "num_tokens": 761194211.0, "step": 19947 }, { "epoch": 2.537590637323496, "ewc_loss": 0.07231305539608002, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036424380959942937, "grad_norm": 8.52994155883789, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8575881719589233, "num_tokens": 761233486.0, "step": 19948 }, { "epoch": 2.537717847602086, "ewc_loss": 0.07240794599056244, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003627513360697776, "grad_norm": 8.442456245422363, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8652024269104004, "num_tokens": 761275846.0, "step": 19949 }, { "epoch": 2.5378450578806766, "ewc_loss": 0.07244223356246948, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036553561221808195, "grad_norm": 8.564712524414062, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8545598983764648, "num_tokens": 761311210.0, "step": 19950 }, { "epoch": 2.537972268159267, "ewc_loss": 0.07231390476226807, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036181093310005963, "grad_norm": 8.527495384216309, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8653950095176697, "num_tokens": 761349961.0, "step": 19951 }, { "epoch": 2.5380994784378577, "ewc_loss": 0.07253804802894592, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000364052364602685, "grad_norm": 8.563180923461914, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8722262382507324, "num_tokens": 761386993.0, "step": 19952 }, { "epoch": 2.538226688716448, "ewc_loss": 0.0722787082195282, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036145892227068543, "grad_norm": 8.467625617980957, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.8557721376419067, "num_tokens": 761426910.0, "step": 19953 }, { "epoch": 2.5383538989950387, "ewc_loss": 0.07263736426830292, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036504556192085147, "grad_norm": 8.552702903747559, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8685469031333923, "num_tokens": 761466633.0, "step": 19954 }, { "epoch": 2.5384811092736292, "ewc_loss": 0.07225732505321503, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003612451546359807, "grad_norm": 8.585135459899902, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8511024713516235, "num_tokens": 761497670.0, "step": 19955 }, { "epoch": 2.5386083195522198, "ewc_loss": 0.07241913676261902, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003628632694017142, "grad_norm": 8.49973201751709, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8615970015525818, "num_tokens": 761534626.0, "step": 19956 }, { "epoch": 2.5387355298308103, "ewc_loss": 0.07244130969047546, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036308501148596406, "grad_norm": 8.501970291137695, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8718979954719543, "num_tokens": 761581125.0, "step": 19957 }, { "epoch": 2.538862740109401, "ewc_loss": 0.07228505611419678, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036152239772491157, "grad_norm": 8.545252799987793, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8659334778785706, "num_tokens": 761616533.0, "step": 19958 }, { "epoch": 2.5389899503879914, "ewc_loss": 0.07243002206087112, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036297208862379193, "grad_norm": 8.476390838623047, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8554582595825195, "num_tokens": 761650615.0, "step": 19959 }, { "epoch": 2.539117160666582, "ewc_loss": 0.07241223752498627, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003627942060120404, "grad_norm": 8.552756309509277, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8758317232131958, "num_tokens": 761687020.0, "step": 19960 }, { "epoch": 2.5392443709451724, "ewc_loss": 0.0723160058259964, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036183191696181893, "grad_norm": 8.458316802978516, "learning_rate": 1e-06, "loss": 0.4196, "mean_token_accuracy": 0.8800655603408813, "num_tokens": 761721653.0, "step": 19961 }, { "epoch": 2.539371581223763, "ewc_loss": 0.07253873348236084, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036405917489901185, "grad_norm": 8.572322845458984, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8723407983779907, "num_tokens": 761758377.0, "step": 19962 }, { "epoch": 2.5394987915023535, "ewc_loss": 0.07225008308887482, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036117274430580437, "grad_norm": 8.519747734069824, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8634690046310425, "num_tokens": 761795263.0, "step": 19963 }, { "epoch": 2.539626001780944, "ewc_loss": 0.07248270511627197, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003634989261627197, "grad_norm": 8.518011093139648, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8751825094223022, "num_tokens": 761834758.0, "step": 19964 }, { "epoch": 2.5397532120595345, "ewc_loss": 0.07230614125728607, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003617333131842315, "grad_norm": 8.473496437072754, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8576527833938599, "num_tokens": 761880482.0, "step": 19965 }, { "epoch": 2.539880422338125, "ewc_loss": 0.07256166636943817, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003642885712906718, "grad_norm": 8.566339492797852, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8644335865974426, "num_tokens": 761922494.0, "step": 19966 }, { "epoch": 2.5400076326167156, "ewc_loss": 0.07229232788085938, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003615951573010534, "grad_norm": 8.58009147644043, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8834945559501648, "num_tokens": 761959462.0, "step": 19967 }, { "epoch": 2.5401348428953057, "ewc_loss": 0.07226292043924332, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003613010630942881, "grad_norm": 8.526530265808105, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.855334997177124, "num_tokens": 761990316.0, "step": 19968 }, { "epoch": 2.5402620531738966, "ewc_loss": 0.07242226600646973, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036289452691562474, "grad_norm": 8.638529777526855, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.86014324426651, "num_tokens": 762023671.0, "step": 19969 }, { "epoch": 2.5403892634524867, "ewc_loss": 0.07219678163528442, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036063973675481975, "grad_norm": 8.520379066467285, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.870334267616272, "num_tokens": 762056719.0, "step": 19970 }, { "epoch": 2.5405164737310777, "ewc_loss": 0.07261380553245544, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036480987910181284, "grad_norm": 8.534368515014648, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8664527535438538, "num_tokens": 762098802.0, "step": 19971 }, { "epoch": 2.540643684009668, "ewc_loss": 0.0721665620803833, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036033749347552657, "grad_norm": 8.514054298400879, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8677279353141785, "num_tokens": 762134470.0, "step": 19972 }, { "epoch": 2.5407708942882588, "ewc_loss": 0.07245755940675735, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036324746906757355, "grad_norm": 8.549878120422363, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8669752478599548, "num_tokens": 762177163.0, "step": 19973 }, { "epoch": 2.540898104566849, "ewc_loss": 0.07218849658966064, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003605568199418485, "grad_norm": 8.443090438842773, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8732879161834717, "num_tokens": 762217323.0, "step": 19974 }, { "epoch": 2.5410253148454394, "ewc_loss": 0.07248630374670029, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003635349276009947, "grad_norm": 8.643253326416016, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8604389429092407, "num_tokens": 762252144.0, "step": 19975 }, { "epoch": 2.54115252512403, "ewc_loss": 0.07197288423776627, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003584007208701223, "grad_norm": 8.481104850769043, "learning_rate": 1e-06, "loss": 0.4899, "mean_token_accuracy": 0.8567782640457153, "num_tokens": 762282108.0, "step": 19976 }, { "epoch": 2.5412797354026204, "ewc_loss": 0.07258397340774536, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003645115939434618, "grad_norm": 8.637055397033691, "learning_rate": 1e-06, "loss": 0.4536, "mean_token_accuracy": 0.8692676424980164, "num_tokens": 762318154.0, "step": 19977 }, { "epoch": 2.541406945681211, "ewc_loss": 0.07201874256134033, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003588593390304595, "grad_norm": 8.674115180969238, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8632602691650391, "num_tokens": 762354704.0, "step": 19978 }, { "epoch": 2.5415341559598015, "ewc_loss": 0.07217332720756531, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000360405130777508, "grad_norm": 8.539999961853027, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8667854070663452, "num_tokens": 762389943.0, "step": 19979 }, { "epoch": 2.541661366238392, "ewc_loss": 0.07222343981266022, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000360906298737973, "grad_norm": 8.555869102478027, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8672741055488586, "num_tokens": 762421560.0, "step": 19980 }, { "epoch": 2.5417885765169825, "ewc_loss": 0.07205483317375183, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003592201683204621, "grad_norm": 8.450008392333984, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8699011206626892, "num_tokens": 762458509.0, "step": 19981 }, { "epoch": 2.541915786795573, "ewc_loss": 0.07250373065471649, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003637091431301087, "grad_norm": 8.563570976257324, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8639664649963379, "num_tokens": 762502382.0, "step": 19982 }, { "epoch": 2.5420429970741636, "ewc_loss": 0.07201912999153137, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035886320983991027, "grad_norm": 8.439913749694824, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8624454736709595, "num_tokens": 762536611.0, "step": 19983 }, { "epoch": 2.542170207352754, "ewc_loss": 0.07246831059455872, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003633549786172807, "grad_norm": 8.5289306640625, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8660982847213745, "num_tokens": 762577801.0, "step": 19984 }, { "epoch": 2.5422974176313446, "ewc_loss": 0.07219696044921875, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036064142477698624, "grad_norm": 8.426826477050781, "learning_rate": 1e-06, "loss": 0.4608, "mean_token_accuracy": 0.8666978478431702, "num_tokens": 762617413.0, "step": 19985 }, { "epoch": 2.542424627909935, "ewc_loss": 0.07259456813335419, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036461756099015474, "grad_norm": 8.51590633392334, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8744498491287231, "num_tokens": 762654811.0, "step": 19986 }, { "epoch": 2.5425518381885257, "ewc_loss": 0.07230159640312195, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036168788210488856, "grad_norm": 8.47274112701416, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8700933456420898, "num_tokens": 762692902.0, "step": 19987 }, { "epoch": 2.5426790484671162, "ewc_loss": 0.07247427105903625, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003634145832620561, "grad_norm": 8.508851051330566, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.865885853767395, "num_tokens": 762731984.0, "step": 19988 }, { "epoch": 2.5428062587457068, "ewc_loss": 0.07236817479133606, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003623536613304168, "grad_norm": 8.510847091674805, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8744055032730103, "num_tokens": 762767104.0, "step": 19989 }, { "epoch": 2.5429334690242973, "ewc_loss": 0.07240558415651321, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003627277328632772, "grad_norm": 8.515486717224121, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8544771671295166, "num_tokens": 762804982.0, "step": 19990 }, { "epoch": 2.543060679302888, "ewc_loss": 0.07243220508098602, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000362993887392804, "grad_norm": 8.577157974243164, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8562896251678467, "num_tokens": 762842956.0, "step": 19991 }, { "epoch": 2.5431878895814783, "ewc_loss": 0.07232122123241425, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003618840710259974, "grad_norm": 8.534801483154297, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8718445301055908, "num_tokens": 762875445.0, "step": 19992 }, { "epoch": 2.5433150998600684, "ewc_loss": 0.07253779470920563, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003640497743617743, "grad_norm": 8.560396194458008, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8790208697319031, "num_tokens": 762906061.0, "step": 19993 }, { "epoch": 2.5434423101386594, "ewc_loss": 0.07228387892246246, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003615106106735766, "grad_norm": 8.545790672302246, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8739577531814575, "num_tokens": 762945550.0, "step": 19994 }, { "epoch": 2.5435695204172495, "ewc_loss": 0.07244192063808441, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003630910941865295, "grad_norm": 8.592073440551758, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.865875244140625, "num_tokens": 762982586.0, "step": 19995 }, { "epoch": 2.5436967306958405, "ewc_loss": 0.0721556693315506, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000360228557838127, "grad_norm": 8.485626220703125, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8651601076126099, "num_tokens": 763013527.0, "step": 19996 }, { "epoch": 2.5438239409744305, "ewc_loss": 0.07247800379991531, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003634519234765321, "grad_norm": 8.57730770111084, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8587378263473511, "num_tokens": 763054363.0, "step": 19997 }, { "epoch": 2.5439511512530215, "ewc_loss": 0.07220180332660675, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036068991175852716, "grad_norm": 8.512520790100098, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8612544536590576, "num_tokens": 763096834.0, "step": 19998 }, { "epoch": 2.5440783615316116, "ewc_loss": 0.07243071496486664, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036297901533544064, "grad_norm": 8.520049095153809, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8646066188812256, "num_tokens": 763139611.0, "step": 19999 }, { "epoch": 2.544205571810202, "ewc_loss": 0.07225680351257324, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036123994505032897, "grad_norm": 8.498517990112305, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8657606840133667, "num_tokens": 763181085.0, "step": 20000 }, { "epoch": 2.5443327820887927, "ewc_loss": 0.07242129743099213, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036288483534008265, "grad_norm": 8.539630889892578, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.863322913646698, "num_tokens": 763219811.0, "step": 20001 }, { "epoch": 2.544459992367383, "ewc_loss": 0.07228847593069077, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003615566238295287, "grad_norm": 8.546976089477539, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8554637432098389, "num_tokens": 763257745.0, "step": 20002 }, { "epoch": 2.5445872026459737, "ewc_loss": 0.0723319798707962, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036199166788719594, "grad_norm": 8.52639389038086, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8505390882492065, "num_tokens": 763299475.0, "step": 20003 }, { "epoch": 2.5447144129245642, "ewc_loss": 0.07226500660181046, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003613219305407256, "grad_norm": 8.567364692687988, "learning_rate": 1e-06, "loss": 0.5299, "mean_token_accuracy": 0.8433077335357666, "num_tokens": 763333960.0, "step": 20004 }, { "epoch": 2.5448416232031548, "ewc_loss": 0.07220323383808136, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036070423084311187, "grad_norm": 8.530654907226562, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8580643534660339, "num_tokens": 763378646.0, "step": 20005 }, { "epoch": 2.5449688334817453, "ewc_loss": 0.07234390079975128, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036211084807291627, "grad_norm": 8.55756950378418, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8694621324539185, "num_tokens": 763412441.0, "step": 20006 }, { "epoch": 2.545096043760336, "ewc_loss": 0.07217967510223389, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036046866443939507, "grad_norm": 8.556144714355469, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8683739900588989, "num_tokens": 763446713.0, "step": 20007 }, { "epoch": 2.5452232540389264, "ewc_loss": 0.07226353138685226, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000361307174898684, "grad_norm": 8.487753868103027, "learning_rate": 1e-06, "loss": 0.4285, "mean_token_accuracy": 0.8752012252807617, "num_tokens": 763478686.0, "step": 20008 }, { "epoch": 2.545350464317517, "ewc_loss": 0.0722922757267952, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003615946334321052, "grad_norm": 8.480521202087402, "learning_rate": 1e-06, "loss": 0.421, "mean_token_accuracy": 0.880933403968811, "num_tokens": 763517258.0, "step": 20009 }, { "epoch": 2.5454776745961074, "ewc_loss": 0.07230613380670547, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003617331967689097, "grad_norm": 8.477734565734863, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8604217767715454, "num_tokens": 763553229.0, "step": 20010 }, { "epoch": 2.545604884874698, "ewc_loss": 0.07245908677577972, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003632627194747329, "grad_norm": 8.504531860351562, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8741074800491333, "num_tokens": 763594274.0, "step": 20011 }, { "epoch": 2.5457320951532885, "ewc_loss": 0.07228348404169083, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003615067107602954, "grad_norm": 8.481338500976562, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.857647716999054, "num_tokens": 763632162.0, "step": 20012 }, { "epoch": 2.545859305431879, "ewc_loss": 0.07250943779945374, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036376621574163437, "grad_norm": 8.58501148223877, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8673901557922363, "num_tokens": 763666797.0, "step": 20013 }, { "epoch": 2.5459865157104695, "ewc_loss": 0.07229897379875183, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036166160134598613, "grad_norm": 8.5068359375, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8625616431236267, "num_tokens": 763705475.0, "step": 20014 }, { "epoch": 2.54611372598906, "ewc_loss": 0.07248179614543915, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036348981666378677, "grad_norm": 8.56716251373291, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8809990882873535, "num_tokens": 763739434.0, "step": 20015 }, { "epoch": 2.5462409362676506, "ewc_loss": 0.07251369953155518, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000363808823749423, "grad_norm": 8.61795711517334, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8518739938735962, "num_tokens": 763773262.0, "step": 20016 }, { "epoch": 2.546368146546241, "ewc_loss": 0.0723000168800354, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036167202051728964, "grad_norm": 8.509312629699707, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8549535274505615, "num_tokens": 763812573.0, "step": 20017 }, { "epoch": 2.546495356824831, "ewc_loss": 0.072610042989254, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003647723060566932, "grad_norm": 8.743182182312012, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8675214052200317, "num_tokens": 763848555.0, "step": 20018 }, { "epoch": 2.546622567103422, "ewc_loss": 0.07199734449386597, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035864528035745025, "grad_norm": 8.434893608093262, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8630534410476685, "num_tokens": 763884608.0, "step": 20019 }, { "epoch": 2.5467497773820122, "ewc_loss": 0.07276388257741928, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003663107054308057, "grad_norm": 8.615917205810547, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8603212833404541, "num_tokens": 763922038.0, "step": 20020 }, { "epoch": 2.546876987660603, "ewc_loss": 0.07211199402809143, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035979185486212373, "grad_norm": 8.46154499053955, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8619175553321838, "num_tokens": 763958089.0, "step": 20021 }, { "epoch": 2.5470041979391933, "ewc_loss": 0.07270384579896927, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003657103516161442, "grad_norm": 8.593017578125, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8660144805908203, "num_tokens": 764000802.0, "step": 20022 }, { "epoch": 2.547131408217784, "ewc_loss": 0.07221651077270508, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036083697341382504, "grad_norm": 8.451094627380371, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8695905208587646, "num_tokens": 764036628.0, "step": 20023 }, { "epoch": 2.5472586184963744, "ewc_loss": 0.07252790778875351, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003639509668573737, "grad_norm": 8.560314178466797, "learning_rate": 1e-06, "loss": 0.4438, "mean_token_accuracy": 0.8680555820465088, "num_tokens": 764073643.0, "step": 20024 }, { "epoch": 2.547385828774965, "ewc_loss": 0.07228173315525055, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036148919025436044, "grad_norm": 8.440308570861816, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.872036337852478, "num_tokens": 764110473.0, "step": 20025 }, { "epoch": 2.5475130390535554, "ewc_loss": 0.07262112200260162, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036488313344307244, "grad_norm": 8.57201862335205, "learning_rate": 1e-06, "loss": 0.4709, "mean_token_accuracy": 0.8624559044837952, "num_tokens": 764142941.0, "step": 20026 }, { "epoch": 2.547640249332146, "ewc_loss": 0.07228092849254608, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003614811284933239, "grad_norm": 8.470088005065918, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8676754236221313, "num_tokens": 764178752.0, "step": 20027 }, { "epoch": 2.5477674596107365, "ewc_loss": 0.07259061932563782, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003645780961960554, "grad_norm": 8.586674690246582, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8627287149429321, "num_tokens": 764217440.0, "step": 20028 }, { "epoch": 2.547894669889327, "ewc_loss": 0.072211354970932, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003607854596339166, "grad_norm": 8.4567232131958, "learning_rate": 1e-06, "loss": 0.4901, "mean_token_accuracy": 0.8545224070549011, "num_tokens": 764257524.0, "step": 20029 }, { "epoch": 2.5480218801679175, "ewc_loss": 0.07266794145107269, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365351268555969, "grad_norm": 8.583754539489746, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8591268062591553, "num_tokens": 764293645.0, "step": 20030 }, { "epoch": 2.548149090446508, "ewc_loss": 0.07216645032167435, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003603363875299692, "grad_norm": 8.42834758758545, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8725200891494751, "num_tokens": 764331660.0, "step": 20031 }, { "epoch": 2.5482763007250986, "ewc_loss": 0.07275548577308655, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036622676998376846, "grad_norm": 8.549756050109863, "learning_rate": 1e-06, "loss": 0.4055, "mean_token_accuracy": 0.8835036754608154, "num_tokens": 764376577.0, "step": 20032 }, { "epoch": 2.548403511003689, "ewc_loss": 0.07223457098007202, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036101762088947, "grad_norm": 8.440165519714355, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8632773160934448, "num_tokens": 764415492.0, "step": 20033 }, { "epoch": 2.5485307212822796, "ewc_loss": 0.07274630665779114, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036613488919101655, "grad_norm": 8.592296600341797, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.855050265789032, "num_tokens": 764453948.0, "step": 20034 }, { "epoch": 2.54865793156087, "ewc_loss": 0.07230201363563538, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036169204395264387, "grad_norm": 8.502099990844727, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8705792427062988, "num_tokens": 764493906.0, "step": 20035 }, { "epoch": 2.5487851418394607, "ewc_loss": 0.07270536571741104, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003657255438156426, "grad_norm": 8.574939727783203, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.85948646068573, "num_tokens": 764532379.0, "step": 20036 }, { "epoch": 2.5489123521180512, "ewc_loss": 0.0722937136888504, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036160903982818127, "grad_norm": 8.493143081665039, "learning_rate": 1e-06, "loss": 0.5121, "mean_token_accuracy": 0.8462892770767212, "num_tokens": 764567726.0, "step": 20037 }, { "epoch": 2.5490395623966418, "ewc_loss": 0.0726148784160614, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036482064751908183, "grad_norm": 8.556346893310547, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8649201989173889, "num_tokens": 764606229.0, "step": 20038 }, { "epoch": 2.5491667726752323, "ewc_loss": 0.0723927766084671, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003625995886977762, "grad_norm": 8.422273635864258, "learning_rate": 1e-06, "loss": 0.4195, "mean_token_accuracy": 0.8772808313369751, "num_tokens": 764641346.0, "step": 20039 }, { "epoch": 2.549293982953823, "ewc_loss": 0.07268504798412323, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003655223990790546, "grad_norm": 8.521015167236328, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8655000925064087, "num_tokens": 764687286.0, "step": 20040 }, { "epoch": 2.5494211932324133, "ewc_loss": 0.0725526511669159, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036419834941625595, "grad_norm": 8.487232208251953, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8685915470123291, "num_tokens": 764729235.0, "step": 20041 }, { "epoch": 2.549548403511004, "ewc_loss": 0.07263438403606415, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003650157595984638, "grad_norm": 8.603161811828613, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8590512871742249, "num_tokens": 764771834.0, "step": 20042 }, { "epoch": 2.549675613789594, "ewc_loss": 0.07243777811527252, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036304962122812867, "grad_norm": 8.51481819152832, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8654348850250244, "num_tokens": 764808307.0, "step": 20043 }, { "epoch": 2.549802824068185, "ewc_loss": 0.0724821537733078, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003634933673311025, "grad_norm": 8.479620933532715, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8591243028640747, "num_tokens": 764848373.0, "step": 20044 }, { "epoch": 2.549930034346775, "ewc_loss": 0.07238899171352386, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003650032449513674, "grad_norm": 8.575121879577637, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.861060619354248, "num_tokens": 764886468.0, "step": 20045 }, { "epoch": 2.550057244625366, "ewc_loss": 0.07247095555067062, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003633814339991659, "grad_norm": 8.510411262512207, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8685084581375122, "num_tokens": 764928690.0, "step": 20046 }, { "epoch": 2.550184454903956, "ewc_loss": 0.07236956059932709, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036480891867540777, "grad_norm": 8.565801620483398, "learning_rate": 1e-06, "loss": 0.5784, "mean_token_accuracy": 0.8306486010551453, "num_tokens": 764969148.0, "step": 20047 }, { "epoch": 2.5503116651825466, "ewc_loss": 0.07249423861503601, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036361426464281976, "grad_norm": 8.52059268951416, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8600465655326843, "num_tokens": 765010754.0, "step": 20048 }, { "epoch": 2.550438875461137, "ewc_loss": 0.07238015532493591, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.00036491479841060936, "grad_norm": 8.528634071350098, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8688991665840149, "num_tokens": 765043562.0, "step": 20049 }, { "epoch": 2.5505660857397277, "ewc_loss": 0.07224807888269424, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003635940665844828, "grad_norm": 8.561264038085938, "learning_rate": 1e-06, "loss": 0.5274, "mean_token_accuracy": 0.8452637195587158, "num_tokens": 765085638.0, "step": 20050 }, { "epoch": 2.550693296018318, "ewc_loss": 0.07228934019804001, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003640066715888679, "grad_norm": 8.578845977783203, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.870989203453064, "num_tokens": 765115847.0, "step": 20051 }, { "epoch": 2.5508205062969087, "ewc_loss": 0.07211367785930634, "ewc_loss_diag": 3.600120544433594e-05, "ewc_loss_parallel": 0.0003622500516939908, "grad_norm": 8.578929901123047, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8641940951347351, "num_tokens": 765146839.0, "step": 20052 }, { "epoch": 2.5509477165754992, "ewc_loss": 0.07248519361019135, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003635238390415907, "grad_norm": 8.49976634979248, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8647747039794922, "num_tokens": 765185921.0, "step": 20053 }, { "epoch": 2.5510749268540898, "ewc_loss": 0.0725555270910263, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036422713310457766, "grad_norm": 8.594106674194336, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8645341992378235, "num_tokens": 765224534.0, "step": 20054 }, { "epoch": 2.5512021371326803, "ewc_loss": 0.0724143385887146, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003628152480814606, "grad_norm": 8.510245323181152, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8752935528755188, "num_tokens": 765267160.0, "step": 20055 }, { "epoch": 2.551329347411271, "ewc_loss": 0.07261619716882706, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036483383155427873, "grad_norm": 8.583447456359863, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8659366965293884, "num_tokens": 765304770.0, "step": 20056 }, { "epoch": 2.5514565576898613, "ewc_loss": 0.07225648313760757, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036123671452514827, "grad_norm": 8.514982223510742, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8616557121276855, "num_tokens": 765341966.0, "step": 20057 }, { "epoch": 2.551583767968452, "ewc_loss": 0.07257719337940216, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003644438402261585, "grad_norm": 8.532864570617676, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8808597326278687, "num_tokens": 765382653.0, "step": 20058 }, { "epoch": 2.5517109782470424, "ewc_loss": 0.07244105637073517, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036308245034888387, "grad_norm": 8.580965995788574, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8635154962539673, "num_tokens": 765414940.0, "step": 20059 }, { "epoch": 2.551838188525633, "ewc_loss": 0.07230091094970703, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003616809844970703, "grad_norm": 8.542497634887695, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8663958311080933, "num_tokens": 765445549.0, "step": 20060 }, { "epoch": 2.5519653988042235, "ewc_loss": 0.07237610220909119, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036243288195692003, "grad_norm": 8.54067325592041, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8721010088920593, "num_tokens": 765484474.0, "step": 20061 }, { "epoch": 2.552092609082814, "ewc_loss": 0.0723639577627182, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003623114316724241, "grad_norm": 8.52299690246582, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8653694987297058, "num_tokens": 765522651.0, "step": 20062 }, { "epoch": 2.5522198193614045, "ewc_loss": 0.07243892550468445, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003630610881373286, "grad_norm": 8.545055389404297, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.861076831817627, "num_tokens": 765565983.0, "step": 20063 }, { "epoch": 2.552347029639995, "ewc_loss": 0.07228018343448639, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036147370701655746, "grad_norm": 8.539612770080566, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8668667078018188, "num_tokens": 765604056.0, "step": 20064 }, { "epoch": 2.5524742399185856, "ewc_loss": 0.07257931679487228, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000364465027814731, "grad_norm": 8.69271183013916, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8742181658744812, "num_tokens": 765646062.0, "step": 20065 }, { "epoch": 2.5526014501971757, "ewc_loss": 0.07202304899692535, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035890238359570503, "grad_norm": 8.419920921325684, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.863454282283783, "num_tokens": 765687284.0, "step": 20066 }, { "epoch": 2.5527286604757666, "ewc_loss": 0.07270893454551697, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365761254215613, "grad_norm": 8.630663871765137, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8534625768661499, "num_tokens": 765723743.0, "step": 20067 }, { "epoch": 2.5528558707543567, "ewc_loss": 0.07203978300094604, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003590697597246617, "grad_norm": 8.432737350463867, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8621589541435242, "num_tokens": 765768472.0, "step": 20068 }, { "epoch": 2.5529830810329477, "ewc_loss": 0.07283688336610794, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036704071681015193, "grad_norm": 8.660754203796387, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8817684054374695, "num_tokens": 765809040.0, "step": 20069 }, { "epoch": 2.5531102913115378, "ewc_loss": 0.07210846245288849, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003597565519157797, "grad_norm": 8.525535583496094, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8659703731536865, "num_tokens": 765845968.0, "step": 20070 }, { "epoch": 2.5532375015901287, "ewc_loss": 0.07266795635223389, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003653513849712908, "grad_norm": 8.608010292053223, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8709192872047424, "num_tokens": 765881037.0, "step": 20071 }, { "epoch": 2.553364711868719, "ewc_loss": 0.07230915874242783, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003617634647525847, "grad_norm": 8.49796199798584, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8693503737449646, "num_tokens": 765918945.0, "step": 20072 }, { "epoch": 2.5534919221473094, "ewc_loss": 0.07267895340919495, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003654613683465868, "grad_norm": 8.648609161376953, "learning_rate": 1e-06, "loss": 0.4142, "mean_token_accuracy": 0.877549409866333, "num_tokens": 765956128.0, "step": 20073 }, { "epoch": 2.5536191324259, "ewc_loss": 0.07218330353498459, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003605048987083137, "grad_norm": 8.492334365844727, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8786311745643616, "num_tokens": 765997109.0, "step": 20074 }, { "epoch": 2.5537463427044904, "ewc_loss": 0.07288047671318054, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036503523006103933, "grad_norm": 8.589581489562988, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.861669659614563, "num_tokens": 766032557.0, "step": 20075 }, { "epoch": 2.553873552983081, "ewc_loss": 0.07227110862731934, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003613829321693629, "grad_norm": 8.53984546661377, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8624825477600098, "num_tokens": 766075285.0, "step": 20076 }, { "epoch": 2.5540007632616715, "ewc_loss": 0.07253806293010712, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036405251012183726, "grad_norm": 8.643546104431152, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.863944411277771, "num_tokens": 766110600.0, "step": 20077 }, { "epoch": 2.554127973540262, "ewc_loss": 0.07225240767002106, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036119596916250885, "grad_norm": 8.5367431640625, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8542070388793945, "num_tokens": 766146877.0, "step": 20078 }, { "epoch": 2.5542551838188525, "ewc_loss": 0.07245893776416779, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003632612933870405, "grad_norm": 8.577535629272461, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8619949817657471, "num_tokens": 766185164.0, "step": 20079 }, { "epoch": 2.554382394097443, "ewc_loss": 0.07226600497961044, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003613319422584027, "grad_norm": 8.50401782989502, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.885368824005127, "num_tokens": 766221442.0, "step": 20080 }, { "epoch": 2.5545096043760336, "ewc_loss": 0.07246904820203781, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036336237099021673, "grad_norm": 8.629915237426758, "learning_rate": 1e-06, "loss": 0.5438, "mean_token_accuracy": 0.840678334236145, "num_tokens": 766255258.0, "step": 20081 }, { "epoch": 2.554636814654624, "ewc_loss": 0.07214942574501038, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036016610101796687, "grad_norm": 8.522767066955566, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8627637624740601, "num_tokens": 766292392.0, "step": 20082 }, { "epoch": 2.5547640249332146, "ewc_loss": 0.07253265380859375, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036399837699718773, "grad_norm": 8.54472541809082, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8559702634811401, "num_tokens": 766335312.0, "step": 20083 }, { "epoch": 2.554891235211805, "ewc_loss": 0.07177592813968658, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036131395609118044, "grad_norm": 8.518457412719727, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8670793771743774, "num_tokens": 766372189.0, "step": 20084 }, { "epoch": 2.5550184454903957, "ewc_loss": 0.0724925771355629, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036359764635562897, "grad_norm": 8.586092948913574, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8544146418571472, "num_tokens": 766414021.0, "step": 20085 }, { "epoch": 2.5551456557689862, "ewc_loss": 0.07179072499275208, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003614619199652225, "grad_norm": 8.556794166564941, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8793483972549438, "num_tokens": 766452005.0, "step": 20086 }, { "epoch": 2.5552728660475768, "ewc_loss": 0.07193541526794434, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036290884600020945, "grad_norm": 8.529582023620605, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8586119413375854, "num_tokens": 766492150.0, "step": 20087 }, { "epoch": 2.5554000763261673, "ewc_loss": 0.07251029461622238, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036377483047544956, "grad_norm": 8.525154113769531, "learning_rate": 1e-06, "loss": 0.4292, "mean_token_accuracy": 0.8757126331329346, "num_tokens": 766533889.0, "step": 20088 }, { "epoch": 2.555527286604758, "ewc_loss": 0.07194443047046661, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003629989514593035, "grad_norm": 8.581713676452637, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8656972646713257, "num_tokens": 766565008.0, "step": 20089 }, { "epoch": 2.5556544968833483, "ewc_loss": 0.07181013375520706, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036165601341053843, "grad_norm": 8.522065162658691, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8656190037727356, "num_tokens": 766605744.0, "step": 20090 }, { "epoch": 2.5557817071619384, "ewc_loss": 0.07209603488445282, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003645149990916252, "grad_norm": 8.59994888305664, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8747840523719788, "num_tokens": 766645782.0, "step": 20091 }, { "epoch": 2.5559089174405294, "ewc_loss": 0.0718122273683548, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036167690996080637, "grad_norm": 8.553526878356934, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8548843860626221, "num_tokens": 766683135.0, "step": 20092 }, { "epoch": 2.5560361277191195, "ewc_loss": 0.0724618062376976, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003632899315562099, "grad_norm": 8.560792922973633, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.865329384803772, "num_tokens": 766720691.0, "step": 20093 }, { "epoch": 2.5561633379977104, "ewc_loss": 0.07190442085266113, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.0003625988611020148, "grad_norm": 8.584762573242188, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8601875305175781, "num_tokens": 766755426.0, "step": 20094 }, { "epoch": 2.5562905482763005, "ewc_loss": 0.07184725999832153, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036202732007950544, "grad_norm": 8.556142807006836, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.871427059173584, "num_tokens": 766788315.0, "step": 20095 }, { "epoch": 2.5564177585548915, "ewc_loss": 0.07200342416763306, "ewc_loss_diag": 3.5762786865234375e-05, "ewc_loss_parallel": 0.00036358897341415286, "grad_norm": 8.525586128234863, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8802189826965332, "num_tokens": 766825664.0, "step": 20096 }, { "epoch": 2.5565449688334816, "ewc_loss": 0.07229426503181458, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036161451134830713, "grad_norm": 8.473942756652832, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8761740922927856, "num_tokens": 766868145.0, "step": 20097 }, { "epoch": 2.556672179112072, "ewc_loss": 0.07254046201705933, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003640764916781336, "grad_norm": 8.550894737243652, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8539295196533203, "num_tokens": 766907278.0, "step": 20098 }, { "epoch": 2.5567993893906626, "ewc_loss": 0.07240727543830872, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000362744671292603, "grad_norm": 8.489998817443848, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8771595358848572, "num_tokens": 766941662.0, "step": 20099 }, { "epoch": 2.556926599669253, "ewc_loss": 0.07273636758327484, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003660355869214982, "grad_norm": 8.654705047607422, "learning_rate": 1e-06, "loss": 0.5365, "mean_token_accuracy": 0.848363995552063, "num_tokens": 766977725.0, "step": 20100 }, { "epoch": 2.5570538099478437, "ewc_loss": 0.07230764627456665, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036174835986457765, "grad_norm": 8.484957695007324, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8593472242355347, "num_tokens": 767011011.0, "step": 20101 }, { "epoch": 2.5571810202264342, "ewc_loss": 0.07269603759050369, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003656322369351983, "grad_norm": 8.540657997131348, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8634698390960693, "num_tokens": 767047078.0, "step": 20102 }, { "epoch": 2.5573082305050248, "ewc_loss": 0.07236076891422272, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003622795338742435, "grad_norm": 8.51929759979248, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8615808486938477, "num_tokens": 767089208.0, "step": 20103 }, { "epoch": 2.5574354407836153, "ewc_loss": 0.07257604598999023, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036443237331695855, "grad_norm": 8.624781608581543, "learning_rate": 1e-06, "loss": 0.5261, "mean_token_accuracy": 0.8506040573120117, "num_tokens": 767131225.0, "step": 20104 }, { "epoch": 2.557562651062206, "ewc_loss": 0.07225427031517029, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036121459561400115, "grad_norm": 8.52938461303711, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8525341749191284, "num_tokens": 767165222.0, "step": 20105 }, { "epoch": 2.5576898613407963, "ewc_loss": 0.07256641983985901, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036433609784580767, "grad_norm": 8.55886173248291, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8663118481636047, "num_tokens": 767200870.0, "step": 20106 }, { "epoch": 2.557817071619387, "ewc_loss": 0.07233819365501404, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036205380456522107, "grad_norm": 8.52428913116455, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.864011824131012, "num_tokens": 767236881.0, "step": 20107 }, { "epoch": 2.5579442818979774, "ewc_loss": 0.07244326174259186, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003631044819485396, "grad_norm": 8.579645156860352, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8700414299964905, "num_tokens": 767269716.0, "step": 20108 }, { "epoch": 2.558071492176568, "ewc_loss": 0.0723019689321518, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036169157829135656, "grad_norm": 8.535669326782227, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.855612576007843, "num_tokens": 767306840.0, "step": 20109 }, { "epoch": 2.5581987024551585, "ewc_loss": 0.07242708653211594, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036294275196269155, "grad_norm": 8.515353202819824, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8775724768638611, "num_tokens": 767346966.0, "step": 20110 }, { "epoch": 2.558325912733749, "ewc_loss": 0.07245743274688721, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036324618849903345, "grad_norm": 8.551368713378906, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8622331023216248, "num_tokens": 767387084.0, "step": 20111 }, { "epoch": 2.5584531230123395, "ewc_loss": 0.072407066822052, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003627425467129797, "grad_norm": 8.65367603302002, "learning_rate": 1e-06, "loss": 0.5252, "mean_token_accuracy": 0.8461405038833618, "num_tokens": 767417857.0, "step": 20112 }, { "epoch": 2.55858033329093, "ewc_loss": 0.07219554483890533, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003606273385230452, "grad_norm": 8.491839408874512, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.86693274974823, "num_tokens": 767455771.0, "step": 20113 }, { "epoch": 2.5587075435695206, "ewc_loss": 0.07248225808143616, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036349447327665985, "grad_norm": 8.622733116149902, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8533259630203247, "num_tokens": 767493187.0, "step": 20114 }, { "epoch": 2.558834753848111, "ewc_loss": 0.0720820277929306, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035949211451224983, "grad_norm": 8.474249839782715, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.858223021030426, "num_tokens": 767534526.0, "step": 20115 }, { "epoch": 2.558961964126701, "ewc_loss": 0.07238364219665527, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036250826087780297, "grad_norm": 8.562708854675293, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8624330759048462, "num_tokens": 767569297.0, "step": 20116 }, { "epoch": 2.559089174405292, "ewc_loss": 0.07218602299690247, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003605320816859603, "grad_norm": 8.602054595947266, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8617302179336548, "num_tokens": 767605860.0, "step": 20117 }, { "epoch": 2.5592163846838822, "ewc_loss": 0.07207579910755157, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003594298614189029, "grad_norm": 8.463605880737305, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8671472072601318, "num_tokens": 767645252.0, "step": 20118 }, { "epoch": 2.559343594962473, "ewc_loss": 0.07235528528690338, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003622247604653239, "grad_norm": 8.643733024597168, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8647743463516235, "num_tokens": 767681942.0, "step": 20119 }, { "epoch": 2.5594708052410633, "ewc_loss": 0.07190421223640442, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003577139868866652, "grad_norm": 8.479084968566895, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8656481504440308, "num_tokens": 767720631.0, "step": 20120 }, { "epoch": 2.559598015519654, "ewc_loss": 0.07241169363260269, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003627888218034059, "grad_norm": 8.575983047485352, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8692649602890015, "num_tokens": 767757018.0, "step": 20121 }, { "epoch": 2.5597252257982444, "ewc_loss": 0.07202888280153275, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003589607076719403, "grad_norm": 8.47530460357666, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8683153986930847, "num_tokens": 767796046.0, "step": 20122 }, { "epoch": 2.559852436076835, "ewc_loss": 0.07248418033123016, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003635136818047613, "grad_norm": 8.520390510559082, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8760911822319031, "num_tokens": 767835678.0, "step": 20123 }, { "epoch": 2.5599796463554254, "ewc_loss": 0.07223504781723022, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036102236481383443, "grad_norm": 8.560748100280762, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8792361617088318, "num_tokens": 767870515.0, "step": 20124 }, { "epoch": 2.560106856634016, "ewc_loss": 0.07224272191524506, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003610991407185793, "grad_norm": 8.547231674194336, "learning_rate": 1e-06, "loss": 0.5176, "mean_token_accuracy": 0.8488430976867676, "num_tokens": 767911573.0, "step": 20125 }, { "epoch": 2.5602340669126065, "ewc_loss": 0.07226844877004623, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003613563603721559, "grad_norm": 8.48504638671875, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.863466203212738, "num_tokens": 767950161.0, "step": 20126 }, { "epoch": 2.560361277191197, "ewc_loss": 0.07233797013759613, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036205153446644545, "grad_norm": 8.544249534606934, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8622266054153442, "num_tokens": 767989431.0, "step": 20127 }, { "epoch": 2.5604884874697875, "ewc_loss": 0.07227125763893127, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036138450377620757, "grad_norm": 8.534462928771973, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8747580051422119, "num_tokens": 768023551.0, "step": 20128 }, { "epoch": 2.560615697748378, "ewc_loss": 0.0723707377910614, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003623792144935578, "grad_norm": 8.56299877166748, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8731591105461121, "num_tokens": 768055317.0, "step": 20129 }, { "epoch": 2.5607429080269686, "ewc_loss": 0.07220375537872314, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003607094695325941, "grad_norm": 8.591843605041504, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8569084405899048, "num_tokens": 768089575.0, "step": 20130 }, { "epoch": 2.560870118305559, "ewc_loss": 0.07219287008047104, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036060059210285544, "grad_norm": 8.55476188659668, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8547329306602478, "num_tokens": 768131795.0, "step": 20131 }, { "epoch": 2.5609973285841496, "ewc_loss": 0.07234812527894974, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003621531359385699, "grad_norm": 8.553799629211426, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8697837591171265, "num_tokens": 768165714.0, "step": 20132 }, { "epoch": 2.56112453886274, "ewc_loss": 0.0723026841878891, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003616987669374794, "grad_norm": 8.508715629577637, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8740466833114624, "num_tokens": 768203475.0, "step": 20133 }, { "epoch": 2.5612517491413307, "ewc_loss": 0.07228301465511322, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003615020541474223, "grad_norm": 8.549101829528809, "learning_rate": 1e-06, "loss": 0.4201, "mean_token_accuracy": 0.8817101716995239, "num_tokens": 768244841.0, "step": 20134 }, { "epoch": 2.561378959419921, "ewc_loss": 0.07223081588745117, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003609799896366894, "grad_norm": 8.467814445495605, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8615039587020874, "num_tokens": 768286502.0, "step": 20135 }, { "epoch": 2.5615061696985117, "ewc_loss": 0.07247728109359741, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036344470572657883, "grad_norm": 8.612210273742676, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8724827766418457, "num_tokens": 768320628.0, "step": 20136 }, { "epoch": 2.5616333799771023, "ewc_loss": 0.07200396060943604, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003587114333640784, "grad_norm": 8.490582466125488, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8581353425979614, "num_tokens": 768353630.0, "step": 20137 }, { "epoch": 2.561760590255693, "ewc_loss": 0.0726134330034256, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036480618291534483, "grad_norm": 8.57141399383545, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8639382123947144, "num_tokens": 768388170.0, "step": 20138 }, { "epoch": 2.5618878005342833, "ewc_loss": 0.07209012657403946, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003595731395762414, "grad_norm": 8.465660095214844, "learning_rate": 1e-06, "loss": 0.4121, "mean_token_accuracy": 0.8814080357551575, "num_tokens": 768428238.0, "step": 20139 }, { "epoch": 2.562015010812874, "ewc_loss": 0.07256728410720825, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003643447416834533, "grad_norm": 8.619132041931152, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8799133896827698, "num_tokens": 768460075.0, "step": 20140 }, { "epoch": 2.562142221091464, "ewc_loss": 0.07202720642089844, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003589439147617668, "grad_norm": 8.430129051208496, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8826775550842285, "num_tokens": 768493982.0, "step": 20141 }, { "epoch": 2.562269431370055, "ewc_loss": 0.07266786694526672, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003653505991678685, "grad_norm": 8.640715599060059, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8691003918647766, "num_tokens": 768529121.0, "step": 20142 }, { "epoch": 2.562396641648645, "ewc_loss": 0.07197414338588715, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035841335193254054, "grad_norm": 8.475485801696777, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8664107322692871, "num_tokens": 768560730.0, "step": 20143 }, { "epoch": 2.562523851927236, "ewc_loss": 0.07257697731256485, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036444165743887424, "grad_norm": 8.535455703735352, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8773818016052246, "num_tokens": 768603015.0, "step": 20144 }, { "epoch": 2.562651062205826, "ewc_loss": 0.072305828332901, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003617301117628813, "grad_norm": 8.526041984558105, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.864120602607727, "num_tokens": 768639847.0, "step": 20145 }, { "epoch": 2.5627782724844166, "ewc_loss": 0.07238411158323288, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036251297569833696, "grad_norm": 8.532001495361328, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8628577589988708, "num_tokens": 768681614.0, "step": 20146 }, { "epoch": 2.562905482763007, "ewc_loss": 0.07234218716621399, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003620937350206077, "grad_norm": 8.472363471984863, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8678299784660339, "num_tokens": 768722052.0, "step": 20147 }, { "epoch": 2.5630326930415976, "ewc_loss": 0.07244445383548737, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003631164145190269, "grad_norm": 8.568896293640137, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8577145338058472, "num_tokens": 768754191.0, "step": 20148 }, { "epoch": 2.563159903320188, "ewc_loss": 0.07225053012371063, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003611771680880338, "grad_norm": 8.55180835723877, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8541821241378784, "num_tokens": 768792981.0, "step": 20149 }, { "epoch": 2.5632871135987787, "ewc_loss": 0.072298564016819, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003616575268097222, "grad_norm": 8.496747970581055, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.8554369211196899, "num_tokens": 768827040.0, "step": 20150 }, { "epoch": 2.5634143238773692, "ewc_loss": 0.07248447835445404, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003635167086031288, "grad_norm": 8.525134086608887, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.863805890083313, "num_tokens": 768865669.0, "step": 20151 }, { "epoch": 2.5635415341559598, "ewc_loss": 0.07228811830282211, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003615530440583825, "grad_norm": 8.502714157104492, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8638337254524231, "num_tokens": 768903306.0, "step": 20152 }, { "epoch": 2.5636687444345503, "ewc_loss": 0.07236599177122116, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003623318043537438, "grad_norm": 8.51518726348877, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8693590760231018, "num_tokens": 768946322.0, "step": 20153 }, { "epoch": 2.563795954713141, "ewc_loss": 0.07264061272144318, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003626365796662867, "grad_norm": 8.485986709594727, "learning_rate": 1e-06, "loss": 0.4001, "mean_token_accuracy": 0.8830797672271729, "num_tokens": 768987350.0, "step": 20154 }, { "epoch": 2.5639231649917313, "ewc_loss": 0.0724804475903511, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003634763415902853, "grad_norm": 8.523883819580078, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8749035000801086, "num_tokens": 769030762.0, "step": 20155 }, { "epoch": 2.564050375270322, "ewc_loss": 0.07227909564971924, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036146288039162755, "grad_norm": 8.44815731048584, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8779003620147705, "num_tokens": 769069827.0, "step": 20156 }, { "epoch": 2.5641775855489124, "ewc_loss": 0.07254324853420258, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003641043440438807, "grad_norm": 8.540168762207031, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8646794557571411, "num_tokens": 769107742.0, "step": 20157 }, { "epoch": 2.564304795827503, "ewc_loss": 0.07231074571609497, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036177929723635316, "grad_norm": 8.477280616760254, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.863365113735199, "num_tokens": 769153286.0, "step": 20158 }, { "epoch": 2.5644320061060935, "ewc_loss": 0.07272498309612274, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365921703632921, "grad_norm": 8.59721565246582, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8679839968681335, "num_tokens": 769192646.0, "step": 20159 }, { "epoch": 2.564559216384684, "ewc_loss": 0.07226989418268204, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003613708249758929, "grad_norm": 8.51461410522461, "learning_rate": 1e-06, "loss": 0.5312, "mean_token_accuracy": 0.8426810503005981, "num_tokens": 769230360.0, "step": 20160 }, { "epoch": 2.5646864266632745, "ewc_loss": 0.07257561385631561, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003644279786385596, "grad_norm": 8.528759956359863, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8790788650512695, "num_tokens": 769270754.0, "step": 20161 }, { "epoch": 2.564813636941865, "ewc_loss": 0.07241454720497131, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036281737266108394, "grad_norm": 8.51960563659668, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8421834111213684, "num_tokens": 769310109.0, "step": 20162 }, { "epoch": 2.5649408472204556, "ewc_loss": 0.0725349485874176, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036402136902324855, "grad_norm": 8.591761589050293, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.863997757434845, "num_tokens": 769352101.0, "step": 20163 }, { "epoch": 2.5650680574990457, "ewc_loss": 0.07228255271911621, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036149745574221015, "grad_norm": 8.48692798614502, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8796494603157043, "num_tokens": 769386570.0, "step": 20164 }, { "epoch": 2.5651952677776366, "ewc_loss": 0.07264072448015213, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003650791186373681, "grad_norm": 8.577045440673828, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8778234720230103, "num_tokens": 769427167.0, "step": 20165 }, { "epoch": 2.5653224780562267, "ewc_loss": 0.07223242521286011, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036099617136642337, "grad_norm": 8.48890495300293, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8541696071624756, "num_tokens": 769465456.0, "step": 20166 }, { "epoch": 2.5654496883348177, "ewc_loss": 0.07275600731372833, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036623195046558976, "grad_norm": 8.707399368286133, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8787474036216736, "num_tokens": 769497135.0, "step": 20167 }, { "epoch": 2.5655768986134078, "ewc_loss": 0.0720667764544487, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000359339639544487, "grad_norm": 8.5645112991333, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8610241413116455, "num_tokens": 769533706.0, "step": 20168 }, { "epoch": 2.5657041088919987, "ewc_loss": 0.0725584328174591, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003642562369350344, "grad_norm": 8.546554565429688, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.860743522644043, "num_tokens": 769578698.0, "step": 20169 }, { "epoch": 2.565831319170589, "ewc_loss": 0.07231065630912781, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000361778395017609, "grad_norm": 8.516308784484863, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.878817081451416, "num_tokens": 769619799.0, "step": 20170 }, { "epoch": 2.5659585294491793, "ewc_loss": 0.07246898114681244, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036336170160211623, "grad_norm": 8.583209991455078, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8653912544250488, "num_tokens": 769655973.0, "step": 20171 }, { "epoch": 2.56608573972777, "ewc_loss": 0.07242659479379654, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003629378043115139, "grad_norm": 8.648484230041504, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8613925576210022, "num_tokens": 769692333.0, "step": 20172 }, { "epoch": 2.5662129500063604, "ewc_loss": 0.07225111126899719, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003611829597502947, "grad_norm": 8.535528182983398, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8694964051246643, "num_tokens": 769731804.0, "step": 20173 }, { "epoch": 2.566340160284951, "ewc_loss": 0.07259435951709747, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036461546551436186, "grad_norm": 8.586069107055664, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8714388012886047, "num_tokens": 769767409.0, "step": 20174 }, { "epoch": 2.5664673705635415, "ewc_loss": 0.0722060352563858, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003607321996241808, "grad_norm": 8.47513484954834, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8620232343673706, "num_tokens": 769806468.0, "step": 20175 }, { "epoch": 2.566594580842132, "ewc_loss": 0.07263442873954773, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003650161379482597, "grad_norm": 8.599211692810059, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8727666139602661, "num_tokens": 769842025.0, "step": 20176 }, { "epoch": 2.5667217911207225, "ewc_loss": 0.07232582569122314, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003619300841819495, "grad_norm": 8.50199031829834, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8611643314361572, "num_tokens": 769884125.0, "step": 20177 }, { "epoch": 2.566849001399313, "ewc_loss": 0.07279868423938751, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003666586708277464, "grad_norm": 8.674010276794434, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8532454967498779, "num_tokens": 769920523.0, "step": 20178 }, { "epoch": 2.5669762116779036, "ewc_loss": 0.07229956984519958, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003616675967350602, "grad_norm": 8.50500774383545, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8623723983764648, "num_tokens": 769960141.0, "step": 20179 }, { "epoch": 2.567103421956494, "ewc_loss": 0.0729646310210228, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036831817124038935, "grad_norm": 8.662360191345215, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8669196367263794, "num_tokens": 769995281.0, "step": 20180 }, { "epoch": 2.5672306322350846, "ewc_loss": 0.0722106397151947, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036077830009162426, "grad_norm": 8.503515243530273, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8700505495071411, "num_tokens": 770028230.0, "step": 20181 }, { "epoch": 2.567357842513675, "ewc_loss": 0.07281588017940521, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036683070356957614, "grad_norm": 8.667333602905273, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8723580241203308, "num_tokens": 770068996.0, "step": 20182 }, { "epoch": 2.5674850527922657, "ewc_loss": 0.0722486674785614, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003611585416365415, "grad_norm": 8.547842025756836, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8542230725288391, "num_tokens": 770107018.0, "step": 20183 }, { "epoch": 2.567612263070856, "ewc_loss": 0.07258419692516327, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003645138640422374, "grad_norm": 8.575242042541504, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8646218180656433, "num_tokens": 770140251.0, "step": 20184 }, { "epoch": 2.5677394733494467, "ewc_loss": 0.07225478440523148, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036121971788816154, "grad_norm": 8.56979751586914, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8568799495697021, "num_tokens": 770182241.0, "step": 20185 }, { "epoch": 2.5678666836280373, "ewc_loss": 0.07246210426092148, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003632929001469165, "grad_norm": 8.57629108428955, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8803877830505371, "num_tokens": 770223267.0, "step": 20186 }, { "epoch": 2.567993893906628, "ewc_loss": 0.07224109768867493, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003610828425735235, "grad_norm": 8.508020401000977, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8541412353515625, "num_tokens": 770261856.0, "step": 20187 }, { "epoch": 2.5681211041852183, "ewc_loss": 0.07252851873636246, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036395707866176963, "grad_norm": 8.62062931060791, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8540621995925903, "num_tokens": 770299762.0, "step": 20188 }, { "epoch": 2.5682483144638084, "ewc_loss": 0.07207490503787994, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035942092654295266, "grad_norm": 8.477120399475098, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8604303598403931, "num_tokens": 770333690.0, "step": 20189 }, { "epoch": 2.5683755247423994, "ewc_loss": 0.07256222516298294, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036429413012228906, "grad_norm": 8.57950496673584, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8709502220153809, "num_tokens": 770375755.0, "step": 20190 }, { "epoch": 2.5685027350209895, "ewc_loss": 0.072242371737957, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036109559005126357, "grad_norm": 8.559557914733887, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.874683141708374, "num_tokens": 770417805.0, "step": 20191 }, { "epoch": 2.5686299452995804, "ewc_loss": 0.0723629742860794, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036230162368156016, "grad_norm": 8.528485298156738, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8647923469543457, "num_tokens": 770458183.0, "step": 20192 }, { "epoch": 2.5687571555781705, "ewc_loss": 0.07232573628425598, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003619292110670358, "grad_norm": 8.586480140686035, "learning_rate": 1e-06, "loss": 0.4473, "mean_token_accuracy": 0.8673738241195679, "num_tokens": 770488973.0, "step": 20193 }, { "epoch": 2.5688843658567615, "ewc_loss": 0.0723002552986145, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003616744070313871, "grad_norm": 8.491422653198242, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8588156700134277, "num_tokens": 770529030.0, "step": 20194 }, { "epoch": 2.5690115761353516, "ewc_loss": 0.07257780432701111, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036444992292672396, "grad_norm": 8.588845252990723, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8549391627311707, "num_tokens": 770574697.0, "step": 20195 }, { "epoch": 2.569138786413942, "ewc_loss": 0.07231809198856354, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003618528426159173, "grad_norm": 8.47307014465332, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8724609613418579, "num_tokens": 770615417.0, "step": 20196 }, { "epoch": 2.5692659966925326, "ewc_loss": 0.0727025642991066, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036569751682691276, "grad_norm": 8.612770080566406, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8516643643379211, "num_tokens": 770653228.0, "step": 20197 }, { "epoch": 2.569393206971123, "ewc_loss": 0.0723625123500824, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036229699617251754, "grad_norm": 8.479399681091309, "learning_rate": 1e-06, "loss": 0.4234, "mean_token_accuracy": 0.8814680576324463, "num_tokens": 770693876.0, "step": 20198 }, { "epoch": 2.5695204172497137, "ewc_loss": 0.07296441495418549, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000368316046660766, "grad_norm": 8.67954158782959, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8639464378356934, "num_tokens": 770728754.0, "step": 20199 }, { "epoch": 2.5696476275283042, "ewc_loss": 0.07225765287876129, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036124844336882234, "grad_norm": 8.506927490234375, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8642364740371704, "num_tokens": 770767961.0, "step": 20200 }, { "epoch": 2.5697748378068948, "ewc_loss": 0.07296602427959442, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003683321119751781, "grad_norm": 8.658934593200684, "learning_rate": 1e-06, "loss": 0.5602, "mean_token_accuracy": 0.8387817144393921, "num_tokens": 770808880.0, "step": 20201 }, { "epoch": 2.5699020480854853, "ewc_loss": 0.07236724346876144, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003623443190008402, "grad_norm": 8.523539543151855, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8602837324142456, "num_tokens": 770846483.0, "step": 20202 }, { "epoch": 2.570029258364076, "ewc_loss": 0.07302181422710419, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003688900324050337, "grad_norm": 8.609456062316895, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8454880714416504, "num_tokens": 770884234.0, "step": 20203 }, { "epoch": 2.5701564686426663, "ewc_loss": 0.07242609560489655, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003629328275565058, "grad_norm": 8.607162475585938, "learning_rate": 1e-06, "loss": 0.515, "mean_token_accuracy": 0.851829469203949, "num_tokens": 770918785.0, "step": 20204 }, { "epoch": 2.570283678921257, "ewc_loss": 0.07268711924552917, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003655431210063398, "grad_norm": 8.537482261657715, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.887033224105835, "num_tokens": 770956889.0, "step": 20205 }, { "epoch": 2.5704108891998474, "ewc_loss": 0.07271130383014679, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003657848865259439, "grad_norm": 8.580419540405273, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.8540269732475281, "num_tokens": 770994937.0, "step": 20206 }, { "epoch": 2.570538099478438, "ewc_loss": 0.07260940223932266, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003647659032139927, "grad_norm": 8.52414321899414, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8722596168518066, "num_tokens": 771031451.0, "step": 20207 }, { "epoch": 2.5706653097570284, "ewc_loss": 0.0727168545126915, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003658404166344553, "grad_norm": 8.611955642700195, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8738116025924683, "num_tokens": 771061575.0, "step": 20208 }, { "epoch": 2.570792520035619, "ewc_loss": 0.07249625772237778, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036363446270115674, "grad_norm": 8.51114559173584, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8652224540710449, "num_tokens": 771103366.0, "step": 20209 }, { "epoch": 2.5709197303142095, "ewc_loss": 0.0728333443403244, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000367005355656147, "grad_norm": 8.557551383972168, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8700476884841919, "num_tokens": 771142178.0, "step": 20210 }, { "epoch": 2.5710469405928, "ewc_loss": 0.07261008769273758, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036477274261415005, "grad_norm": 8.548583984375, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8659390211105347, "num_tokens": 771179296.0, "step": 20211 }, { "epoch": 2.5711741508713906, "ewc_loss": 0.0727706030011177, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003663779061753303, "grad_norm": 8.553485870361328, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8583838939666748, "num_tokens": 771219084.0, "step": 20212 }, { "epoch": 2.571301361149981, "ewc_loss": 0.07257582247257233, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000364430045010522, "grad_norm": 8.605015754699707, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8724411129951477, "num_tokens": 771250609.0, "step": 20213 }, { "epoch": 2.571428571428571, "ewc_loss": 0.07263466715812683, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003650185826700181, "grad_norm": 8.524672508239746, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8639012575149536, "num_tokens": 771286191.0, "step": 20214 }, { "epoch": 2.571555781707162, "ewc_loss": 0.0726768970489502, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036544082104228437, "grad_norm": 8.533029556274414, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8644510507583618, "num_tokens": 771320898.0, "step": 20215 }, { "epoch": 2.5716829919857522, "ewc_loss": 0.07259080559015274, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003645799297373742, "grad_norm": 8.528515815734863, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8569247722625732, "num_tokens": 771353977.0, "step": 20216 }, { "epoch": 2.571810202264343, "ewc_loss": 0.07277627289295197, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036643457133322954, "grad_norm": 8.54574966430664, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.872281014919281, "num_tokens": 771391960.0, "step": 20217 }, { "epoch": 2.5719374125429333, "ewc_loss": 0.07262682169675827, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036494008963927627, "grad_norm": 8.502019882202148, "learning_rate": 1e-06, "loss": 0.4952, "mean_token_accuracy": 0.8572689294815063, "num_tokens": 771427764.0, "step": 20218 }, { "epoch": 2.572064622821524, "ewc_loss": 0.07281981408596039, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036686999374069273, "grad_norm": 8.497496604919434, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8657976984977722, "num_tokens": 771469706.0, "step": 20219 }, { "epoch": 2.5721918331001143, "ewc_loss": 0.07273042947053909, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365976186003536, "grad_norm": 8.529250144958496, "learning_rate": 1e-06, "loss": 0.3968, "mean_token_accuracy": 0.8866605758666992, "num_tokens": 771504296.0, "step": 20220 }, { "epoch": 2.572319043378705, "ewc_loss": 0.07287877798080444, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003674596664495766, "grad_norm": 8.565594673156738, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8656971454620361, "num_tokens": 771544496.0, "step": 20221 }, { "epoch": 2.5724462536572954, "ewc_loss": 0.07271592319011688, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003658311616163701, "grad_norm": 8.520078659057617, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8735707402229309, "num_tokens": 771584322.0, "step": 20222 }, { "epoch": 2.572573463935886, "ewc_loss": 0.0728181004524231, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036685290979221463, "grad_norm": 8.548067092895508, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8754773736000061, "num_tokens": 771621424.0, "step": 20223 }, { "epoch": 2.5727006742144765, "ewc_loss": 0.07259213924407959, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003645932301878929, "grad_norm": 8.484251976013184, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8664305210113525, "num_tokens": 771658173.0, "step": 20224 }, { "epoch": 2.572827884493067, "ewc_loss": 0.07286344468593597, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003673063183669001, "grad_norm": 8.589503288269043, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8572641611099243, "num_tokens": 771698182.0, "step": 20225 }, { "epoch": 2.5729550947716575, "ewc_loss": 0.07264554500579834, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365127285476774, "grad_norm": 8.526549339294434, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8501412272453308, "num_tokens": 771731559.0, "step": 20226 }, { "epoch": 2.573082305050248, "ewc_loss": 0.0727774053812027, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003664459509309381, "grad_norm": 8.510546684265137, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8621973991394043, "num_tokens": 771774362.0, "step": 20227 }, { "epoch": 2.5732095153288386, "ewc_loss": 0.07260487973690033, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003647207049652934, "grad_norm": 8.514601707458496, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.855420708656311, "num_tokens": 771810540.0, "step": 20228 }, { "epoch": 2.573336725607429, "ewc_loss": 0.0726952776312828, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003656246408354491, "grad_norm": 8.52576732635498, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8690363764762878, "num_tokens": 771846221.0, "step": 20229 }, { "epoch": 2.5734639358860196, "ewc_loss": 0.07265640795230865, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003652359009720385, "grad_norm": 8.577998161315918, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8745216131210327, "num_tokens": 771882268.0, "step": 20230 }, { "epoch": 2.57359114616461, "ewc_loss": 0.07253732532262802, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003640451468527317, "grad_norm": 8.522406578063965, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.863763153553009, "num_tokens": 771913176.0, "step": 20231 }, { "epoch": 2.5737183564432007, "ewc_loss": 0.07281912863254547, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036686318344436586, "grad_norm": 8.536657333374023, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8600132465362549, "num_tokens": 771948163.0, "step": 20232 }, { "epoch": 2.573845566721791, "ewc_loss": 0.07246977090835571, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036336955963633955, "grad_norm": 8.460155487060547, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8783951997756958, "num_tokens": 771988296.0, "step": 20233 }, { "epoch": 2.5739727770003817, "ewc_loss": 0.07298365980386734, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036850848118774593, "grad_norm": 8.626209259033203, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8721778392791748, "num_tokens": 772024330.0, "step": 20234 }, { "epoch": 2.5740999872789723, "ewc_loss": 0.0724399983882904, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036307182745076716, "grad_norm": 8.50074291229248, "learning_rate": 1e-06, "loss": 0.5023, "mean_token_accuracy": 0.8530220985412598, "num_tokens": 772062291.0, "step": 20235 }, { "epoch": 2.574227197557563, "ewc_loss": 0.07280069589614868, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036667881067842245, "grad_norm": 8.564061164855957, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8663759231567383, "num_tokens": 772110239.0, "step": 20236 }, { "epoch": 2.5743544078361533, "ewc_loss": 0.07262647151947021, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003649366262834519, "grad_norm": 8.549240112304688, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8617807626724243, "num_tokens": 772153201.0, "step": 20237 }, { "epoch": 2.574481618114744, "ewc_loss": 0.0726836621761322, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003655084583442658, "grad_norm": 8.601874351501465, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8675214052200317, "num_tokens": 772186816.0, "step": 20238 }, { "epoch": 2.574608828393334, "ewc_loss": 0.07257191836833954, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036439101677387953, "grad_norm": 8.53184700012207, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8756977915763855, "num_tokens": 772224238.0, "step": 20239 }, { "epoch": 2.574736038671925, "ewc_loss": 0.07273261249065399, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365998042980209, "grad_norm": 8.599166870117188, "learning_rate": 1e-06, "loss": 0.4375, "mean_token_accuracy": 0.8750193119049072, "num_tokens": 772263505.0, "step": 20240 }, { "epoch": 2.574863248950515, "ewc_loss": 0.07245510816574097, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036322290543466806, "grad_norm": 8.475590705871582, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8748735785484314, "num_tokens": 772303205.0, "step": 20241 }, { "epoch": 2.574990459229106, "ewc_loss": 0.07276514172554016, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003663233364932239, "grad_norm": 8.528493881225586, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8715524673461914, "num_tokens": 772344495.0, "step": 20242 }, { "epoch": 2.575117669507696, "ewc_loss": 0.07263469696044922, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036501887370832264, "grad_norm": 8.645269393920898, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8608125448226929, "num_tokens": 772380296.0, "step": 20243 }, { "epoch": 2.5752448797862866, "ewc_loss": 0.07249660044908524, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036363789695315063, "grad_norm": 8.523282051086426, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8793429136276245, "num_tokens": 772423933.0, "step": 20244 }, { "epoch": 2.575372090064877, "ewc_loss": 0.07277710735797882, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036644289502874017, "grad_norm": 8.59278678894043, "learning_rate": 1e-06, "loss": 0.5143, "mean_token_accuracy": 0.8510969281196594, "num_tokens": 772464514.0, "step": 20245 }, { "epoch": 2.5754993003434676, "ewc_loss": 0.07246282696723938, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003633001178968698, "grad_norm": 8.559666633605957, "learning_rate": 1e-06, "loss": 0.3938, "mean_token_accuracy": 0.884600043296814, "num_tokens": 772498501.0, "step": 20246 }, { "epoch": 2.575626510622058, "ewc_loss": 0.07266253232955933, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003652971936389804, "grad_norm": 8.518611907958984, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8615721464157104, "num_tokens": 772538072.0, "step": 20247 }, { "epoch": 2.5757537209006487, "ewc_loss": 0.07258175313472748, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003644893877208233, "grad_norm": 8.554693222045898, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.865183413028717, "num_tokens": 772577434.0, "step": 20248 }, { "epoch": 2.575880931179239, "ewc_loss": 0.07252426445484161, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036391455796547234, "grad_norm": 8.558479309082031, "learning_rate": 1e-06, "loss": 0.4103, "mean_token_accuracy": 0.8815587759017944, "num_tokens": 772612917.0, "step": 20249 }, { "epoch": 2.5760081414578297, "ewc_loss": 0.07277318835258484, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003664037794806063, "grad_norm": 8.498818397521973, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8761563301086426, "num_tokens": 772652939.0, "step": 20250 }, { "epoch": 2.5761353517364203, "ewc_loss": 0.07267263531684875, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036539818393066525, "grad_norm": 8.590877532958984, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8734328746795654, "num_tokens": 772688018.0, "step": 20251 }, { "epoch": 2.576262562015011, "ewc_loss": 0.07264202833175659, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003650921571534127, "grad_norm": 8.57248592376709, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8566449880599976, "num_tokens": 772728071.0, "step": 20252 }, { "epoch": 2.5763897722936013, "ewc_loss": 0.07269368320703506, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036560872104018927, "grad_norm": 8.597087860107422, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8658430576324463, "num_tokens": 772762730.0, "step": 20253 }, { "epoch": 2.576516982572192, "ewc_loss": 0.07258255779743195, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036449747858569026, "grad_norm": 8.56203842163086, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8700004816055298, "num_tokens": 772799125.0, "step": 20254 }, { "epoch": 2.5766441928507824, "ewc_loss": 0.07268793135881424, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003655511827673763, "grad_norm": 8.588289260864258, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.877823531627655, "num_tokens": 772838657.0, "step": 20255 }, { "epoch": 2.576771403129373, "ewc_loss": 0.07251624763011932, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003638344060163945, "grad_norm": 8.592218399047852, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8712517023086548, "num_tokens": 772876470.0, "step": 20256 }, { "epoch": 2.5768986134079634, "ewc_loss": 0.07257837802171707, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036445565638132393, "grad_norm": 8.592644691467285, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8607035875320435, "num_tokens": 772910369.0, "step": 20257 }, { "epoch": 2.577025823686554, "ewc_loss": 0.07252980768680573, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003639699425548315, "grad_norm": 8.569419860839844, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.870895504951477, "num_tokens": 772948129.0, "step": 20258 }, { "epoch": 2.5771530339651445, "ewc_loss": 0.07258391380310059, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003645109827630222, "grad_norm": 8.577391624450684, "learning_rate": 1e-06, "loss": 0.3818, "mean_token_accuracy": 0.8919059634208679, "num_tokens": 772985359.0, "step": 20259 }, { "epoch": 2.577280244243735, "ewc_loss": 0.07239866256713867, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036265849485062063, "grad_norm": 8.568880081176758, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.86793053150177, "num_tokens": 773019402.0, "step": 20260 }, { "epoch": 2.5774074545223256, "ewc_loss": 0.07255035638809204, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036417541559785604, "grad_norm": 8.594707489013672, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8649890422821045, "num_tokens": 773059176.0, "step": 20261 }, { "epoch": 2.5775346648009156, "ewc_loss": 0.07256283611059189, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000364300241926685, "grad_norm": 8.563505172729492, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8679443597793579, "num_tokens": 773094274.0, "step": 20262 }, { "epoch": 2.5776618750795066, "ewc_loss": 0.07260172069072723, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036468912730924785, "grad_norm": 8.548574447631836, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8648935556411743, "num_tokens": 773135679.0, "step": 20263 }, { "epoch": 2.5777890853580967, "ewc_loss": 0.07261306047439575, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036480248672887683, "grad_norm": 8.608960151672363, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8615908026695251, "num_tokens": 773174270.0, "step": 20264 }, { "epoch": 2.5779162956366877, "ewc_loss": 0.07251770794391632, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003638489870354533, "grad_norm": 8.534856796264648, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8693757653236389, "num_tokens": 773213582.0, "step": 20265 }, { "epoch": 2.5780435059152778, "ewc_loss": 0.0726715698838234, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365387590136379, "grad_norm": 8.662330627441406, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8608712553977966, "num_tokens": 773244980.0, "step": 20266 }, { "epoch": 2.5781707161938687, "ewc_loss": 0.07224725931882858, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003611444553826004, "grad_norm": 8.538619995117188, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8797868490219116, "num_tokens": 773287395.0, "step": 20267 }, { "epoch": 2.578297926472459, "ewc_loss": 0.07310041785240173, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036723463563248515, "grad_norm": 8.67249870300293, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8630002737045288, "num_tokens": 773324593.0, "step": 20268 }, { "epoch": 2.5784251367510493, "ewc_loss": 0.07252657413482666, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003614962042775005, "grad_norm": 8.587470054626465, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8611104488372803, "num_tokens": 773361802.0, "step": 20269 }, { "epoch": 2.57855234702964, "ewc_loss": 0.07262034714221954, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036487539182417095, "grad_norm": 8.668059349060059, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8710736036300659, "num_tokens": 773401348.0, "step": 20270 }, { "epoch": 2.5786795573082304, "ewc_loss": 0.07229365408420563, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036160837044008076, "grad_norm": 8.524768829345703, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8818162083625793, "num_tokens": 773439040.0, "step": 20271 }, { "epoch": 2.578806767586821, "ewc_loss": 0.0726490169763565, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003651620354503393, "grad_norm": 8.701756477355957, "learning_rate": 1e-06, "loss": 0.4122, "mean_token_accuracy": 0.8779395222663879, "num_tokens": 773471382.0, "step": 20272 }, { "epoch": 2.5789339778654115, "ewc_loss": 0.07249924540519714, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036122294841334224, "grad_norm": 8.591797828674316, "learning_rate": 1e-06, "loss": 0.4957, "mean_token_accuracy": 0.8544037342071533, "num_tokens": 773508924.0, "step": 20273 }, { "epoch": 2.579061188144002, "ewc_loss": 0.07253439724445343, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003640158101916313, "grad_norm": 8.585254669189453, "learning_rate": 1e-06, "loss": 0.4098, "mean_token_accuracy": 0.8809236884117126, "num_tokens": 773550219.0, "step": 20274 }, { "epoch": 2.5791883984225925, "ewc_loss": 0.07240718603134155, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003627436817623675, "grad_norm": 8.639121055603027, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8626840710639954, "num_tokens": 773589363.0, "step": 20275 }, { "epoch": 2.579315608701183, "ewc_loss": 0.07266979664564133, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036292843287810683, "grad_norm": 8.600894927978516, "learning_rate": 1e-06, "loss": 0.4784, "mean_token_accuracy": 0.8652889132499695, "num_tokens": 773630289.0, "step": 20276 }, { "epoch": 2.5794428189797736, "ewc_loss": 0.07269734889268875, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036320395884104073, "grad_norm": 8.633917808532715, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8669419884681702, "num_tokens": 773668660.0, "step": 20277 }, { "epoch": 2.579570029258364, "ewc_loss": 0.07265184074640274, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036274889134801924, "grad_norm": 8.646200180053711, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.862295925617218, "num_tokens": 773704318.0, "step": 20278 }, { "epoch": 2.5796972395369546, "ewc_loss": 0.07244111597537994, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000363083032425493, "grad_norm": 8.619104385375977, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8511713147163391, "num_tokens": 773745821.0, "step": 20279 }, { "epoch": 2.579824449815545, "ewc_loss": 0.0724775642156601, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036344752879813313, "grad_norm": 8.608892440795898, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8761757612228394, "num_tokens": 773782880.0, "step": 20280 }, { "epoch": 2.5799516600941357, "ewc_loss": 0.0724729374051094, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036340125370770693, "grad_norm": 8.588981628417969, "learning_rate": 1e-06, "loss": 0.5074, "mean_token_accuracy": 0.8514513969421387, "num_tokens": 773824880.0, "step": 20281 }, { "epoch": 2.580078870372726, "ewc_loss": 0.07239744067192078, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036264630034565926, "grad_norm": 8.57054615020752, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8695643544197083, "num_tokens": 773863210.0, "step": 20282 }, { "epoch": 2.5802060806513167, "ewc_loss": 0.07259529829025269, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036462480784393847, "grad_norm": 8.707354545593262, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8734510540962219, "num_tokens": 773903432.0, "step": 20283 }, { "epoch": 2.5803332909299073, "ewc_loss": 0.0722353383898735, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003610252751968801, "grad_norm": 8.569613456726074, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8734416961669922, "num_tokens": 773942424.0, "step": 20284 }, { "epoch": 2.580460501208498, "ewc_loss": 0.07274845242500305, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003661563969217241, "grad_norm": 8.724401473999023, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8643374443054199, "num_tokens": 773976598.0, "step": 20285 }, { "epoch": 2.5805877114870883, "ewc_loss": 0.07236354053020477, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003598658659029752, "grad_norm": 8.515887260437012, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8608075380325317, "num_tokens": 774016696.0, "step": 20286 }, { "epoch": 2.5807149217656784, "ewc_loss": 0.07311961054801941, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003674265753943473, "grad_norm": 8.723445892333984, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8587396144866943, "num_tokens": 774059046.0, "step": 20287 }, { "epoch": 2.5808421320442694, "ewc_loss": 0.07214865833520889, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036015844671055675, "grad_norm": 8.489309310913086, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8620482087135315, "num_tokens": 774095322.0, "step": 20288 }, { "epoch": 2.5809693423228595, "ewc_loss": 0.0730590745806694, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003692626196425408, "grad_norm": 8.80815315246582, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8547821640968323, "num_tokens": 774134506.0, "step": 20289 }, { "epoch": 2.5810965526014504, "ewc_loss": 0.07205080986022949, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035917997593060136, "grad_norm": 8.475217819213867, "learning_rate": 1e-06, "loss": 0.4161, "mean_token_accuracy": 0.8824172019958496, "num_tokens": 774176997.0, "step": 20290 }, { "epoch": 2.5812237628800405, "ewc_loss": 0.07340265065431595, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003702569811139256, "grad_norm": 8.87404727935791, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8601695895195007, "num_tokens": 774213900.0, "step": 20291 }, { "epoch": 2.5813509731586315, "ewc_loss": 0.07211914658546448, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003598633629735559, "grad_norm": 8.503400802612305, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8591129779815674, "num_tokens": 774259687.0, "step": 20292 }, { "epoch": 2.5814781834372216, "ewc_loss": 0.07326485216617584, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003713203768711537, "grad_norm": 8.866503715515137, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8710207343101501, "num_tokens": 774297200.0, "step": 20293 }, { "epoch": 2.581605393715812, "ewc_loss": 0.07201506942510605, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00035882258089259267, "grad_norm": 8.486380577087402, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8797568082809448, "num_tokens": 774336642.0, "step": 20294 }, { "epoch": 2.5817326039944026, "ewc_loss": 0.07334382832050323, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037211019662208855, "grad_norm": 8.97392463684082, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8506863117218018, "num_tokens": 774376762.0, "step": 20295 }, { "epoch": 2.581859814272993, "ewc_loss": 0.07192660868167877, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003579379990696907, "grad_norm": 8.392657279968262, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.87527996301651, "num_tokens": 774421360.0, "step": 20296 }, { "epoch": 2.5819870245515837, "ewc_loss": 0.07384132593870163, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003770851471927017, "grad_norm": 9.070823669433594, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8737108707427979, "num_tokens": 774460850.0, "step": 20297 }, { "epoch": 2.582114234830174, "ewc_loss": 0.0720415785908699, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003590876585803926, "grad_norm": 8.455880165100098, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8648971915245056, "num_tokens": 774497542.0, "step": 20298 }, { "epoch": 2.5822414451087647, "ewc_loss": 0.07393898814916611, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037806175532750785, "grad_norm": 8.942859649658203, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8557169437408447, "num_tokens": 774538164.0, "step": 20299 }, { "epoch": 2.5823686553873553, "ewc_loss": 0.07216361165046692, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000360307953087613, "grad_norm": 8.58240795135498, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8618534803390503, "num_tokens": 774576118.0, "step": 20300 }, { "epoch": 2.582495865665946, "ewc_loss": 0.07343542575836182, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003730261232703924, "grad_norm": 8.760146141052246, "learning_rate": 1e-06, "loss": 0.3973, "mean_token_accuracy": 0.8881431818008423, "num_tokens": 774617506.0, "step": 20301 }, { "epoch": 2.5826230759445363, "ewc_loss": 0.07246963679790497, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036336822086013854, "grad_norm": 8.700596809387207, "learning_rate": 1e-06, "loss": 0.5456, "mean_token_accuracy": 0.8422752618789673, "num_tokens": 774653720.0, "step": 20302 }, { "epoch": 2.582750286223127, "ewc_loss": 0.07278374582529068, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003665093390736729, "grad_norm": 8.69552993774414, "learning_rate": 1e-06, "loss": 0.3912, "mean_token_accuracy": 0.8869481086730957, "num_tokens": 774691056.0, "step": 20303 }, { "epoch": 2.5828774965017174, "ewc_loss": 0.07275079190731049, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036617982550524175, "grad_norm": 8.629240036010742, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8603817224502563, "num_tokens": 774726514.0, "step": 20304 }, { "epoch": 2.583004706780308, "ewc_loss": 0.07260692864656448, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003647411649581045, "grad_norm": 8.705365180969238, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8575125932693481, "num_tokens": 774760831.0, "step": 20305 }, { "epoch": 2.5831319170588984, "ewc_loss": 0.07268251478672028, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036549699143506587, "grad_norm": 8.598894119262695, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8768569231033325, "num_tokens": 774799399.0, "step": 20306 }, { "epoch": 2.583259127337489, "ewc_loss": 0.07283704727888107, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003670423466246575, "grad_norm": 8.657550811767578, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8591457605361938, "num_tokens": 774839910.0, "step": 20307 }, { "epoch": 2.5833863376160795, "ewc_loss": 0.0724920704960823, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003635925822891295, "grad_norm": 8.537199020385742, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.868396520614624, "num_tokens": 774875694.0, "step": 20308 }, { "epoch": 2.58351354789467, "ewc_loss": 0.07298509776592255, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036852280027233064, "grad_norm": 8.726858139038086, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8683329224586487, "num_tokens": 774908020.0, "step": 20309 }, { "epoch": 2.5836407581732606, "ewc_loss": 0.07252198457717896, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036389174056239426, "grad_norm": 8.548386573791504, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8560572266578674, "num_tokens": 774946716.0, "step": 20310 }, { "epoch": 2.583767968451851, "ewc_loss": 0.0731736421585083, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003704082628246397, "grad_norm": 8.742116928100586, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8614352941513062, "num_tokens": 774981732.0, "step": 20311 }, { "epoch": 2.583895178730441, "ewc_loss": 0.07234533131122589, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003621252253651619, "grad_norm": 8.508398056030273, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8706257939338684, "num_tokens": 775015280.0, "step": 20312 }, { "epoch": 2.584022389009032, "ewc_loss": 0.07308392226696014, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003695111081469804, "grad_norm": 8.682008743286133, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8742238283157349, "num_tokens": 775054518.0, "step": 20313 }, { "epoch": 2.5841495992876222, "ewc_loss": 0.07259667664766312, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003646386321634054, "grad_norm": 8.537866592407227, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8734025955200195, "num_tokens": 775091548.0, "step": 20314 }, { "epoch": 2.584276809566213, "ewc_loss": 0.0731477439403534, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003701492678374052, "grad_norm": 8.702828407287598, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8543383479118347, "num_tokens": 775130916.0, "step": 20315 }, { "epoch": 2.5844040198448033, "ewc_loss": 0.07252448797225952, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003639167407527566, "grad_norm": 8.529870986938477, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8787199854850769, "num_tokens": 775165875.0, "step": 20316 }, { "epoch": 2.584531230123394, "ewc_loss": 0.07313627004623413, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037003454053774476, "grad_norm": 8.717942237854004, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8516939878463745, "num_tokens": 775211463.0, "step": 20317 }, { "epoch": 2.5846584404019843, "ewc_loss": 0.07255332171916962, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036420507240109146, "grad_norm": 8.527877807617188, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8756632208824158, "num_tokens": 775248338.0, "step": 20318 }, { "epoch": 2.584785650680575, "ewc_loss": 0.0730396956205368, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036906887544319034, "grad_norm": 8.66264820098877, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8548541069030762, "num_tokens": 775285707.0, "step": 20319 }, { "epoch": 2.5849128609591654, "ewc_loss": 0.07256696373224258, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003643415111582726, "grad_norm": 8.522043228149414, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8734711408615112, "num_tokens": 775322682.0, "step": 20320 }, { "epoch": 2.585040071237756, "ewc_loss": 0.07306307554244995, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003693026374094188, "grad_norm": 8.604151725769043, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8765988349914551, "num_tokens": 775358274.0, "step": 20321 }, { "epoch": 2.5851672815163464, "ewc_loss": 0.07264479994773865, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003651199222076684, "grad_norm": 8.51310920715332, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8606266379356384, "num_tokens": 775394472.0, "step": 20322 }, { "epoch": 2.585294491794937, "ewc_loss": 0.07306834310293198, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003693553153425455, "grad_norm": 8.642800331115723, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8761876225471497, "num_tokens": 775433647.0, "step": 20323 }, { "epoch": 2.5854217020735275, "ewc_loss": 0.07281990349292755, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003668709541670978, "grad_norm": 8.577198028564453, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8565223217010498, "num_tokens": 775469159.0, "step": 20324 }, { "epoch": 2.585548912352118, "ewc_loss": 0.07313497364521027, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003700215893331915, "grad_norm": 8.6603422164917, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8751546144485474, "num_tokens": 775507329.0, "step": 20325 }, { "epoch": 2.5856761226307086, "ewc_loss": 0.07276584208011627, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003663302632048726, "grad_norm": 8.621180534362793, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8644031882286072, "num_tokens": 775544638.0, "step": 20326 }, { "epoch": 2.585803332909299, "ewc_loss": 0.07286521047353745, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003673239843919873, "grad_norm": 8.537428855895996, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8677127957344055, "num_tokens": 775584918.0, "step": 20327 }, { "epoch": 2.5859305431878896, "ewc_loss": 0.07309392094612122, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036961110890842974, "grad_norm": 8.635756492614746, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8619375228881836, "num_tokens": 775624214.0, "step": 20328 }, { "epoch": 2.58605775346648, "ewc_loss": 0.07268243283033371, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036549620563164353, "grad_norm": 8.60866928100586, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.854594349861145, "num_tokens": 775668372.0, "step": 20329 }, { "epoch": 2.5861849637450707, "ewc_loss": 0.07299035787582397, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003685754199977964, "grad_norm": 8.660175323486328, "learning_rate": 1e-06, "loss": 0.535, "mean_token_accuracy": 0.837933361530304, "num_tokens": 775700578.0, "step": 20330 }, { "epoch": 2.586312174023661, "ewc_loss": 0.07275965809822083, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036626847577281296, "grad_norm": 8.520689010620117, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8733495473861694, "num_tokens": 775746980.0, "step": 20331 }, { "epoch": 2.5864393843022517, "ewc_loss": 0.07308267056941986, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036949862260371447, "grad_norm": 8.607105255126953, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8601164221763611, "num_tokens": 775780762.0, "step": 20332 }, { "epoch": 2.5865665945808423, "ewc_loss": 0.07275442779064178, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036621620529331267, "grad_norm": 8.55033016204834, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.872307538986206, "num_tokens": 775815591.0, "step": 20333 }, { "epoch": 2.586693804859433, "ewc_loss": 0.07309390604496002, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036961096338927746, "grad_norm": 8.654958724975586, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8584692478179932, "num_tokens": 775856414.0, "step": 20334 }, { "epoch": 2.5868210151380233, "ewc_loss": 0.07265028357505798, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365174695616588, "grad_norm": 8.525107383728027, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8664911985397339, "num_tokens": 775896939.0, "step": 20335 }, { "epoch": 2.586948225416614, "ewc_loss": 0.0731290876865387, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000369962741388008, "grad_norm": 8.65052318572998, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8605501651763916, "num_tokens": 775941778.0, "step": 20336 }, { "epoch": 2.587075435695204, "ewc_loss": 0.07263495028018951, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003650214057415724, "grad_norm": 8.547867774963379, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8655168414115906, "num_tokens": 775979346.0, "step": 20337 }, { "epoch": 2.587202645973795, "ewc_loss": 0.07317386567592621, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003704105329234153, "grad_norm": 8.710884094238281, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8639262914657593, "num_tokens": 776013793.0, "step": 20338 }, { "epoch": 2.587329856252385, "ewc_loss": 0.07264360785484314, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003651079605333507, "grad_norm": 8.54566478729248, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8796194791793823, "num_tokens": 776058513.0, "step": 20339 }, { "epoch": 2.587457066530976, "ewc_loss": 0.07315713167190552, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003702431858982891, "grad_norm": 8.691289901733398, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8669397830963135, "num_tokens": 776102790.0, "step": 20340 }, { "epoch": 2.587584276809566, "ewc_loss": 0.07248653471469879, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036353719769977033, "grad_norm": 8.581814765930176, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8812973499298096, "num_tokens": 776133898.0, "step": 20341 }, { "epoch": 2.5877114870881566, "ewc_loss": 0.07308945059776306, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036956643452867866, "grad_norm": 8.683486938476562, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8647902607917786, "num_tokens": 776167590.0, "step": 20342 }, { "epoch": 2.587838697366747, "ewc_loss": 0.07264766097068787, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003651484730653465, "grad_norm": 8.525473594665527, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8598303198814392, "num_tokens": 776207966.0, "step": 20343 }, { "epoch": 2.5879659076453376, "ewc_loss": 0.07298649102449417, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003685367701109499, "grad_norm": 8.634491920471191, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.866250216960907, "num_tokens": 776245601.0, "step": 20344 }, { "epoch": 2.588093117923928, "ewc_loss": 0.0727437287569046, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036610919050872326, "grad_norm": 9.150242805480957, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8748766779899597, "num_tokens": 776285377.0, "step": 20345 }, { "epoch": 2.5882203282025187, "ewc_loss": 0.07219165563583374, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036058842670172453, "grad_norm": 8.412763595581055, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8580840826034546, "num_tokens": 776328116.0, "step": 20346 }, { "epoch": 2.588347538481109, "ewc_loss": 0.07358726859092712, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003745446156244725, "grad_norm": 8.787531852722168, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8499281406402588, "num_tokens": 776369008.0, "step": 20347 }, { "epoch": 2.5884747487596997, "ewc_loss": 0.07203811407089233, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003590530250221491, "grad_norm": 8.424680709838867, "learning_rate": 1e-06, "loss": 0.4946, "mean_token_accuracy": 0.8568679094314575, "num_tokens": 776410151.0, "step": 20348 }, { "epoch": 2.5886019590382903, "ewc_loss": 0.07376974821090698, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037636939669027925, "grad_norm": 8.820592880249023, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.8906987905502319, "num_tokens": 776445944.0, "step": 20349 }, { "epoch": 2.588729169316881, "ewc_loss": 0.07212156057357788, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003598874609451741, "grad_norm": 8.500121116638184, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8697798848152161, "num_tokens": 776485388.0, "step": 20350 }, { "epoch": 2.5888563795954713, "ewc_loss": 0.07356351613998413, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003743070119526237, "grad_norm": 8.792728424072266, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8480052947998047, "num_tokens": 776529135.0, "step": 20351 }, { "epoch": 2.588983589874062, "ewc_loss": 0.07294824719429016, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00036327147972770035, "grad_norm": 8.681403160095215, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8588865995407104, "num_tokens": 776564310.0, "step": 20352 }, { "epoch": 2.5891108001526524, "ewc_loss": 0.07291871309280396, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003678590292111039, "grad_norm": 8.639182090759277, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8802287578582764, "num_tokens": 776609004.0, "step": 20353 }, { "epoch": 2.589238010431243, "ewc_loss": 0.0727873370051384, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003665452532004565, "grad_norm": 8.581031799316406, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8725923299789429, "num_tokens": 776645796.0, "step": 20354 }, { "epoch": 2.5893652207098334, "ewc_loss": 0.07283826172351837, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003670545411296189, "grad_norm": 8.642117500305176, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8563646078109741, "num_tokens": 776684248.0, "step": 20355 }, { "epoch": 2.589492430988424, "ewc_loss": 0.07285838574171066, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003672557359095663, "grad_norm": 8.572490692138672, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8691866397857666, "num_tokens": 776722684.0, "step": 20356 }, { "epoch": 2.5896196412670145, "ewc_loss": 0.07307670265436172, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036943890154361725, "grad_norm": 8.617371559143066, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8708400726318359, "num_tokens": 776764635.0, "step": 20357 }, { "epoch": 2.589746851545605, "ewc_loss": 0.07290272414684296, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003676991618704051, "grad_norm": 8.6898193359375, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8549083471298218, "num_tokens": 776805683.0, "step": 20358 }, { "epoch": 2.5898740618241956, "ewc_loss": 0.07297264039516449, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003683982649818063, "grad_norm": 8.708351135253906, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8688058257102966, "num_tokens": 776841229.0, "step": 20359 }, { "epoch": 2.5900012721027856, "ewc_loss": 0.07273177802562714, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003659896901808679, "grad_norm": 9.191149711608887, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8724717497825623, "num_tokens": 776881319.0, "step": 20360 }, { "epoch": 2.5901284823813766, "ewc_loss": 0.07220834493637085, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000360755278961733, "grad_norm": 8.432483673095703, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8713237643241882, "num_tokens": 776919436.0, "step": 20361 }, { "epoch": 2.5902556926599667, "ewc_loss": 0.07358884066343307, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003745602734852582, "grad_norm": 8.82461166381836, "learning_rate": 1e-06, "loss": 0.527, "mean_token_accuracy": 0.848401665687561, "num_tokens": 776958644.0, "step": 20362 }, { "epoch": 2.5903829029385577, "ewc_loss": 0.07224459946155548, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003611178253777325, "grad_norm": 8.543561935424805, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8827893733978271, "num_tokens": 776994509.0, "step": 20363 }, { "epoch": 2.5905101132171477, "ewc_loss": 0.0735442191362381, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003741141117643565, "grad_norm": 8.760946273803711, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8720403909683228, "num_tokens": 777037557.0, "step": 20364 }, { "epoch": 2.5906373234957387, "ewc_loss": 0.07255369424819946, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003642088267952204, "grad_norm": 8.499921798706055, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.8458938598632812, "num_tokens": 777077237.0, "step": 20365 }, { "epoch": 2.590764533774329, "ewc_loss": 0.07347387820482254, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037341067218221724, "grad_norm": 8.891602516174316, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8729132413864136, "num_tokens": 777120813.0, "step": 20366 }, { "epoch": 2.5908917440529193, "ewc_loss": 0.07242457568645477, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003629176353570074, "grad_norm": 8.57681941986084, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.854181706905365, "num_tokens": 777155809.0, "step": 20367 }, { "epoch": 2.59101895433151, "ewc_loss": 0.0734608918428421, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037328083999454975, "grad_norm": 8.75586223602295, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8669536113739014, "num_tokens": 777195868.0, "step": 20368 }, { "epoch": 2.5911461646101004, "ewc_loss": 0.07255934178829193, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000364265259122476, "grad_norm": 8.548945426940918, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8703139424324036, "num_tokens": 777233406.0, "step": 20369 }, { "epoch": 2.591273374888691, "ewc_loss": 0.07340681552886963, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003727400617208332, "grad_norm": 8.807672500610352, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8674862384796143, "num_tokens": 777272817.0, "step": 20370 }, { "epoch": 2.5914005851672814, "ewc_loss": 0.07264188677072525, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003650907601695508, "grad_norm": 8.58887767791748, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8611311912536621, "num_tokens": 777312398.0, "step": 20371 }, { "epoch": 2.591527795445872, "ewc_loss": 0.07314618676900864, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003701337263919413, "grad_norm": 8.677090644836426, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8767122626304626, "num_tokens": 777349375.0, "step": 20372 }, { "epoch": 2.5916550057244625, "ewc_loss": 0.07272115349769592, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036588337388820946, "grad_norm": 8.571038246154785, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8749727606773376, "num_tokens": 777394806.0, "step": 20373 }, { "epoch": 2.591782216003053, "ewc_loss": 0.07308647781610489, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003695366613101214, "grad_norm": 8.726317405700684, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8710060715675354, "num_tokens": 777434350.0, "step": 20374 }, { "epoch": 2.5919094262816436, "ewc_loss": 0.0726124569773674, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003647964622359723, "grad_norm": 8.550481796264648, "learning_rate": 1e-06, "loss": 0.5321, "mean_token_accuracy": 0.8396117687225342, "num_tokens": 777469690.0, "step": 20375 }, { "epoch": 2.592036636560234, "ewc_loss": 0.07335985451936722, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037227041320875287, "grad_norm": 8.828594207763672, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.857999324798584, "num_tokens": 777502095.0, "step": 20376 }, { "epoch": 2.5921638468388246, "ewc_loss": 0.07258529961109161, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003645248943939805, "grad_norm": 8.64358139038086, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8771073818206787, "num_tokens": 777540387.0, "step": 20377 }, { "epoch": 2.592291057117415, "ewc_loss": 0.07318789511919022, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037055081338621676, "grad_norm": 8.637602806091309, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8731640577316284, "num_tokens": 777581855.0, "step": 20378 }, { "epoch": 2.5924182673960057, "ewc_loss": 0.07288603484630585, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036753222229890525, "grad_norm": 8.694314956665039, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8667784929275513, "num_tokens": 777619852.0, "step": 20379 }, { "epoch": 2.592545477674596, "ewc_loss": 0.0727122575044632, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036579446168616414, "grad_norm": 8.663772583007812, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8598314523696899, "num_tokens": 777662772.0, "step": 20380 }, { "epoch": 2.5926726879531867, "ewc_loss": 0.07286307960748672, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000367302680388093, "grad_norm": 8.590805053710938, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8499993681907654, "num_tokens": 777701171.0, "step": 20381 }, { "epoch": 2.5927998982317773, "ewc_loss": 0.07290615141391754, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003677333879750222, "grad_norm": 8.791236877441406, "learning_rate": 1e-06, "loss": 0.5109, "mean_token_accuracy": 0.851740837097168, "num_tokens": 777738245.0, "step": 20382 }, { "epoch": 2.592927108510368, "ewc_loss": 0.07269083708524704, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003655802574940026, "grad_norm": 8.639445304870605, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8675365447998047, "num_tokens": 777775732.0, "step": 20383 }, { "epoch": 2.5930543187889583, "ewc_loss": 0.07287312299013138, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003674030886031687, "grad_norm": 8.657074928283691, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.865999698638916, "num_tokens": 777811077.0, "step": 20384 }, { "epoch": 2.5931815290675484, "ewc_loss": 0.0727011114358902, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003656829649116844, "grad_norm": 8.643463134765625, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8642615079879761, "num_tokens": 777854107.0, "step": 20385 }, { "epoch": 2.5933087393461394, "ewc_loss": 0.07291259616613388, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003677978238556534, "grad_norm": 8.64125919342041, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8656953573226929, "num_tokens": 777891950.0, "step": 20386 }, { "epoch": 2.5934359496247295, "ewc_loss": 0.07270254194736481, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003656972839962691, "grad_norm": 8.525290489196777, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8833259344100952, "num_tokens": 777932638.0, "step": 20387 }, { "epoch": 2.5935631599033204, "ewc_loss": 0.07311020791530609, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036977394483983517, "grad_norm": 8.744668006896973, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8646513223648071, "num_tokens": 777977733.0, "step": 20388 }, { "epoch": 2.5936903701819105, "ewc_loss": 0.07260449230670929, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003647167468443513, "grad_norm": 8.517024993896484, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8718529343605042, "num_tokens": 778020451.0, "step": 20389 }, { "epoch": 2.5938175804605015, "ewc_loss": 0.07339325547218323, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037260440876707435, "grad_norm": 8.725041389465332, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8723493218421936, "num_tokens": 778059594.0, "step": 20390 }, { "epoch": 2.5939447907390916, "ewc_loss": 0.07267244160175323, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365396321285516, "grad_norm": 8.602646827697754, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8703252077102661, "num_tokens": 778099235.0, "step": 20391 }, { "epoch": 2.594072001017682, "ewc_loss": 0.07323353737592697, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003710072487592697, "grad_norm": 8.725278854370117, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8765541315078735, "num_tokens": 778135430.0, "step": 20392 }, { "epoch": 2.5941992112962726, "ewc_loss": 0.07266958057880402, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003653676831163466, "grad_norm": 8.614378929138184, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8582038879394531, "num_tokens": 778175111.0, "step": 20393 }, { "epoch": 2.594326421574863, "ewc_loss": 0.07312183082103729, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003698902146425098, "grad_norm": 8.610123634338379, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8574700355529785, "num_tokens": 778211954.0, "step": 20394 }, { "epoch": 2.5944536318534537, "ewc_loss": 0.07296926528215408, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003683645336423069, "grad_norm": 8.671956062316895, "learning_rate": 1e-06, "loss": 0.5126, "mean_token_accuracy": 0.8509222269058228, "num_tokens": 778246580.0, "step": 20395 }, { "epoch": 2.594580842132044, "ewc_loss": 0.07293490320444107, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036802090471610427, "grad_norm": 8.618256568908691, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8722464442253113, "num_tokens": 778288659.0, "step": 20396 }, { "epoch": 2.5947080524106347, "ewc_loss": 0.07306406646966934, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003693125327117741, "grad_norm": 8.600149154663086, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8650382161140442, "num_tokens": 778325365.0, "step": 20397 }, { "epoch": 2.5948352626892253, "ewc_loss": 0.07303497195243835, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003690216108225286, "grad_norm": 8.638773918151855, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8735334873199463, "num_tokens": 778362982.0, "step": 20398 }, { "epoch": 2.594962472967816, "ewc_loss": 0.07301907241344452, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036886255838908255, "grad_norm": 8.643586158752441, "learning_rate": 1e-06, "loss": 0.5005, "mean_token_accuracy": 0.8513479232788086, "num_tokens": 778394230.0, "step": 20399 }, { "epoch": 2.5950896832464063, "ewc_loss": 0.07302939146757126, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003689657896757126, "grad_norm": 8.600127220153809, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8552342057228088, "num_tokens": 778434345.0, "step": 20400 }, { "epoch": 2.595216893524997, "ewc_loss": 0.07338377833366394, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003700682136695832, "grad_norm": 8.667757034301758, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8601523637771606, "num_tokens": 778478303.0, "step": 20401 }, { "epoch": 2.5953441038035874, "ewc_loss": 0.07307274639606476, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036695791641250253, "grad_norm": 8.597713470458984, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8720675706863403, "num_tokens": 778513915.0, "step": 20402 }, { "epoch": 2.595471314082178, "ewc_loss": 0.0731312707066536, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000369984598364681, "grad_norm": 8.564345359802246, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8801277875900269, "num_tokens": 778554294.0, "step": 20403 }, { "epoch": 2.5955985243607684, "ewc_loss": 0.07308657467365265, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003695376217365265, "grad_norm": 8.751988410949707, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8636220693588257, "num_tokens": 778592836.0, "step": 20404 }, { "epoch": 2.595725734639359, "ewc_loss": 0.07274159789085388, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000366087828297168, "grad_norm": 8.520843505859375, "learning_rate": 1e-06, "loss": 0.4884, "mean_token_accuracy": 0.8567661046981812, "num_tokens": 778629316.0, "step": 20405 }, { "epoch": 2.5958529449179495, "ewc_loss": 0.07340948283672333, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000372766749933362, "grad_norm": 8.753890037536621, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8616297245025635, "num_tokens": 778659065.0, "step": 20406 }, { "epoch": 2.59598015519654, "ewc_loss": 0.07269325852394104, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003656044718809426, "grad_norm": 8.505462646484375, "learning_rate": 1e-06, "loss": 0.4092, "mean_token_accuracy": 0.8813909292221069, "num_tokens": 778693949.0, "step": 20407 }, { "epoch": 2.5961073654751305, "ewc_loss": 0.07379692047834396, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003741996770258993, "grad_norm": 8.651347160339355, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.870712399482727, "num_tokens": 778738045.0, "step": 20408 }, { "epoch": 2.596234575753721, "ewc_loss": 0.07278178632259369, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003664896939881146, "grad_norm": 8.568032264709473, "learning_rate": 1e-06, "loss": 0.5895, "mean_token_accuracy": 0.8216409087181091, "num_tokens": 778769578.0, "step": 20409 }, { "epoch": 2.596361786032311, "ewc_loss": 0.07336076349020004, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037227949360385537, "grad_norm": 8.674322128295898, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8610458374023438, "num_tokens": 778810052.0, "step": 20410 }, { "epoch": 2.596488996310902, "ewc_loss": 0.07337465137243271, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036997697316110134, "grad_norm": 8.644355773925781, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8596692681312561, "num_tokens": 778850868.0, "step": 20411 }, { "epoch": 2.596616206589492, "ewc_loss": 0.07336273789405823, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003698578802868724, "grad_norm": 8.661426544189453, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.873455286026001, "num_tokens": 778890129.0, "step": 20412 }, { "epoch": 2.596743416868083, "ewc_loss": 0.07335861027240753, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003698165819514543, "grad_norm": 8.65255069732666, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8703262209892273, "num_tokens": 778923181.0, "step": 20413 }, { "epoch": 2.5968706271466733, "ewc_loss": 0.07325625419616699, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036879300023429096, "grad_norm": 8.618562698364258, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8586890697479248, "num_tokens": 778963328.0, "step": 20414 }, { "epoch": 2.596997837425264, "ewc_loss": 0.07336379587650299, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003698684449773282, "grad_norm": 8.588231086730957, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8759756088256836, "num_tokens": 779003932.0, "step": 20415 }, { "epoch": 2.5971250477038543, "ewc_loss": 0.07335267961025238, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000369757239241153, "grad_norm": 8.668943405151367, "learning_rate": 1e-06, "loss": 0.4949, "mean_token_accuracy": 0.85636305809021, "num_tokens": 779043913.0, "step": 20416 }, { "epoch": 2.597252257982445, "ewc_loss": 0.07313863933086395, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003676168853417039, "grad_norm": 8.61506175994873, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8622126579284668, "num_tokens": 779077126.0, "step": 20417 }, { "epoch": 2.5973794682610354, "ewc_loss": 0.07327763736248016, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003690067969728261, "grad_norm": 8.647651672363281, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.870337724685669, "num_tokens": 779111544.0, "step": 20418 }, { "epoch": 2.597506678539626, "ewc_loss": 0.07319965958595276, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003682270471472293, "grad_norm": 8.65439510345459, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8658067584037781, "num_tokens": 779145493.0, "step": 20419 }, { "epoch": 2.5976338888182164, "ewc_loss": 0.07313669472932816, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036759741487912834, "grad_norm": 8.582594871520996, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8577332496643066, "num_tokens": 779189216.0, "step": 20420 }, { "epoch": 2.597761099096807, "ewc_loss": 0.07309406995773315, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036717118928208947, "grad_norm": 8.612052917480469, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8726065158843994, "num_tokens": 779225914.0, "step": 20421 }, { "epoch": 2.5978883093753975, "ewc_loss": 0.07316407561302185, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036787119461223483, "grad_norm": 8.665816307067871, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8711768388748169, "num_tokens": 779264118.0, "step": 20422 }, { "epoch": 2.598015519653988, "ewc_loss": 0.07335970550775528, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00036738612107001245, "grad_norm": 8.646568298339844, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8692086935043335, "num_tokens": 779301694.0, "step": 20423 }, { "epoch": 2.5981427299325786, "ewc_loss": 0.07297451794147491, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036597560392692685, "grad_norm": 8.527926445007324, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8639497756958008, "num_tokens": 779339012.0, "step": 20424 }, { "epoch": 2.598269940211169, "ewc_loss": 0.07346350699663162, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003708655422087759, "grad_norm": 8.701408386230469, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8710862994194031, "num_tokens": 779376358.0, "step": 20425 }, { "epoch": 2.5983971504897596, "ewc_loss": 0.07270333170890808, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036570520023815334, "grad_norm": 8.54704761505127, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8647165298461914, "num_tokens": 779416344.0, "step": 20426 }, { "epoch": 2.59852436076835, "ewc_loss": 0.07317011803388596, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037037304718978703, "grad_norm": 8.644719123840332, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8537531495094299, "num_tokens": 779452189.0, "step": 20427 }, { "epoch": 2.5986515710469407, "ewc_loss": 0.07274174690246582, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036608934169635177, "grad_norm": 8.606144905090332, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8661372065544128, "num_tokens": 779493513.0, "step": 20428 }, { "epoch": 2.598778781325531, "ewc_loss": 0.07295706123113632, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003682424721773714, "grad_norm": 8.578914642333984, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8713475465774536, "num_tokens": 779533061.0, "step": 20429 }, { "epoch": 2.5989059916041217, "ewc_loss": 0.07300907373428345, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003687625576276332, "grad_norm": 8.635034561157227, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8603935837745667, "num_tokens": 779570159.0, "step": 20430 }, { "epoch": 2.5990332018827123, "ewc_loss": 0.07277509570121765, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036642278428189456, "grad_norm": 8.583161354064941, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8635740280151367, "num_tokens": 779609066.0, "step": 20431 }, { "epoch": 2.599160412161303, "ewc_loss": 0.07294176518917084, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036808953154832125, "grad_norm": 8.618539810180664, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8657079339027405, "num_tokens": 779651087.0, "step": 20432 }, { "epoch": 2.5992876224398933, "ewc_loss": 0.0727447122335434, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003661189693957567, "grad_norm": 8.572850227355957, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8781264424324036, "num_tokens": 779689531.0, "step": 20433 }, { "epoch": 2.599414832718484, "ewc_loss": 0.07293486595153809, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003680205554701388, "grad_norm": 8.651824951171875, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8617013692855835, "num_tokens": 779724062.0, "step": 20434 }, { "epoch": 2.599542042997074, "ewc_loss": 0.0727493017911911, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036616489524021745, "grad_norm": 8.56080150604248, "learning_rate": 1e-06, "loss": 0.5084, "mean_token_accuracy": 0.8521991968154907, "num_tokens": 779766029.0, "step": 20435 }, { "epoch": 2.599669253275665, "ewc_loss": 0.07306406646966934, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003693125327117741, "grad_norm": 8.542769432067871, "learning_rate": 1e-06, "loss": 0.5116, "mean_token_accuracy": 0.8503057956695557, "num_tokens": 779811065.0, "step": 20436 }, { "epoch": 2.599796463554255, "ewc_loss": 0.0729079395532608, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036775125772692263, "grad_norm": 8.563863754272461, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8526929616928101, "num_tokens": 779852442.0, "step": 20437 }, { "epoch": 2.599923673832846, "ewc_loss": 0.07308846712112427, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036955653922632337, "grad_norm": 8.63044548034668, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8709093928337097, "num_tokens": 779885993.0, "step": 20438 }, { "epoch": 2.600050884111436, "ewc_loss": 0.0728396475315094, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003670683363452554, "grad_norm": 8.542640686035156, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8686952590942383, "num_tokens": 779927791.0, "step": 20439 }, { "epoch": 2.6001780943900266, "ewc_loss": 0.07305188477039337, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036919076228514314, "grad_norm": 8.565741539001465, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8677589893341064, "num_tokens": 779961517.0, "step": 20440 }, { "epoch": 2.600305304668617, "ewc_loss": 0.07296741008758545, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000368345994502306, "grad_norm": 8.634709358215332, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8526006937026978, "num_tokens": 779992902.0, "step": 20441 }, { "epoch": 2.6004325149472076, "ewc_loss": 0.07294809818267822, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036815283237956464, "grad_norm": 8.546546936035156, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.867035984992981, "num_tokens": 780032849.0, "step": 20442 }, { "epoch": 2.600559725225798, "ewc_loss": 0.07301244884729385, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036879637627862394, "grad_norm": 8.607803344726562, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8770911693572998, "num_tokens": 780072563.0, "step": 20443 }, { "epoch": 2.6006869355043887, "ewc_loss": 0.07282184064388275, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036689030821435153, "grad_norm": 8.535343170166016, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8665196895599365, "num_tokens": 780115299.0, "step": 20444 }, { "epoch": 2.600814145782979, "ewc_loss": 0.07304994016885757, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003691712627187371, "grad_norm": 8.666415214538574, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.863860011100769, "num_tokens": 780155792.0, "step": 20445 }, { "epoch": 2.6009413560615697, "ewc_loss": 0.07271414995193481, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000365813379175961, "grad_norm": 8.605709075927734, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8608822822570801, "num_tokens": 780189040.0, "step": 20446 }, { "epoch": 2.6010685663401603, "ewc_loss": 0.07295027375221252, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036817463114857674, "grad_norm": 8.578197479248047, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8647873401641846, "num_tokens": 780229463.0, "step": 20447 }, { "epoch": 2.601195776618751, "ewc_loss": 0.072744220495224, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036611405084840953, "grad_norm": 8.556774139404297, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8768999576568604, "num_tokens": 780265961.0, "step": 20448 }, { "epoch": 2.6013229868973413, "ewc_loss": 0.07300330698490143, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003687049320433289, "grad_norm": 8.612058639526367, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.859839916229248, "num_tokens": 780303220.0, "step": 20449 }, { "epoch": 2.601450197175932, "ewc_loss": 0.07274472713470459, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000366119114914909, "grad_norm": 8.588594436645508, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8633042573928833, "num_tokens": 780340456.0, "step": 20450 }, { "epoch": 2.6015774074545224, "ewc_loss": 0.07299564778804779, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003686283016577363, "grad_norm": 8.634900093078613, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8640961647033691, "num_tokens": 780381402.0, "step": 20451 }, { "epoch": 2.601704617733113, "ewc_loss": 0.07284238934516907, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003670957521535456, "grad_norm": 8.560164451599121, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.8694777488708496, "num_tokens": 780422571.0, "step": 20452 }, { "epoch": 2.6018318280117034, "ewc_loss": 0.07301714271306992, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003688432916533202, "grad_norm": 8.582279205322266, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8612626791000366, "num_tokens": 780462447.0, "step": 20453 }, { "epoch": 2.601959038290294, "ewc_loss": 0.07305770367383957, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003692489117383957, "grad_norm": 8.587403297424316, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8643478155136108, "num_tokens": 780499183.0, "step": 20454 }, { "epoch": 2.6020862485688845, "ewc_loss": 0.07311159372329712, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003697878564707935, "grad_norm": 8.649761199951172, "learning_rate": 1e-06, "loss": 0.5037, "mean_token_accuracy": 0.8527649641036987, "num_tokens": 780539216.0, "step": 20455 }, { "epoch": 2.602213458847475, "ewc_loss": 0.07285933196544647, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003672652237582952, "grad_norm": 8.519230842590332, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8731352090835571, "num_tokens": 780578725.0, "step": 20456 }, { "epoch": 2.6023406691260655, "ewc_loss": 0.07328636944293976, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037153554148972034, "grad_norm": 8.719184875488281, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8664014935493469, "num_tokens": 780615175.0, "step": 20457 }, { "epoch": 2.6024678794046556, "ewc_loss": 0.07275425642728806, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003662144299596548, "grad_norm": 8.542526245117188, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8686108589172363, "num_tokens": 780651186.0, "step": 20458 }, { "epoch": 2.6025950896832466, "ewc_loss": 0.07337049394845963, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003723768168129027, "grad_norm": 8.692301750183105, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8582438230514526, "num_tokens": 780685550.0, "step": 20459 }, { "epoch": 2.6027222999618367, "ewc_loss": 0.0727672129869461, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003663440002128482, "grad_norm": 8.531210899353027, "learning_rate": 1e-06, "loss": 0.3901, "mean_token_accuracy": 0.8870997428894043, "num_tokens": 780725477.0, "step": 20460 }, { "epoch": 2.6028495102404277, "ewc_loss": 0.0733044296503067, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037171621806919575, "grad_norm": 8.663413047790527, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8692361116409302, "num_tokens": 780762481.0, "step": 20461 }, { "epoch": 2.6029767205190177, "ewc_loss": 0.07285018265247345, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003671736631076783, "grad_norm": 8.544731140136719, "learning_rate": 1e-06, "loss": 0.5285, "mean_token_accuracy": 0.8455764055252075, "num_tokens": 780800305.0, "step": 20462 }, { "epoch": 2.6031039307976087, "ewc_loss": 0.07348839938640594, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003711144963745028, "grad_norm": 8.719588279724121, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8679296970367432, "num_tokens": 780836271.0, "step": 20463 }, { "epoch": 2.603231141076199, "ewc_loss": 0.07280724495649338, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036674432340078056, "grad_norm": 8.521854400634766, "learning_rate": 1e-06, "loss": 0.4197, "mean_token_accuracy": 0.877387285232544, "num_tokens": 780877363.0, "step": 20464 }, { "epoch": 2.6033583513547893, "ewc_loss": 0.0734005868434906, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003726777504198253, "grad_norm": 8.712447166442871, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8530784845352173, "num_tokens": 780922982.0, "step": 20465 }, { "epoch": 2.60348556163338, "ewc_loss": 0.07285898923873901, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036726173129864037, "grad_norm": 8.552980422973633, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8632375001907349, "num_tokens": 780957925.0, "step": 20466 }, { "epoch": 2.6036127719119704, "ewc_loss": 0.07338186353445053, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003724904963746667, "grad_norm": 8.701576232910156, "learning_rate": 1e-06, "loss": 0.5363, "mean_token_accuracy": 0.8467780947685242, "num_tokens": 780996922.0, "step": 20467 }, { "epoch": 2.603739982190561, "ewc_loss": 0.07272110879421234, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003658829955384135, "grad_norm": 8.533808708190918, "learning_rate": 1e-06, "loss": 0.5356, "mean_token_accuracy": 0.8490282893180847, "num_tokens": 781037209.0, "step": 20468 }, { "epoch": 2.6038671924691514, "ewc_loss": 0.07355410605669022, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037177151534706354, "grad_norm": 8.64664077758789, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.858776330947876, "num_tokens": 781078077.0, "step": 20469 }, { "epoch": 2.603994402747742, "ewc_loss": 0.07288704812526703, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036754232132807374, "grad_norm": 8.550561904907227, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8644964098930359, "num_tokens": 781122516.0, "step": 20470 }, { "epoch": 2.6041216130263325, "ewc_loss": 0.07328467071056366, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003715185448527336, "grad_norm": 8.669567108154297, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.858396053314209, "num_tokens": 781156290.0, "step": 20471 }, { "epoch": 2.604248823304923, "ewc_loss": 0.07285437732934952, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036721565993502736, "grad_norm": 8.535079956054688, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8726911544799805, "num_tokens": 781192769.0, "step": 20472 }, { "epoch": 2.6043760335835135, "ewc_loss": 0.07329578697681427, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037162972148507833, "grad_norm": 8.62051010131836, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.8700516223907471, "num_tokens": 781227594.0, "step": 20473 }, { "epoch": 2.604503243862104, "ewc_loss": 0.07288588583469391, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036753067979589105, "grad_norm": 8.594635009765625, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8765716552734375, "num_tokens": 781259369.0, "step": 20474 }, { "epoch": 2.6046304541406946, "ewc_loss": 0.07355883717536926, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037181886727921665, "grad_norm": 8.660396575927734, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8637257814407349, "num_tokens": 781296788.0, "step": 20475 }, { "epoch": 2.604757664419285, "ewc_loss": 0.07321812212467194, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036841168184764683, "grad_norm": 8.578807830810547, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8673008680343628, "num_tokens": 781333103.0, "step": 20476 }, { "epoch": 2.6048848746978757, "ewc_loss": 0.07342125475406647, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000370443012798205, "grad_norm": 8.607305526733398, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8751753568649292, "num_tokens": 781372691.0, "step": 20477 }, { "epoch": 2.605012084976466, "ewc_loss": 0.0733417421579361, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003696479252539575, "grad_norm": 8.645679473876953, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8609771728515625, "num_tokens": 781410480.0, "step": 20478 }, { "epoch": 2.6051392952550567, "ewc_loss": 0.07318365573883057, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036806706339120865, "grad_norm": 8.611186027526855, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8571051359176636, "num_tokens": 781454110.0, "step": 20479 }, { "epoch": 2.6052665055336472, "ewc_loss": 0.07322616875171661, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036849218304269016, "grad_norm": 8.706703186035156, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8631926774978638, "num_tokens": 781494034.0, "step": 20480 }, { "epoch": 2.6053937158122378, "ewc_loss": 0.0730075091123581, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003663055249489844, "grad_norm": 8.58577823638916, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8632020354270935, "num_tokens": 781536225.0, "step": 20481 }, { "epoch": 2.6055209260908283, "ewc_loss": 0.07342252880334854, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037045576027594507, "grad_norm": 8.688981056213379, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8714136481285095, "num_tokens": 781573987.0, "step": 20482 }, { "epoch": 2.6056481363694184, "ewc_loss": 0.07283200323581696, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003645504475571215, "grad_norm": 8.54824447631836, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8691402673721313, "num_tokens": 781608987.0, "step": 20483 }, { "epoch": 2.6057753466480094, "ewc_loss": 0.07330049574375153, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003692354657687247, "grad_norm": 8.707919120788574, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8715865015983582, "num_tokens": 781646185.0, "step": 20484 }, { "epoch": 2.6059025569265994, "ewc_loss": 0.07289227843284607, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003651532460935414, "grad_norm": 8.61277961730957, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.8539078235626221, "num_tokens": 781682999.0, "step": 20485 }, { "epoch": 2.6060297672051904, "ewc_loss": 0.07318323850631714, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003680628433357924, "grad_norm": 8.600783348083496, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8628859519958496, "num_tokens": 781726052.0, "step": 20486 }, { "epoch": 2.6061569774837805, "ewc_loss": 0.07274650037288666, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036613683914765716, "grad_norm": 8.628837585449219, "learning_rate": 1e-06, "loss": 0.4169, "mean_token_accuracy": 0.879150927066803, "num_tokens": 781760562.0, "step": 20487 }, { "epoch": 2.6062841877623715, "ewc_loss": 0.07281370460987091, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036680890480056405, "grad_norm": 8.610051155090332, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8534660339355469, "num_tokens": 781803765.0, "step": 20488 }, { "epoch": 2.6064113980409616, "ewc_loss": 0.07290641963481903, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036773603642359376, "grad_norm": 8.648946762084961, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8655458688735962, "num_tokens": 781845419.0, "step": 20489 }, { "epoch": 2.606538608319552, "ewc_loss": 0.07275187224149704, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036619059392251074, "grad_norm": 8.59721851348877, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8741821646690369, "num_tokens": 781883371.0, "step": 20490 }, { "epoch": 2.6066658185981426, "ewc_loss": 0.07288391888141632, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036751103471033275, "grad_norm": 8.64767837524414, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8633097410202026, "num_tokens": 781919669.0, "step": 20491 }, { "epoch": 2.606793028876733, "ewc_loss": 0.0725570023059845, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036424194695428014, "grad_norm": 8.532382011413574, "learning_rate": 1e-06, "loss": 0.4545, "mean_token_accuracy": 0.8715978860855103, "num_tokens": 781953212.0, "step": 20492 }, { "epoch": 2.6069202391553237, "ewc_loss": 0.07297395914793015, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003684114490170032, "grad_norm": 8.616884231567383, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8688703179359436, "num_tokens": 781996824.0, "step": 20493 }, { "epoch": 2.607047449433914, "ewc_loss": 0.0727357417345047, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036602927139028907, "grad_norm": 8.570857048034668, "learning_rate": 1e-06, "loss": 0.3957, "mean_token_accuracy": 0.8841934204101562, "num_tokens": 782035350.0, "step": 20494 }, { "epoch": 2.6071746597125047, "ewc_loss": 0.07299482077360153, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003686200943775475, "grad_norm": 8.667430877685547, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8588321805000305, "num_tokens": 782073217.0, "step": 20495 }, { "epoch": 2.6073018699910953, "ewc_loss": 0.0726642906665802, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036531483056023717, "grad_norm": 8.599540710449219, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8593039512634277, "num_tokens": 782110777.0, "step": 20496 }, { "epoch": 2.607429080269686, "ewc_loss": 0.07302006334066391, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036887251189909875, "grad_norm": 8.650023460388184, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8592422008514404, "num_tokens": 782153300.0, "step": 20497 }, { "epoch": 2.6075562905482763, "ewc_loss": 0.07278121262788773, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036648401874117553, "grad_norm": 8.67383098602295, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8706762194633484, "num_tokens": 782191945.0, "step": 20498 }, { "epoch": 2.607683500826867, "ewc_loss": 0.0727381482720375, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003660533402580768, "grad_norm": 8.574630737304688, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.875452995300293, "num_tokens": 782230472.0, "step": 20499 }, { "epoch": 2.6078107111054574, "ewc_loss": 0.07300835102796555, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036875539808534086, "grad_norm": 8.680917739868164, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.8551346063613892, "num_tokens": 782263868.0, "step": 20500 }, { "epoch": 2.607937921384048, "ewc_loss": 0.07259640097618103, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003646358964033425, "grad_norm": 8.634021759033203, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8687325119972229, "num_tokens": 782299781.0, "step": 20501 }, { "epoch": 2.6080651316626384, "ewc_loss": 0.07299800217151642, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036865193396806717, "grad_norm": 8.635334014892578, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.858199417591095, "num_tokens": 782343376.0, "step": 20502 }, { "epoch": 2.608192341941229, "ewc_loss": 0.07303433865308762, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036657386226579547, "grad_norm": 8.515297889709473, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8782290816307068, "num_tokens": 782389894.0, "step": 20503 }, { "epoch": 2.6083195522198195, "ewc_loss": 0.07348963618278503, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003711268655024469, "grad_norm": 8.718751907348633, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8687764406204224, "num_tokens": 782430055.0, "step": 20504 }, { "epoch": 2.60844676249841, "ewc_loss": 0.07294793426990509, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003657098568510264, "grad_norm": 8.554352760314941, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8663672208786011, "num_tokens": 782467211.0, "step": 20505 }, { "epoch": 2.6085739727770005, "ewc_loss": 0.07359795272350311, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037221002276055515, "grad_norm": 8.736161231994629, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8573365807533264, "num_tokens": 782505913.0, "step": 20506 }, { "epoch": 2.608701183055591, "ewc_loss": 0.07289165258407593, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036514701787382364, "grad_norm": 8.580074310302734, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8653704524040222, "num_tokens": 782541858.0, "step": 20507 }, { "epoch": 2.608828393334181, "ewc_loss": 0.0733719915151596, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037239183438941836, "grad_norm": 8.775976181030273, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8658205270767212, "num_tokens": 782575854.0, "step": 20508 }, { "epoch": 2.608955603612772, "ewc_loss": 0.07294167578220367, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036564719630405307, "grad_norm": 8.619059562683105, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8679685592651367, "num_tokens": 782614005.0, "step": 20509 }, { "epoch": 2.609082813891362, "ewc_loss": 0.07337556779384613, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003699861408676952, "grad_norm": 8.732955932617188, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8736542463302612, "num_tokens": 782650147.0, "step": 20510 }, { "epoch": 2.609210024169953, "ewc_loss": 0.07297494262456894, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000365979882190004, "grad_norm": 8.66375732421875, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.866753339767456, "num_tokens": 782683215.0, "step": 20511 }, { "epoch": 2.6093372344485433, "ewc_loss": 0.07324237376451492, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003686542040668428, "grad_norm": 8.662053108215332, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.871496319770813, "num_tokens": 782721184.0, "step": 20512 }, { "epoch": 2.609464444727134, "ewc_loss": 0.07309228926897049, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003671533486340195, "grad_norm": 8.701142311096191, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8603845238685608, "num_tokens": 782759891.0, "step": 20513 }, { "epoch": 2.6095916550057243, "ewc_loss": 0.0730229914188385, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036646032822318375, "grad_norm": 8.64022159576416, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.855571448802948, "num_tokens": 782796398.0, "step": 20514 }, { "epoch": 2.609718865284315, "ewc_loss": 0.07324782013893127, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003687086282297969, "grad_norm": 8.649017333984375, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8756559491157532, "num_tokens": 782835968.0, "step": 20515 }, { "epoch": 2.6098460755629054, "ewc_loss": 0.07294099032878876, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036564041511155665, "grad_norm": 8.609657287597656, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.876245379447937, "num_tokens": 782871367.0, "step": 20516 }, { "epoch": 2.609973285841496, "ewc_loss": 0.07310961186885834, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003673266328405589, "grad_norm": 8.614356994628906, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8876535892486572, "num_tokens": 782905766.0, "step": 20517 }, { "epoch": 2.6101004961200864, "ewc_loss": 0.07308611273765564, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036709161940962076, "grad_norm": 8.651704788208008, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8706899285316467, "num_tokens": 782942256.0, "step": 20518 }, { "epoch": 2.610227706398677, "ewc_loss": 0.07299723476171494, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036620281753130257, "grad_norm": 8.597638130187988, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8761416673660278, "num_tokens": 782976156.0, "step": 20519 }, { "epoch": 2.6103549166772675, "ewc_loss": 0.0730614960193634, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003668454592116177, "grad_norm": 8.600211143493652, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8747700452804565, "num_tokens": 783011450.0, "step": 20520 }, { "epoch": 2.610482126955858, "ewc_loss": 0.07323144376277924, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036854486097581685, "grad_norm": 8.618839263916016, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8717782497406006, "num_tokens": 783050355.0, "step": 20521 }, { "epoch": 2.6106093372344485, "ewc_loss": 0.07303037494421005, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003665342228487134, "grad_norm": 8.587071418762207, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8767339587211609, "num_tokens": 783091621.0, "step": 20522 }, { "epoch": 2.610736547513039, "ewc_loss": 0.07320080697536469, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003682384849525988, "grad_norm": 8.649177551269531, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8555671572685242, "num_tokens": 783130471.0, "step": 20523 }, { "epoch": 2.6108637577916296, "ewc_loss": 0.07300058007240295, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003662362869363278, "grad_norm": 8.58924674987793, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8701072931289673, "num_tokens": 783168340.0, "step": 20524 }, { "epoch": 2.61099096807022, "ewc_loss": 0.0732736736536026, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003689671866595745, "grad_norm": 8.606392860412598, "learning_rate": 1e-06, "loss": 0.5188, "mean_token_accuracy": 0.846594512462616, "num_tokens": 783207279.0, "step": 20525 }, { "epoch": 2.6111181783488107, "ewc_loss": 0.07317578047513962, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036798827932216227, "grad_norm": 8.647490501403809, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8586934804916382, "num_tokens": 783243010.0, "step": 20526 }, { "epoch": 2.611245388627401, "ewc_loss": 0.07304871827363968, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036671763518825173, "grad_norm": 8.537456512451172, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.8566254377365112, "num_tokens": 783287548.0, "step": 20527 }, { "epoch": 2.6113725989059917, "ewc_loss": 0.07348746061325073, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037110503762960434, "grad_norm": 8.679473876953125, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8629205226898193, "num_tokens": 783331549.0, "step": 20528 }, { "epoch": 2.6114998091845822, "ewc_loss": 0.07274036109447479, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003660755173768848, "grad_norm": 8.573151588439941, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8763864040374756, "num_tokens": 783369445.0, "step": 20529 }, { "epoch": 2.6116270194631728, "ewc_loss": 0.07311882078647614, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036986006307415664, "grad_norm": 8.625395774841309, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8617907762527466, "num_tokens": 783413218.0, "step": 20530 }, { "epoch": 2.6117542297417633, "ewc_loss": 0.07295237481594086, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036819561501033604, "grad_norm": 8.634859085083008, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8656505346298218, "num_tokens": 783449944.0, "step": 20531 }, { "epoch": 2.611881440020354, "ewc_loss": 0.07301678508520126, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036883974098600447, "grad_norm": 8.653039932250977, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8659034371376038, "num_tokens": 783486142.0, "step": 20532 }, { "epoch": 2.612008650298944, "ewc_loss": 0.07294836640357971, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003681555681396276, "grad_norm": 8.61275863647461, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8734094500541687, "num_tokens": 783521170.0, "step": 20533 }, { "epoch": 2.612135860577535, "ewc_loss": 0.07310289144515991, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036970083601772785, "grad_norm": 8.677672386169434, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8635921478271484, "num_tokens": 783563902.0, "step": 20534 }, { "epoch": 2.612263070856125, "ewc_loss": 0.07280309498310089, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036670282133854926, "grad_norm": 8.549094200134277, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8714256286621094, "num_tokens": 783597655.0, "step": 20535 }, { "epoch": 2.612390281134716, "ewc_loss": 0.07328036427497864, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037147552939131856, "grad_norm": 8.638299942016602, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8560127019882202, "num_tokens": 783635233.0, "step": 20536 }, { "epoch": 2.612517491413306, "ewc_loss": 0.07285332679748535, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003672051534522325, "grad_norm": 8.540510177612305, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8672112226486206, "num_tokens": 783679677.0, "step": 20537 }, { "epoch": 2.6126447016918966, "ewc_loss": 0.0732380822300911, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037105270894244313, "grad_norm": 8.64645004272461, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.870491087436676, "num_tokens": 783720608.0, "step": 20538 }, { "epoch": 2.612771911970487, "ewc_loss": 0.0728902518749237, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003675743646454066, "grad_norm": 8.585516929626465, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8752214908599854, "num_tokens": 783753198.0, "step": 20539 }, { "epoch": 2.6128991222490776, "ewc_loss": 0.07315635681152344, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037023547338321805, "grad_norm": 8.624979972839355, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8649563193321228, "num_tokens": 783791068.0, "step": 20540 }, { "epoch": 2.613026332527668, "ewc_loss": 0.07297581434249878, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036843004636466503, "grad_norm": 8.541665077209473, "learning_rate": 1e-06, "loss": 0.4668, "mean_token_accuracy": 0.8681075572967529, "num_tokens": 783831319.0, "step": 20541 }, { "epoch": 2.6131535428062587, "ewc_loss": 0.07326164841651917, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003712883626576513, "grad_norm": 8.688400268554688, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8690131306648254, "num_tokens": 783867537.0, "step": 20542 }, { "epoch": 2.613280753084849, "ewc_loss": 0.07282277941703796, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003668996796477586, "grad_norm": 8.535431861877441, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8604640960693359, "num_tokens": 783907787.0, "step": 20543 }, { "epoch": 2.6134079633634397, "ewc_loss": 0.07330678403377533, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037173970486037433, "grad_norm": 8.66871452331543, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8605436086654663, "num_tokens": 783948558.0, "step": 20544 }, { "epoch": 2.6135351736420303, "ewc_loss": 0.07310573011636734, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036728777922689915, "grad_norm": 8.533510208129883, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.884677529335022, "num_tokens": 783986726.0, "step": 20545 }, { "epoch": 2.613662383920621, "ewc_loss": 0.0733303651213646, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037197553319856524, "grad_norm": 8.66641616821289, "learning_rate": 1e-06, "loss": 0.5065, "mean_token_accuracy": 0.8477169275283813, "num_tokens": 784023449.0, "step": 20546 }, { "epoch": 2.6137895941992113, "ewc_loss": 0.07280167937278748, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036668861866928637, "grad_norm": 8.48790168762207, "learning_rate": 1e-06, "loss": 0.4079, "mean_token_accuracy": 0.8833754658699036, "num_tokens": 784059529.0, "step": 20547 }, { "epoch": 2.613916804477802, "ewc_loss": 0.07349269092082977, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037359879934228957, "grad_norm": 8.654540061950684, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8676815629005432, "num_tokens": 784103530.0, "step": 20548 }, { "epoch": 2.6140440147563924, "ewc_loss": 0.07288294285535812, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003675013140309602, "grad_norm": 8.507478713989258, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8592595458030701, "num_tokens": 784141721.0, "step": 20549 }, { "epoch": 2.614171225034983, "ewc_loss": 0.07346662878990173, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037333814543671906, "grad_norm": 8.710431098937988, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8669935464859009, "num_tokens": 784175868.0, "step": 20550 }, { "epoch": 2.6142984353135734, "ewc_loss": 0.07281241565942764, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003667960409075022, "grad_norm": 8.574995994567871, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8782145977020264, "num_tokens": 784210646.0, "step": 20551 }, { "epoch": 2.614425645592164, "ewc_loss": 0.07345059514045715, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037317784153856337, "grad_norm": 8.668315887451172, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8716195821762085, "num_tokens": 784255588.0, "step": 20552 }, { "epoch": 2.6145528558707545, "ewc_loss": 0.07285767793655396, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036724863457493484, "grad_norm": 8.572575569152832, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8680097460746765, "num_tokens": 784292877.0, "step": 20553 }, { "epoch": 2.614680066149345, "ewc_loss": 0.07325038313865662, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003711757599376142, "grad_norm": 8.616312980651855, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8699741363525391, "num_tokens": 784336150.0, "step": 20554 }, { "epoch": 2.6148072764279355, "ewc_loss": 0.07315382361412048, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003702100657392293, "grad_norm": 8.58782958984375, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8806226849555969, "num_tokens": 784371547.0, "step": 20555 }, { "epoch": 2.6149344867065256, "ewc_loss": 0.07324285060167313, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037110038101673126, "grad_norm": 8.623618125915527, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8690276145935059, "num_tokens": 784411473.0, "step": 20556 }, { "epoch": 2.6150616969851166, "ewc_loss": 0.07320477068424225, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003707196156028658, "grad_norm": 8.608826637268066, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8569934368133545, "num_tokens": 784456713.0, "step": 20557 }, { "epoch": 2.6151889072637067, "ewc_loss": 0.0732266753911972, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037093862192705274, "grad_norm": 8.631610870361328, "learning_rate": 1e-06, "loss": 0.4148, "mean_token_accuracy": 0.880787193775177, "num_tokens": 784493685.0, "step": 20558 }, { "epoch": 2.6153161175422976, "ewc_loss": 0.07316413521766663, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037031318061053753, "grad_norm": 8.636826515197754, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.878657341003418, "num_tokens": 784534641.0, "step": 20559 }, { "epoch": 2.6154433278208877, "ewc_loss": 0.07323727011680603, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003710445307660848, "grad_norm": 8.634345054626465, "learning_rate": 1e-06, "loss": 0.537, "mean_token_accuracy": 0.8444443941116333, "num_tokens": 784576184.0, "step": 20560 }, { "epoch": 2.6155705380994787, "ewc_loss": 0.07315219938755035, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037019391311332583, "grad_norm": 8.640606880187988, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8698569536209106, "num_tokens": 784613851.0, "step": 20561 }, { "epoch": 2.615697748378069, "ewc_loss": 0.07316296547651291, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037030153907835484, "grad_norm": 8.613225936889648, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8686909079551697, "num_tokens": 784654133.0, "step": 20562 }, { "epoch": 2.6158249586566593, "ewc_loss": 0.07319819927215576, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037065387004986405, "grad_norm": 8.80107307434082, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8678742051124573, "num_tokens": 784690529.0, "step": 20563 }, { "epoch": 2.61595216893525, "ewc_loss": 0.07286269217729568, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003672987804748118, "grad_norm": 8.544970512390137, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8666906356811523, "num_tokens": 784731972.0, "step": 20564 }, { "epoch": 2.6160793792138404, "ewc_loss": 0.07353644073009491, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037403631722554564, "grad_norm": 8.690985679626465, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8614299893379211, "num_tokens": 784773275.0, "step": 20565 }, { "epoch": 2.616206589492431, "ewc_loss": 0.07288260757923126, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003674979379866272, "grad_norm": 8.601424217224121, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8505693674087524, "num_tokens": 784811468.0, "step": 20566 }, { "epoch": 2.6163337997710214, "ewc_loss": 0.07345381379127502, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037321000127121806, "grad_norm": 8.690673828125, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8681324124336243, "num_tokens": 784850611.0, "step": 20567 }, { "epoch": 2.616461010049612, "ewc_loss": 0.07293332368135452, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003680051304399967, "grad_norm": 8.580756187438965, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8718211054801941, "num_tokens": 784886502.0, "step": 20568 }, { "epoch": 2.6165882203282025, "ewc_loss": 0.07335467636585236, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003722186083905399, "grad_norm": 8.62807559967041, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8571122884750366, "num_tokens": 784924489.0, "step": 20569 }, { "epoch": 2.616715430606793, "ewc_loss": 0.07316476106643677, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003703194379340857, "grad_norm": 8.619817733764648, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8715412616729736, "num_tokens": 784957224.0, "step": 20570 }, { "epoch": 2.6168426408853835, "ewc_loss": 0.07326298207044601, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003713016922120005, "grad_norm": 8.648571014404297, "learning_rate": 1e-06, "loss": 0.545, "mean_token_accuracy": 0.8416663408279419, "num_tokens": 784997754.0, "step": 20571 }, { "epoch": 2.616969851163974, "ewc_loss": 0.07314404845237732, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037011236418038607, "grad_norm": 8.602415084838867, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8518894910812378, "num_tokens": 785033932.0, "step": 20572 }, { "epoch": 2.6170970614425646, "ewc_loss": 0.0732368677854538, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037104060174897313, "grad_norm": 8.626826286315918, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8642681837081909, "num_tokens": 785073190.0, "step": 20573 }, { "epoch": 2.617224271721155, "ewc_loss": 0.07322192192077637, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037089112447574735, "grad_norm": 8.606098175048828, "learning_rate": 1e-06, "loss": 0.4049, "mean_token_accuracy": 0.8831619620323181, "num_tokens": 785108397.0, "step": 20574 }, { "epoch": 2.6173514819997457, "ewc_loss": 0.07333570718765259, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037202893872745335, "grad_norm": 8.690094947814941, "learning_rate": 1e-06, "loss": 0.385, "mean_token_accuracy": 0.8936358690261841, "num_tokens": 785142671.0, "step": 20575 }, { "epoch": 2.617478692278336, "ewc_loss": 0.073188915848732, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037056105793453753, "grad_norm": 8.63107681274414, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8650696277618408, "num_tokens": 785183467.0, "step": 20576 }, { "epoch": 2.6176059025569267, "ewc_loss": 0.07333993911743164, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003720712848007679, "grad_norm": 8.596904754638672, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8564167618751526, "num_tokens": 785222072.0, "step": 20577 }, { "epoch": 2.6177331128355172, "ewc_loss": 0.07322773337364197, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003709491575136781, "grad_norm": 8.624801635742188, "learning_rate": 1e-06, "loss": 0.4813, "mean_token_accuracy": 0.8564401865005493, "num_tokens": 785260458.0, "step": 20578 }, { "epoch": 2.6178603231141078, "ewc_loss": 0.07314744591712952, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037014635745435953, "grad_norm": 8.589424133300781, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8528033494949341, "num_tokens": 785302434.0, "step": 20579 }, { "epoch": 2.6179875333926983, "ewc_loss": 0.07336661219596863, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037233796319924295, "grad_norm": 8.658340454101562, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8517572283744812, "num_tokens": 785338251.0, "step": 20580 }, { "epoch": 2.6181147436712884, "ewc_loss": 0.07300679385662079, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036873985663987696, "grad_norm": 8.553218841552734, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8628460764884949, "num_tokens": 785384789.0, "step": 20581 }, { "epoch": 2.6182419539498794, "ewc_loss": 0.0733332633972168, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003720044915098697, "grad_norm": 8.594344139099121, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.866441547870636, "num_tokens": 785424978.0, "step": 20582 }, { "epoch": 2.6183691642284694, "ewc_loss": 0.07328154146671295, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003714872582349926, "grad_norm": 8.617855072021484, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8600174188613892, "num_tokens": 785464421.0, "step": 20583 }, { "epoch": 2.6184963745070604, "ewc_loss": 0.0732470154762268, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003711419994942844, "grad_norm": 8.612739562988281, "learning_rate": 1e-06, "loss": 0.3945, "mean_token_accuracy": 0.8846466541290283, "num_tokens": 785496166.0, "step": 20584 }, { "epoch": 2.6186235847856505, "ewc_loss": 0.07333605736494064, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003720324602909386, "grad_norm": 8.786547660827637, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8519050478935242, "num_tokens": 785528504.0, "step": 20585 }, { "epoch": 2.6187507950642415, "ewc_loss": 0.07298757880926132, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003685476549435407, "grad_norm": 8.587179183959961, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8689800500869751, "num_tokens": 785570896.0, "step": 20586 }, { "epoch": 2.6188780053428315, "ewc_loss": 0.07346862554550171, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003733580815605819, "grad_norm": 8.670379638671875, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.862412691116333, "num_tokens": 785612464.0, "step": 20587 }, { "epoch": 2.619005215621422, "ewc_loss": 0.07311943173408508, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000369866203982383, "grad_norm": 8.622532844543457, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.867128849029541, "num_tokens": 785655500.0, "step": 20588 }, { "epoch": 2.6191324259000126, "ewc_loss": 0.07327085733413696, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037138047628104687, "grad_norm": 8.668611526489258, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8687280416488647, "num_tokens": 785691186.0, "step": 20589 }, { "epoch": 2.619259636178603, "ewc_loss": 0.07300581783056259, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036873004864901304, "grad_norm": 8.572225570678711, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8802892565727234, "num_tokens": 785728575.0, "step": 20590 }, { "epoch": 2.6193868464571937, "ewc_loss": 0.0733858197927475, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003725301066879183, "grad_norm": 8.656660079956055, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8637669086456299, "num_tokens": 785765782.0, "step": 20591 }, { "epoch": 2.619514056735784, "ewc_loss": 0.07308316230773926, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036950354115106165, "grad_norm": 8.577363014221191, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8712186217308044, "num_tokens": 785800130.0, "step": 20592 }, { "epoch": 2.6196412670143747, "ewc_loss": 0.07337634265422821, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003724353155121207, "grad_norm": 8.656190872192383, "learning_rate": 1e-06, "loss": 0.3719, "mean_token_accuracy": 0.8929951190948486, "num_tokens": 785836647.0, "step": 20593 }, { "epoch": 2.6197684772929652, "ewc_loss": 0.07308586686849594, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003695305495057255, "grad_norm": 8.639277458190918, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8611829280853271, "num_tokens": 785873897.0, "step": 20594 }, { "epoch": 2.6198956875715558, "ewc_loss": 0.07318323105573654, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037050418904982507, "grad_norm": 8.60048770904541, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8623754978179932, "num_tokens": 785914296.0, "step": 20595 }, { "epoch": 2.6200228978501463, "ewc_loss": 0.07328300178050995, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003715018683578819, "grad_norm": 8.651313781738281, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8539453148841858, "num_tokens": 785951426.0, "step": 20596 }, { "epoch": 2.620150108128737, "ewc_loss": 0.07297976315021515, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003684695402625948, "grad_norm": 8.575234413146973, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8698688745498657, "num_tokens": 785989906.0, "step": 20597 }, { "epoch": 2.6202773184073274, "ewc_loss": 0.07383286952972412, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003721177636180073, "grad_norm": 8.700227737426758, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.860305905342102, "num_tokens": 786024763.0, "step": 20598 }, { "epoch": 2.620404528685918, "ewc_loss": 0.07326643168926239, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036889477632939816, "grad_norm": 8.597380638122559, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8622856736183167, "num_tokens": 786063055.0, "step": 20599 }, { "epoch": 2.6205317389645084, "ewc_loss": 0.07368184626102448, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003730489406734705, "grad_norm": 8.660381317138672, "learning_rate": 1e-06, "loss": 0.5626, "mean_token_accuracy": 0.8384224772453308, "num_tokens": 786097675.0, "step": 20600 }, { "epoch": 2.620658949243099, "ewc_loss": 0.07343540340662003, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003705844865180552, "grad_norm": 8.631278038024902, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8781191110610962, "num_tokens": 786133721.0, "step": 20601 }, { "epoch": 2.6207861595216895, "ewc_loss": 0.07361616939306259, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003723921545315534, "grad_norm": 8.653791427612305, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.8483878970146179, "num_tokens": 786179615.0, "step": 20602 }, { "epoch": 2.62091336980028, "ewc_loss": 0.07325029373168945, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036873339558951557, "grad_norm": 8.58095932006836, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8791873455047607, "num_tokens": 786214945.0, "step": 20603 }, { "epoch": 2.6210405800788705, "ewc_loss": 0.07363682985305786, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037259882083162665, "grad_norm": 8.664331436157227, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8677313327789307, "num_tokens": 786250880.0, "step": 20604 }, { "epoch": 2.621167790357461, "ewc_loss": 0.07333812117576599, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036961163277737796, "grad_norm": 8.615093231201172, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8671238422393799, "num_tokens": 786286095.0, "step": 20605 }, { "epoch": 2.621295000636051, "ewc_loss": 0.07345699518918991, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037080043694004416, "grad_norm": 8.633198738098145, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8709737062454224, "num_tokens": 786319547.0, "step": 20606 }, { "epoch": 2.621422210914642, "ewc_loss": 0.07328350841999054, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003690655285026878, "grad_norm": 8.580031394958496, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.865617036819458, "num_tokens": 786358699.0, "step": 20607 }, { "epoch": 2.621549421193232, "ewc_loss": 0.07350483536720276, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037127878749743104, "grad_norm": 8.645217895507812, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8702284097671509, "num_tokens": 786399732.0, "step": 20608 }, { "epoch": 2.621676631471823, "ewc_loss": 0.07326212525367737, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003688517026603222, "grad_norm": 8.586641311645508, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8571890592575073, "num_tokens": 786437375.0, "step": 20609 }, { "epoch": 2.6218038417504133, "ewc_loss": 0.07353755831718445, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003716060018632561, "grad_norm": 8.607071876525879, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8561484813690186, "num_tokens": 786475538.0, "step": 20610 }, { "epoch": 2.621931052029004, "ewc_loss": 0.07334192842245102, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003696497587952763, "grad_norm": 8.590127944946289, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8677706122398376, "num_tokens": 786516802.0, "step": 20611 }, { "epoch": 2.6220582623075943, "ewc_loss": 0.073496013879776, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003711906319949776, "grad_norm": 8.580246925354004, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8662907481193542, "num_tokens": 786558470.0, "step": 20612 }, { "epoch": 2.622185472586185, "ewc_loss": 0.07361641526222229, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003723945701494813, "grad_norm": 8.660506248474121, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8603519201278687, "num_tokens": 786601163.0, "step": 20613 }, { "epoch": 2.6223126828647754, "ewc_loss": 0.07335294783115387, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000369759916793555, "grad_norm": 8.604970932006836, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8726770877838135, "num_tokens": 786636741.0, "step": 20614 }, { "epoch": 2.622439893143366, "ewc_loss": 0.07327141612768173, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003713860351126641, "grad_norm": 8.644452095031738, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8542245626449585, "num_tokens": 786672931.0, "step": 20615 }, { "epoch": 2.6225671034219564, "ewc_loss": 0.07312241196632385, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036989597720094025, "grad_norm": 8.614558219909668, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8782012462615967, "num_tokens": 786709310.0, "step": 20616 }, { "epoch": 2.622694313700547, "ewc_loss": 0.07333835959434509, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003696140192914754, "grad_norm": 8.589449882507324, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8718047142028809, "num_tokens": 786747113.0, "step": 20617 }, { "epoch": 2.6228215239791375, "ewc_loss": 0.07343536615371704, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037058410816825926, "grad_norm": 8.664172172546387, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.865108072757721, "num_tokens": 786782033.0, "step": 20618 }, { "epoch": 2.622948734257728, "ewc_loss": 0.07332240790128708, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036945455940440297, "grad_norm": 8.581117630004883, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8665359616279602, "num_tokens": 786827229.0, "step": 20619 }, { "epoch": 2.6230759445363185, "ewc_loss": 0.07343074679374695, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003705379203893244, "grad_norm": 8.653780937194824, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8748549222946167, "num_tokens": 786861722.0, "step": 20620 }, { "epoch": 2.623203154814909, "ewc_loss": 0.0732000470161438, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003682308888528496, "grad_norm": 8.549030303955078, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8703845143318176, "num_tokens": 786896202.0, "step": 20621 }, { "epoch": 2.6233303650934996, "ewc_loss": 0.07371847331523895, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037341518327593803, "grad_norm": 8.766247749328613, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.868095874786377, "num_tokens": 786932154.0, "step": 20622 }, { "epoch": 2.62345757537209, "ewc_loss": 0.07282742112874985, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003669461002573371, "grad_norm": 8.556641578674316, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8572947978973389, "num_tokens": 786972184.0, "step": 20623 }, { "epoch": 2.6235847856506807, "ewc_loss": 0.07330861687660217, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000371758098481223, "grad_norm": 8.68976879119873, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8604586720466614, "num_tokens": 787015514.0, "step": 20624 }, { "epoch": 2.623711995929271, "ewc_loss": 0.07294458895921707, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003681177622638643, "grad_norm": 8.60078239440918, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8653818964958191, "num_tokens": 787054023.0, "step": 20625 }, { "epoch": 2.6238392062078617, "ewc_loss": 0.07314510643482208, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003701228997670114, "grad_norm": 8.644290924072266, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8607332110404968, "num_tokens": 787088858.0, "step": 20626 }, { "epoch": 2.6239664164864522, "ewc_loss": 0.07298631966114044, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003685350820887834, "grad_norm": 8.626853942871094, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8680708408355713, "num_tokens": 787124805.0, "step": 20627 }, { "epoch": 2.6240936267650428, "ewc_loss": 0.07301615178585052, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003688333963509649, "grad_norm": 8.605581283569336, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8791324496269226, "num_tokens": 787164922.0, "step": 20628 }, { "epoch": 2.6242208370436333, "ewc_loss": 0.07327710092067719, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036900147097185254, "grad_norm": 8.60888671875, "learning_rate": 1e-06, "loss": 0.4224, "mean_token_accuracy": 0.8771457076072693, "num_tokens": 787204936.0, "step": 20629 }, { "epoch": 2.624348047322224, "ewc_loss": 0.07334451377391815, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003696755738928914, "grad_norm": 8.684976577758789, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8787506222724915, "num_tokens": 787238292.0, "step": 20630 }, { "epoch": 2.624475257600814, "ewc_loss": 0.07294808328151703, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003681526577565819, "grad_norm": 8.6262845993042, "learning_rate": 1e-06, "loss": 0.3975, "mean_token_accuracy": 0.8859894871711731, "num_tokens": 787269054.0, "step": 20631 }, { "epoch": 2.624602467879405, "ewc_loss": 0.07316150516271591, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037028692895546556, "grad_norm": 8.63590145111084, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8639128804206848, "num_tokens": 787310022.0, "step": 20632 }, { "epoch": 2.624729678157995, "ewc_loss": 0.07309021055698395, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003695740015245974, "grad_norm": 8.638585090637207, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8573538064956665, "num_tokens": 787350917.0, "step": 20633 }, { "epoch": 2.624856888436586, "ewc_loss": 0.07318976521492004, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037056949804537, "grad_norm": 8.66867733001709, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8743430376052856, "num_tokens": 787388187.0, "step": 20634 }, { "epoch": 2.624984098715176, "ewc_loss": 0.07307054847478867, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003693773760460317, "grad_norm": 8.56414794921875, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8721646666526794, "num_tokens": 787424998.0, "step": 20635 }, { "epoch": 2.6251113089937665, "ewc_loss": 0.07327502965927124, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037142212386243045, "grad_norm": 8.660364151000977, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8779616355895996, "num_tokens": 787465456.0, "step": 20636 }, { "epoch": 2.625238519272357, "ewc_loss": 0.07306009531021118, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003692728641908616, "grad_norm": 8.613407135009766, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.864179253578186, "num_tokens": 787504797.0, "step": 20637 }, { "epoch": 2.6253657295509476, "ewc_loss": 0.07332711666822433, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003719430242199451, "grad_norm": 8.681385040283203, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8772155046463013, "num_tokens": 787542911.0, "step": 20638 }, { "epoch": 2.625492939829538, "ewc_loss": 0.07301802188158035, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003688520810101181, "grad_norm": 8.638025283813477, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8569594025611877, "num_tokens": 787576143.0, "step": 20639 }, { "epoch": 2.6256201501081287, "ewc_loss": 0.07326070219278336, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037127890391275287, "grad_norm": 8.725994110107422, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.868234395980835, "num_tokens": 787619912.0, "step": 20640 }, { "epoch": 2.625747360386719, "ewc_loss": 0.0730966180562973, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003696380590554327, "grad_norm": 8.658576011657715, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.855758547782898, "num_tokens": 787658181.0, "step": 20641 }, { "epoch": 2.6258745706653097, "ewc_loss": 0.07325413823127747, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003712133038789034, "grad_norm": 8.71528434753418, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8572774529457092, "num_tokens": 787694889.0, "step": 20642 }, { "epoch": 2.6260017809439002, "ewc_loss": 0.07297594845294952, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003684313560370356, "grad_norm": 8.62451171875, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.8808596730232239, "num_tokens": 787737633.0, "step": 20643 }, { "epoch": 2.6261289912224908, "ewc_loss": 0.0732126533985138, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037079837056808174, "grad_norm": 8.68404769897461, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8546289205551147, "num_tokens": 787779069.0, "step": 20644 }, { "epoch": 2.6262562015010813, "ewc_loss": 0.072934091091156, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036801278474740684, "grad_norm": 8.647632598876953, "learning_rate": 1e-06, "loss": 0.5151, "mean_token_accuracy": 0.8530905246734619, "num_tokens": 787815544.0, "step": 20645 }, { "epoch": 2.626383411779672, "ewc_loss": 0.07316219061613083, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003702937683556229, "grad_norm": 8.709244728088379, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8607555627822876, "num_tokens": 787852684.0, "step": 20646 }, { "epoch": 2.6265106220582624, "ewc_loss": 0.07292047142982483, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003678766079246998, "grad_norm": 8.6080904006958, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8631744980812073, "num_tokens": 787893452.0, "step": 20647 }, { "epoch": 2.626637832336853, "ewc_loss": 0.07327228039503098, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037139467895030975, "grad_norm": 8.73646068572998, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8665546178817749, "num_tokens": 787929430.0, "step": 20648 }, { "epoch": 2.6267650426154434, "ewc_loss": 0.07281441986560822, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003668161225505173, "grad_norm": 8.627388000488281, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8639910817146301, "num_tokens": 787969768.0, "step": 20649 }, { "epoch": 2.626892252894034, "ewc_loss": 0.07322026789188385, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037087456439621747, "grad_norm": 8.661128997802734, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8666815757751465, "num_tokens": 788009605.0, "step": 20650 }, { "epoch": 2.6270194631726245, "ewc_loss": 0.07309509813785553, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003696228377521038, "grad_norm": 8.643890380859375, "learning_rate": 1e-06, "loss": 0.4209, "mean_token_accuracy": 0.8786715865135193, "num_tokens": 788044652.0, "step": 20651 }, { "epoch": 2.627146673451215, "ewc_loss": 0.07321015000343323, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003707733703777194, "grad_norm": 8.715685844421387, "learning_rate": 1e-06, "loss": 0.5502, "mean_token_accuracy": 0.8387877941131592, "num_tokens": 788085981.0, "step": 20652 }, { "epoch": 2.6272738837298055, "ewc_loss": 0.07297360897064209, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003684080147650093, "grad_norm": 8.627619743347168, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8637785315513611, "num_tokens": 788132196.0, "step": 20653 }, { "epoch": 2.6274010940083956, "ewc_loss": 0.07330544292926788, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037172631709836423, "grad_norm": 8.755415916442871, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8773600459098816, "num_tokens": 788160099.0, "step": 20654 }, { "epoch": 2.6275283042869866, "ewc_loss": 0.07282517850399017, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003669236321002245, "grad_norm": 8.643820762634277, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8633858561515808, "num_tokens": 788197264.0, "step": 20655 }, { "epoch": 2.6276555145655767, "ewc_loss": 0.07322867214679718, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003709585580509156, "grad_norm": 8.654464721679688, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.877780020236969, "num_tokens": 788232900.0, "step": 20656 }, { "epoch": 2.6277827248441676, "ewc_loss": 0.07304114103317261, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003690833109430969, "grad_norm": 8.671761512756348, "learning_rate": 1e-06, "loss": 0.4834, "mean_token_accuracy": 0.8617874979972839, "num_tokens": 788273720.0, "step": 20657 }, { "epoch": 2.6279099351227577, "ewc_loss": 0.07298491895198822, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003685210831463337, "grad_norm": 8.65506362915039, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8677030205726624, "num_tokens": 788308425.0, "step": 20658 }, { "epoch": 2.6280371454013487, "ewc_loss": 0.07308821380138397, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036955400719307363, "grad_norm": 8.670609474182129, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8570820093154907, "num_tokens": 788343701.0, "step": 20659 }, { "epoch": 2.628164355679939, "ewc_loss": 0.07289064675569534, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003675783518701792, "grad_norm": 8.663223266601562, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.850644588470459, "num_tokens": 788372789.0, "step": 20660 }, { "epoch": 2.6282915659585293, "ewc_loss": 0.07324179261922836, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003686484124045819, "grad_norm": 8.63239574432373, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8706420063972473, "num_tokens": 788408422.0, "step": 20661 }, { "epoch": 2.62841877623712, "ewc_loss": 0.07333346456289291, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036956509575247765, "grad_norm": 8.607650756835938, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8695124387741089, "num_tokens": 788446113.0, "step": 20662 }, { "epoch": 2.6285459865157104, "ewc_loss": 0.07345020025968552, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00036829107557423413, "grad_norm": 8.609129905700684, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8582708835601807, "num_tokens": 788485009.0, "step": 20663 }, { "epoch": 2.628673196794301, "ewc_loss": 0.07324321568012238, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036866258597001433, "grad_norm": 8.616281509399414, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8695040941238403, "num_tokens": 788523446.0, "step": 20664 }, { "epoch": 2.6288004070728914, "ewc_loss": 0.07332039624452591, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036943444865755737, "grad_norm": 8.627426147460938, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8751072287559509, "num_tokens": 788557423.0, "step": 20665 }, { "epoch": 2.628927617351482, "ewc_loss": 0.07310250401496887, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036725556128658354, "grad_norm": 8.647802352905273, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8625613451004028, "num_tokens": 788588177.0, "step": 20666 }, { "epoch": 2.6290548276300725, "ewc_loss": 0.07318990677595139, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036812954931519926, "grad_norm": 8.662259101867676, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8672283887863159, "num_tokens": 788624877.0, "step": 20667 }, { "epoch": 2.629182037908663, "ewc_loss": 0.07292389869689941, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036791086313314736, "grad_norm": 8.602400779724121, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8764491677284241, "num_tokens": 788662532.0, "step": 20668 }, { "epoch": 2.6293092481872535, "ewc_loss": 0.07304508984088898, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036912274663336575, "grad_norm": 8.647621154785156, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8663843274116516, "num_tokens": 788702734.0, "step": 20669 }, { "epoch": 2.629436458465844, "ewc_loss": 0.07290155440568924, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000367687433026731, "grad_norm": 8.58410930633545, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8658616542816162, "num_tokens": 788738041.0, "step": 20670 }, { "epoch": 2.6295636687444346, "ewc_loss": 0.07298360764980316, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003685079573187977, "grad_norm": 8.654460906982422, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8657602667808533, "num_tokens": 788772998.0, "step": 20671 }, { "epoch": 2.629690879023025, "ewc_loss": 0.07336117327213287, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003674008185043931, "grad_norm": 8.622251510620117, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8488882780075073, "num_tokens": 788814310.0, "step": 20672 }, { "epoch": 2.6298180893016156, "ewc_loss": 0.0735476016998291, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003692650643642992, "grad_norm": 8.58241081237793, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8704509139060974, "num_tokens": 788852811.0, "step": 20673 }, { "epoch": 2.629945299580206, "ewc_loss": 0.07349895685911179, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00036877862294204533, "grad_norm": 8.666046142578125, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8737852573394775, "num_tokens": 788888756.0, "step": 20674 }, { "epoch": 2.6300725098587967, "ewc_loss": 0.07321901619434357, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036842061672359705, "grad_norm": 8.641660690307617, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8707841634750366, "num_tokens": 788926292.0, "step": 20675 }, { "epoch": 2.6301997201373872, "ewc_loss": 0.07363680750131607, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003701571258716285, "grad_norm": 8.630859375, "learning_rate": 1e-06, "loss": 0.426, "mean_token_accuracy": 0.8762973546981812, "num_tokens": 788965625.0, "step": 20676 }, { "epoch": 2.6303269304159778, "ewc_loss": 0.0735701471567154, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003694905317388475, "grad_norm": 8.603813171386719, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8714760541915894, "num_tokens": 789007390.0, "step": 20677 }, { "epoch": 2.6304541406945683, "ewc_loss": 0.07362419366836548, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037003104807808995, "grad_norm": 8.69735050201416, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8691296577453613, "num_tokens": 789047940.0, "step": 20678 }, { "epoch": 2.6305813509731584, "ewc_loss": 0.07326428592205048, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00036643195198848844, "grad_norm": 8.606850624084473, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8661643862724304, "num_tokens": 789086321.0, "step": 20679 }, { "epoch": 2.6307085612517493, "ewc_loss": 0.07379605621099472, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003717496292665601, "grad_norm": 8.665242195129395, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8789815306663513, "num_tokens": 789121757.0, "step": 20680 }, { "epoch": 2.6308357715303394, "ewc_loss": 0.07323916256427765, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00036618064041249454, "grad_norm": 8.576949119567871, "learning_rate": 1e-06, "loss": 0.4187, "mean_token_accuracy": 0.8804612159729004, "num_tokens": 789160076.0, "step": 20681 }, { "epoch": 2.6309629818089304, "ewc_loss": 0.07367519289255142, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003705409762915224, "grad_norm": 8.73573112487793, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.8575301766395569, "num_tokens": 789197080.0, "step": 20682 }, { "epoch": 2.6310901920875205, "ewc_loss": 0.07316461205482483, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003654352331068367, "grad_norm": 8.534056663513184, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8612474203109741, "num_tokens": 789249876.0, "step": 20683 }, { "epoch": 2.6312174023661115, "ewc_loss": 0.07385905086994171, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037237961078062654, "grad_norm": 8.713007926940918, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8649212121963501, "num_tokens": 789290146.0, "step": 20684 }, { "epoch": 2.6313446126447015, "ewc_loss": 0.07324927300214767, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003662817762233317, "grad_norm": 8.56447982788086, "learning_rate": 1e-06, "loss": 0.4487, "mean_token_accuracy": 0.8707666397094727, "num_tokens": 789335610.0, "step": 20685 }, { "epoch": 2.631471822923292, "ewc_loss": 0.07322748005390167, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003709467127919197, "grad_norm": 8.69191837310791, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.861304521560669, "num_tokens": 789371956.0, "step": 20686 }, { "epoch": 2.6315990332018826, "ewc_loss": 0.07275259494781494, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036619784077629447, "grad_norm": 8.552630424499512, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8626786470413208, "num_tokens": 789415257.0, "step": 20687 }, { "epoch": 2.631726243480473, "ewc_loss": 0.07328881323337555, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037155995960347354, "grad_norm": 8.716184616088867, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.86753249168396, "num_tokens": 789452533.0, "step": 20688 }, { "epoch": 2.6318534537590637, "ewc_loss": 0.07275912165641785, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003662631206680089, "grad_norm": 11.251380920410156, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8627128005027771, "num_tokens": 789488780.0, "step": 20689 }, { "epoch": 2.631980664037654, "ewc_loss": 0.07355644553899765, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003742363187484443, "grad_norm": 8.475955963134766, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8640888929367065, "num_tokens": 789528169.0, "step": 20690 }, { "epoch": 2.6321078743162447, "ewc_loss": 0.07590055465698242, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00039767747512087226, "grad_norm": 9.304509162902832, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8591886162757874, "num_tokens": 789568382.0, "step": 20691 }, { "epoch": 2.6322350845948352, "ewc_loss": 0.07226438075304031, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003613156732171774, "grad_norm": 8.467084884643555, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8631690144538879, "num_tokens": 789606895.0, "step": 20692 }, { "epoch": 2.6323622948734258, "ewc_loss": 0.07619138062000275, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0004005856462754309, "grad_norm": 9.250646591186523, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.8502445220947266, "num_tokens": 789649106.0, "step": 20693 }, { "epoch": 2.6324895051520163, "ewc_loss": 0.07285621017217636, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036723396624438465, "grad_norm": 8.573463439941406, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8702735900878906, "num_tokens": 789689449.0, "step": 20694 }, { "epoch": 2.632616715430607, "ewc_loss": 0.07519262284040451, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00039059811388142407, "grad_norm": 9.119390487670898, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8770673871040344, "num_tokens": 789726179.0, "step": 20695 }, { "epoch": 2.6327439257091974, "ewc_loss": 0.07316587120294571, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037033058470115066, "grad_norm": 8.687492370605469, "learning_rate": 1e-06, "loss": 0.5115, "mean_token_accuracy": 0.8479695320129395, "num_tokens": 789764858.0, "step": 20696 }, { "epoch": 2.632871135987788, "ewc_loss": 0.07443098723888397, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00038298169965855777, "grad_norm": 8.966537475585938, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8534762859344482, "num_tokens": 789799196.0, "step": 20697 }, { "epoch": 2.6329983462663784, "ewc_loss": 0.07314763963222504, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037014822009950876, "grad_norm": 8.698290824890137, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.864842414855957, "num_tokens": 789840498.0, "step": 20698 }, { "epoch": 2.633125556544969, "ewc_loss": 0.07384717464447021, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003771436167880893, "grad_norm": 8.821013450622559, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8702736496925354, "num_tokens": 789874269.0, "step": 20699 }, { "epoch": 2.6332527668235595, "ewc_loss": 0.07327944040298462, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003714662743732333, "grad_norm": 8.705556869506836, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8673543930053711, "num_tokens": 789914938.0, "step": 20700 }, { "epoch": 2.63337997710215, "ewc_loss": 0.073442742228508, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003730993194039911, "grad_norm": 8.808671951293945, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8672357797622681, "num_tokens": 789959219.0, "step": 20701 }, { "epoch": 2.6335071873807405, "ewc_loss": 0.073200523853302, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037067706580273807, "grad_norm": 8.668963432312012, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8592653274536133, "num_tokens": 789993707.0, "step": 20702 }, { "epoch": 2.633634397659331, "ewc_loss": 0.07343664765357971, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037303834687918425, "grad_norm": 11.388690948486328, "learning_rate": 1e-06, "loss": 0.5257, "mean_token_accuracy": 0.8464994430541992, "num_tokens": 790034285.0, "step": 20703 }, { "epoch": 2.633761607937921, "ewc_loss": 0.07465660572052002, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003852379450108856, "grad_norm": 8.745580673217773, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8704184293746948, "num_tokens": 790069714.0, "step": 20704 }, { "epoch": 2.633888818216512, "ewc_loss": 0.07544958591461182, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00039316774928011, "grad_norm": 9.090909004211426, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8646551370620728, "num_tokens": 790110733.0, "step": 20705 }, { "epoch": 2.634016028495102, "ewc_loss": 0.07284367084503174, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003671086160466075, "grad_norm": 8.688972473144531, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8701950311660767, "num_tokens": 790144468.0, "step": 20706 }, { "epoch": 2.634143238773693, "ewc_loss": 0.07497964799404144, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00038846832467243075, "grad_norm": 8.962852478027344, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8642289042472839, "num_tokens": 790186881.0, "step": 20707 }, { "epoch": 2.6342704490522832, "ewc_loss": 0.07333312928676605, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037200318183749914, "grad_norm": 8.734349250793457, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8720607757568359, "num_tokens": 790225083.0, "step": 20708 }, { "epoch": 2.6343976593308738, "ewc_loss": 0.0738389790058136, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037706163129769266, "grad_norm": 8.722747802734375, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8656935691833496, "num_tokens": 790266822.0, "step": 20709 }, { "epoch": 2.6345248696094643, "ewc_loss": 0.07375232130289078, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037619509384967387, "grad_norm": 8.818052291870117, "learning_rate": 1e-06, "loss": 0.4681, "mean_token_accuracy": 0.8623436093330383, "num_tokens": 790305948.0, "step": 20710 }, { "epoch": 2.634652079888055, "ewc_loss": 0.07326476275920868, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037131950375624, "grad_norm": 8.727937698364258, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8673758506774902, "num_tokens": 790341639.0, "step": 20711 }, { "epoch": 2.6347792901666454, "ewc_loss": 0.07354653626680374, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037413722020573914, "grad_norm": 8.71798038482666, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8756163716316223, "num_tokens": 790377442.0, "step": 20712 }, { "epoch": 2.634906500445236, "ewc_loss": 0.07340085506439209, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037268042797222733, "grad_norm": 8.737197875976562, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8639718294143677, "num_tokens": 790414911.0, "step": 20713 }, { "epoch": 2.6350337107238264, "ewc_loss": 0.0732436403632164, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037110826815478504, "grad_norm": 8.631730079650879, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8648886680603027, "num_tokens": 790453968.0, "step": 20714 }, { "epoch": 2.635160921002417, "ewc_loss": 0.0735216736793518, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003738885570783168, "grad_norm": 8.679641723632812, "learning_rate": 1e-06, "loss": 0.4821, "mean_token_accuracy": 0.8619460463523865, "num_tokens": 790493216.0, "step": 20715 }, { "epoch": 2.6352881312810075, "ewc_loss": 0.07337744534015656, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003724463458638638, "grad_norm": 8.737154960632324, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8611531257629395, "num_tokens": 790529380.0, "step": 20716 }, { "epoch": 2.635415341559598, "ewc_loss": 0.07337440550327301, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003724159032572061, "grad_norm": 8.719952583312988, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8611036539077759, "num_tokens": 790569568.0, "step": 20717 }, { "epoch": 2.6355425518381885, "ewc_loss": 0.07331207394599915, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003717925865203142, "grad_norm": 8.704304695129395, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8710179328918457, "num_tokens": 790602269.0, "step": 20718 }, { "epoch": 2.635669762116779, "ewc_loss": 0.07329845428466797, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003716564388014376, "grad_norm": 8.680668830871582, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8544359803199768, "num_tokens": 790639723.0, "step": 20719 }, { "epoch": 2.6357969723953696, "ewc_loss": 0.07343044877052307, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003729763557203114, "grad_norm": 8.707374572753906, "learning_rate": 1e-06, "loss": 0.5138, "mean_token_accuracy": 0.851658046245575, "num_tokens": 790676103.0, "step": 20720 }, { "epoch": 2.63592418267396, "ewc_loss": 0.07331390678882599, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037181098014116287, "grad_norm": 8.68431568145752, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8678106069564819, "num_tokens": 790712118.0, "step": 20721 }, { "epoch": 2.6360513929525506, "ewc_loss": 0.07341420650482178, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037281395634636283, "grad_norm": 8.712861061096191, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.865450382232666, "num_tokens": 790747079.0, "step": 20722 }, { "epoch": 2.636178603231141, "ewc_loss": 0.07323019206523895, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037097377935424447, "grad_norm": 8.69075870513916, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8561367988586426, "num_tokens": 790781741.0, "step": 20723 }, { "epoch": 2.6363058135097317, "ewc_loss": 0.07333767414093018, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000372048671124503, "grad_norm": 8.66130542755127, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8651110529899597, "num_tokens": 790820243.0, "step": 20724 }, { "epoch": 2.6364330237883222, "ewc_loss": 0.0734400823712349, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003730726893991232, "grad_norm": 8.670099258422852, "learning_rate": 1e-06, "loss": 0.4168, "mean_token_accuracy": 0.8808486461639404, "num_tokens": 790851492.0, "step": 20725 }, { "epoch": 2.6365602340669128, "ewc_loss": 0.07335050404071808, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037217693170532584, "grad_norm": 8.703044891357422, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8706976771354675, "num_tokens": 790882293.0, "step": 20726 }, { "epoch": 2.636687444345503, "ewc_loss": 0.07341322302818298, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000372804090147838, "grad_norm": 8.670907974243164, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8608238697052002, "num_tokens": 790922188.0, "step": 20727 }, { "epoch": 2.636814654624094, "ewc_loss": 0.0735662430524826, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037433431134559214, "grad_norm": 8.731253623962402, "learning_rate": 1e-06, "loss": 0.4304, "mean_token_accuracy": 0.8745506405830383, "num_tokens": 790960675.0, "step": 20728 }, { "epoch": 2.636941864902684, "ewc_loss": 0.07325764745473862, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003712483448907733, "grad_norm": 8.66923999786377, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8729878664016724, "num_tokens": 790999352.0, "step": 20729 }, { "epoch": 2.637069075181275, "ewc_loss": 0.07346303761005402, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037330223130993545, "grad_norm": 8.681740760803223, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8656834363937378, "num_tokens": 791035898.0, "step": 20730 }, { "epoch": 2.637196285459865, "ewc_loss": 0.07336345314979553, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037230641464702785, "grad_norm": 8.717910766601562, "learning_rate": 1e-06, "loss": 0.5387, "mean_token_accuracy": 0.841464102268219, "num_tokens": 791079731.0, "step": 20731 }, { "epoch": 2.637323495738456, "ewc_loss": 0.07340297847986221, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003702602698467672, "grad_norm": 8.717004776000977, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8771372437477112, "num_tokens": 791118565.0, "step": 20732 }, { "epoch": 2.637450706017046, "ewc_loss": 0.07363419234752655, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037257239455357194, "grad_norm": 8.742953300476074, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8662906885147095, "num_tokens": 791154876.0, "step": 20733 }, { "epoch": 2.6375779162956365, "ewc_loss": 0.0731259137392044, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036993096000514925, "grad_norm": 8.638601303100586, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.865142822265625, "num_tokens": 791196003.0, "step": 20734 }, { "epoch": 2.637705126574227, "ewc_loss": 0.07347050309181213, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037337688263505697, "grad_norm": 8.744208335876465, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8641711473464966, "num_tokens": 791236462.0, "step": 20735 }, { "epoch": 2.6378323368528176, "ewc_loss": 0.07302282750606537, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036890010233037174, "grad_norm": 8.64552116394043, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8520347476005554, "num_tokens": 791271918.0, "step": 20736 }, { "epoch": 2.637959547131408, "ewc_loss": 0.07353703677654266, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003740422544069588, "grad_norm": 8.795284271240234, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8779596090316772, "num_tokens": 791309813.0, "step": 20737 }, { "epoch": 2.6380867574099987, "ewc_loss": 0.07354006171226501, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003691896272357553, "grad_norm": 8.685455322265625, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8727452754974365, "num_tokens": 791338854.0, "step": 20738 }, { "epoch": 2.638213967688589, "ewc_loss": 0.07390404492616653, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003728295268956572, "grad_norm": 8.690743446350098, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8714126348495483, "num_tokens": 791376580.0, "step": 20739 }, { "epoch": 2.6383411779671797, "ewc_loss": 0.07375561445951462, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003713452024385333, "grad_norm": 8.68163776397705, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8635585904121399, "num_tokens": 791415931.0, "step": 20740 }, { "epoch": 2.6384683882457702, "ewc_loss": 0.07383688539266586, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003721579269040376, "grad_norm": 8.689520835876465, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8619664311408997, "num_tokens": 791448201.0, "step": 20741 }, { "epoch": 2.6385955985243608, "ewc_loss": 0.07386419177055359, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037243097904138267, "grad_norm": 8.611699104309082, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8582006692886353, "num_tokens": 791489183.0, "step": 20742 }, { "epoch": 2.6387228088029513, "ewc_loss": 0.07404649257659912, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003742539556697011, "grad_norm": 8.735517501831055, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8668625354766846, "num_tokens": 791528414.0, "step": 20743 }, { "epoch": 2.638850019081542, "ewc_loss": 0.07375294715166092, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037131854332983494, "grad_norm": 8.633079528808594, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8849244713783264, "num_tokens": 791566379.0, "step": 20744 }, { "epoch": 2.6389772293601323, "ewc_loss": 0.0740978792309761, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003747678711079061, "grad_norm": 8.774359703063965, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.868178129196167, "num_tokens": 791599789.0, "step": 20745 }, { "epoch": 2.639104439638723, "ewc_loss": 0.07365670800209045, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037035613786429167, "grad_norm": 8.687182426452637, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.851205587387085, "num_tokens": 791642333.0, "step": 20746 }, { "epoch": 2.6392316499173134, "ewc_loss": 0.07398223876953125, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037361140130087733, "grad_norm": 8.650771141052246, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8680344820022583, "num_tokens": 791687190.0, "step": 20747 }, { "epoch": 2.639358860195904, "ewc_loss": 0.07385851442813873, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003723741974681616, "grad_norm": 8.764538764953613, "learning_rate": 1e-06, "loss": 0.5006, "mean_token_accuracy": 0.8513966798782349, "num_tokens": 791722806.0, "step": 20748 }, { "epoch": 2.6394860704744945, "ewc_loss": 0.07365250587463379, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037031411193311214, "grad_norm": 8.689963340759277, "learning_rate": 1e-06, "loss": 0.5109, "mean_token_accuracy": 0.8504968881607056, "num_tokens": 791757726.0, "step": 20749 }, { "epoch": 2.639613280753085, "ewc_loss": 0.07389900833368301, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003727791481651366, "grad_norm": 8.689048767089844, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8729194402694702, "num_tokens": 791795160.0, "step": 20750 }, { "epoch": 2.6397404910316755, "ewc_loss": 0.0736706554889679, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003704955743160099, "grad_norm": 8.74283504486084, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8789814710617065, "num_tokens": 791831682.0, "step": 20751 }, { "epoch": 2.6398677013102656, "ewc_loss": 0.07371576130390167, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037094668368808925, "grad_norm": 8.724239349365234, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8773825168609619, "num_tokens": 791863230.0, "step": 20752 }, { "epoch": 2.6399949115888566, "ewc_loss": 0.07320374250411987, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037070931284688413, "grad_norm": 8.666828155517578, "learning_rate": 1e-06, "loss": 0.4138, "mean_token_accuracy": 0.8804069757461548, "num_tokens": 791904323.0, "step": 20753 }, { "epoch": 2.6401221218674467, "ewc_loss": 0.07317420095205307, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037041387986391783, "grad_norm": 8.639799118041992, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8595104217529297, "num_tokens": 791951836.0, "step": 20754 }, { "epoch": 2.6402493321460376, "ewc_loss": 0.0737970694899559, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000371759757399559, "grad_norm": 8.738316535949707, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8691811561584473, "num_tokens": 791989782.0, "step": 20755 }, { "epoch": 2.6403765424246277, "ewc_loss": 0.07362042367458344, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000369993329513818, "grad_norm": 8.660489082336426, "learning_rate": 1e-06, "loss": 0.4525, "mean_token_accuracy": 0.8666865825653076, "num_tokens": 792024038.0, "step": 20756 }, { "epoch": 2.6405037527032187, "ewc_loss": 0.07389390468597412, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003727281291503459, "grad_norm": 8.715571403503418, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8674591779708862, "num_tokens": 792067893.0, "step": 20757 }, { "epoch": 2.6406309629818088, "ewc_loss": 0.07314912974834442, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037016315036453307, "grad_norm": 8.722582817077637, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8748575448989868, "num_tokens": 792108593.0, "step": 20758 }, { "epoch": 2.6407581732603993, "ewc_loss": 0.07311925292015076, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036986437044106424, "grad_norm": 8.650805473327637, "learning_rate": 1e-06, "loss": 0.4192, "mean_token_accuracy": 0.8790924549102783, "num_tokens": 792141201.0, "step": 20759 }, { "epoch": 2.64088538353899, "ewc_loss": 0.07342249155044556, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003728967858478427, "grad_norm": 8.708773612976074, "learning_rate": 1e-06, "loss": 0.4034, "mean_token_accuracy": 0.8859821557998657, "num_tokens": 792182908.0, "step": 20760 }, { "epoch": 2.6410125938175804, "ewc_loss": 0.0730353519320488, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000369025394320488, "grad_norm": 8.710410118103027, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8674090504646301, "num_tokens": 792225223.0, "step": 20761 }, { "epoch": 2.641139804096171, "ewc_loss": 0.07324165850877762, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037108847755007446, "grad_norm": 8.80010986328125, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8490956425666809, "num_tokens": 792258131.0, "step": 20762 }, { "epoch": 2.6412670143747614, "ewc_loss": 0.07298795878887177, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036855143844150007, "grad_norm": 8.675008773803711, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.872074544429779, "num_tokens": 792291429.0, "step": 20763 }, { "epoch": 2.641394224653352, "ewc_loss": 0.07311542332172394, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003698260698001832, "grad_norm": 8.720246315002441, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8610164523124695, "num_tokens": 792327777.0, "step": 20764 }, { "epoch": 2.6415214349319425, "ewc_loss": 0.0730549693107605, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003692216123454273, "grad_norm": 8.618952751159668, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8690668344497681, "num_tokens": 792367918.0, "step": 20765 }, { "epoch": 2.641648645210533, "ewc_loss": 0.07323363423347473, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037100823828950524, "grad_norm": 8.790781021118164, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8703300356864929, "num_tokens": 792405550.0, "step": 20766 }, { "epoch": 2.6417758554891235, "ewc_loss": 0.07285690307617188, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003672409220598638, "grad_norm": 8.645683288574219, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8699585199356079, "num_tokens": 792440948.0, "step": 20767 }, { "epoch": 2.641903065767714, "ewc_loss": 0.07335792481899261, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003722511464729905, "grad_norm": 8.75153923034668, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8729982376098633, "num_tokens": 792488065.0, "step": 20768 }, { "epoch": 2.6420302760463046, "ewc_loss": 0.072944775223732, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003681196249090135, "grad_norm": 8.628988265991211, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8728803992271423, "num_tokens": 792522845.0, "step": 20769 }, { "epoch": 2.642157486324895, "ewc_loss": 0.07333356887102127, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003720075765158981, "grad_norm": 8.678054809570312, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8763227462768555, "num_tokens": 792561118.0, "step": 20770 }, { "epoch": 2.6422846966034856, "ewc_loss": 0.07302648574113846, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036893674405291677, "grad_norm": 8.607281684875488, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8663109540939331, "num_tokens": 792600125.0, "step": 20771 }, { "epoch": 2.642411906882076, "ewc_loss": 0.07338985800743103, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037257044459693134, "grad_norm": 8.78390121459961, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8704724907875061, "num_tokens": 792633856.0, "step": 20772 }, { "epoch": 2.6425391171606667, "ewc_loss": 0.07337937504053116, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00036758280475623906, "grad_norm": 8.577489852905273, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8615522384643555, "num_tokens": 792672427.0, "step": 20773 }, { "epoch": 2.6426663274392572, "ewc_loss": 0.0741308256983757, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037509732646867633, "grad_norm": 8.739218711853027, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8443242311477661, "num_tokens": 792713856.0, "step": 20774 }, { "epoch": 2.6427935377178478, "ewc_loss": 0.07298159599304199, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036848781746812165, "grad_norm": 8.595256805419922, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8585509061813354, "num_tokens": 792757602.0, "step": 20775 }, { "epoch": 2.6429207479964383, "ewc_loss": 0.0738951712846756, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037518219323828816, "grad_norm": 8.705427169799805, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8557947874069214, "num_tokens": 792797067.0, "step": 20776 }, { "epoch": 2.6430479582750284, "ewc_loss": 0.07354174554347992, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037164788227528334, "grad_norm": 8.68648910522461, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8693467974662781, "num_tokens": 792835748.0, "step": 20777 }, { "epoch": 2.6431751685536193, "ewc_loss": 0.07390990853309631, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003728881129063666, "grad_norm": 8.676236152648926, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.868678867816925, "num_tokens": 792875039.0, "step": 20778 }, { "epoch": 2.6433023788322094, "ewc_loss": 0.07391948997974396, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003729839518200606, "grad_norm": 8.705418586730957, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.87166428565979, "num_tokens": 792916580.0, "step": 20779 }, { "epoch": 2.6434295891108004, "ewc_loss": 0.0738041028380394, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037183010135777295, "grad_norm": 8.688652992248535, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.866698682308197, "num_tokens": 792952507.0, "step": 20780 }, { "epoch": 2.6435567993893905, "ewc_loss": 0.07387804239988327, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037256948417052627, "grad_norm": 8.644380569458008, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.868096113204956, "num_tokens": 792994046.0, "step": 20781 }, { "epoch": 2.6436840096679814, "ewc_loss": 0.07398921996355057, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003736812504939735, "grad_norm": 8.698845863342285, "learning_rate": 1e-06, "loss": 0.4077, "mean_token_accuracy": 0.886054277420044, "num_tokens": 793033855.0, "step": 20782 }, { "epoch": 2.6438112199465715, "ewc_loss": 0.07382860034704208, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003720750682987273, "grad_norm": 8.704009056091309, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8691847324371338, "num_tokens": 793072226.0, "step": 20783 }, { "epoch": 2.643938430225162, "ewc_loss": 0.07349801063537598, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037365194293670356, "grad_norm": 8.739410400390625, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.869928240776062, "num_tokens": 793110884.0, "step": 20784 }, { "epoch": 2.6440656405037526, "ewc_loss": 0.07338044792413712, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003724763519130647, "grad_norm": 8.714866638183594, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8491345643997192, "num_tokens": 793151885.0, "step": 20785 }, { "epoch": 2.644192850782343, "ewc_loss": 0.07337181270122528, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003723900008480996, "grad_norm": 8.722339630126953, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8436974883079529, "num_tokens": 793187253.0, "step": 20786 }, { "epoch": 2.6443200610609336, "ewc_loss": 0.07327914237976074, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003714632766786963, "grad_norm": 8.66176700592041, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8707771301269531, "num_tokens": 793231249.0, "step": 20787 }, { "epoch": 2.644447271339524, "ewc_loss": 0.07344116270542145, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003730834578163922, "grad_norm": 8.697928428649902, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8777997493743896, "num_tokens": 793269918.0, "step": 20788 }, { "epoch": 2.6445744816181147, "ewc_loss": 0.07321935892105103, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003708654548972845, "grad_norm": 8.670117378234863, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8762991428375244, "num_tokens": 793306856.0, "step": 20789 }, { "epoch": 2.6447016918967052, "ewc_loss": 0.0734797939658165, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003734698111657053, "grad_norm": 8.708517074584961, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.8560791015625, "num_tokens": 793347866.0, "step": 20790 }, { "epoch": 2.6448289021752958, "ewc_loss": 0.07331713289022446, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003718431980814785, "grad_norm": 8.733269691467285, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8647055625915527, "num_tokens": 793391037.0, "step": 20791 }, { "epoch": 2.6449561124538863, "ewc_loss": 0.07332980632781982, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037196994526311755, "grad_norm": 8.778726577758789, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8521973490715027, "num_tokens": 793423560.0, "step": 20792 }, { "epoch": 2.645083322732477, "ewc_loss": 0.07336103916168213, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037228225846774876, "grad_norm": 8.794137954711914, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.872779130935669, "num_tokens": 793460215.0, "step": 20793 }, { "epoch": 2.6452105330110673, "ewc_loss": 0.07328246533870697, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003714965423569083, "grad_norm": 8.757136344909668, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8534462451934814, "num_tokens": 793492671.0, "step": 20794 }, { "epoch": 2.645337743289658, "ewc_loss": 0.07327988743782043, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037147069815546274, "grad_norm": 8.672941207885742, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8655276298522949, "num_tokens": 793529526.0, "step": 20795 }, { "epoch": 2.6454649535682484, "ewc_loss": 0.0733051598072052, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003717234649229795, "grad_norm": 8.866724014282227, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8754541873931885, "num_tokens": 793566748.0, "step": 20796 }, { "epoch": 2.645592163846839, "ewc_loss": 0.07304395735263824, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003691114252433181, "grad_norm": 8.64083480834961, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8801007270812988, "num_tokens": 793600703.0, "step": 20797 }, { "epoch": 2.6457193741254295, "ewc_loss": 0.0736064463853836, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003747363807633519, "grad_norm": 8.839239120483398, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8652085661888123, "num_tokens": 793639226.0, "step": 20798 }, { "epoch": 2.64584658440402, "ewc_loss": 0.07339993119239807, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003677884233184159, "grad_norm": 8.640304565429688, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8801793456077576, "num_tokens": 793676147.0, "step": 20799 }, { "epoch": 2.6459737946826105, "ewc_loss": 0.07416892051696777, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037547829560935497, "grad_norm": 8.833250999450684, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8773208856582642, "num_tokens": 793715778.0, "step": 20800 }, { "epoch": 2.646101004961201, "ewc_loss": 0.07301218062639236, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003687936987262219, "grad_norm": 8.739977836608887, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8822388648986816, "num_tokens": 793752096.0, "step": 20801 }, { "epoch": 2.646228215239791, "ewc_loss": 0.0733962133526802, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037263400736264884, "grad_norm": 8.801093101501465, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8609044551849365, "num_tokens": 793789809.0, "step": 20802 }, { "epoch": 2.646355425518382, "ewc_loss": 0.07302172482013702, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036888915929012, "grad_norm": 8.681757926940918, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8633167147636414, "num_tokens": 793826505.0, "step": 20803 }, { "epoch": 2.646482635796972, "ewc_loss": 0.07344011962413788, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037307306774891913, "grad_norm": 8.86634635925293, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8658468723297119, "num_tokens": 793868360.0, "step": 20804 }, { "epoch": 2.646609846075563, "ewc_loss": 0.07289012521505356, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003675731422845274, "grad_norm": 8.61135482788086, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8669639825820923, "num_tokens": 793906566.0, "step": 20805 }, { "epoch": 2.6467370563541532, "ewc_loss": 0.07356694340705872, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037434129626490176, "grad_norm": 8.9140625, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8633158206939697, "num_tokens": 793947814.0, "step": 20806 }, { "epoch": 2.6468642666327438, "ewc_loss": 0.07266457378864288, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036531759542413056, "grad_norm": 8.587069511413574, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8712587356567383, "num_tokens": 793987391.0, "step": 20807 }, { "epoch": 2.6469914769113343, "ewc_loss": 0.0737355649471283, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003760275722015649, "grad_norm": 8.90599250793457, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8778639435768127, "num_tokens": 794024045.0, "step": 20808 }, { "epoch": 2.647118687189925, "ewc_loss": 0.07286323606967926, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003648628480732441, "grad_norm": 8.633923530578613, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.870625376701355, "num_tokens": 794063405.0, "step": 20809 }, { "epoch": 2.6472458974685154, "ewc_loss": 0.0736851617693901, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003755234938580543, "grad_norm": 8.976826667785645, "learning_rate": 1e-06, "loss": 0.5317, "mean_token_accuracy": 0.8422955274581909, "num_tokens": 794099729.0, "step": 20810 }, { "epoch": 2.647373107747106, "ewc_loss": 0.07256469130516052, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036431883927434683, "grad_norm": 8.535106658935547, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8585708737373352, "num_tokens": 794139669.0, "step": 20811 }, { "epoch": 2.6475003180256964, "ewc_loss": 0.07407251745462418, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037939706817269325, "grad_norm": 9.02820873260498, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8691684007644653, "num_tokens": 794176906.0, "step": 20812 }, { "epoch": 2.647627528304287, "ewc_loss": 0.07243131101131439, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003629850107245147, "grad_norm": 8.467094421386719, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8698025941848755, "num_tokens": 794216594.0, "step": 20813 }, { "epoch": 2.6477547385828775, "ewc_loss": 0.07424178719520569, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00038108977605588734, "grad_norm": 8.974287033081055, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8704878687858582, "num_tokens": 794248379.0, "step": 20814 }, { "epoch": 2.647881948861468, "ewc_loss": 0.07260974496603012, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036476930836215615, "grad_norm": 8.486572265625, "learning_rate": 1e-06, "loss": 0.5267, "mean_token_accuracy": 0.8425081968307495, "num_tokens": 794293604.0, "step": 20815 }, { "epoch": 2.6480091591400585, "ewc_loss": 0.0742776170372963, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00038144804420880973, "grad_norm": 9.03060531616211, "learning_rate": 1e-06, "loss": 0.4912, "mean_token_accuracy": 0.8577945232391357, "num_tokens": 794332084.0, "step": 20816 }, { "epoch": 2.648136369418649, "ewc_loss": 0.07321514189243317, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003659404464997351, "grad_norm": 8.520792961120605, "learning_rate": 1e-06, "loss": 0.4108, "mean_token_accuracy": 0.8826225399971008, "num_tokens": 794369158.0, "step": 20817 }, { "epoch": 2.6482635796972396, "ewc_loss": 0.07445567846298218, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003832286165561527, "grad_norm": 8.9301118850708, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8699594140052795, "num_tokens": 794410531.0, "step": 20818 }, { "epoch": 2.64839078997583, "ewc_loss": 0.07294590026140213, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003681308589875698, "grad_norm": 8.567450523376465, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8671962022781372, "num_tokens": 794457556.0, "step": 20819 }, { "epoch": 2.6485180002544206, "ewc_loss": 0.07431532442569733, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003818251134362072, "grad_norm": 8.963980674743652, "learning_rate": 1e-06, "loss": 0.5, "mean_token_accuracy": 0.8553165197372437, "num_tokens": 794492737.0, "step": 20820 }, { "epoch": 2.648645210533011, "ewc_loss": 0.07301905751228333, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036886250018142164, "grad_norm": 8.613652229309082, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8769901990890503, "num_tokens": 794532589.0, "step": 20821 }, { "epoch": 2.6487724208116017, "ewc_loss": 0.07394258677959442, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003780977276619524, "grad_norm": 8.867561340332031, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8563426733016968, "num_tokens": 794566982.0, "step": 20822 }, { "epoch": 2.648899631090192, "ewc_loss": 0.07306719571352005, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003693438193295151, "grad_norm": 8.681416511535645, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8504747152328491, "num_tokens": 794604278.0, "step": 20823 }, { "epoch": 2.6490268413687827, "ewc_loss": 0.07376794517040253, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003763513232115656, "grad_norm": 8.815000534057617, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8758635520935059, "num_tokens": 794643785.0, "step": 20824 }, { "epoch": 2.649154051647373, "ewc_loss": 0.07364625483751297, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003702515969052911, "grad_norm": 8.66187572479248, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8753032088279724, "num_tokens": 794678302.0, "step": 20825 }, { "epoch": 2.649281261925964, "ewc_loss": 0.07359975576400757, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003746694710571319, "grad_norm": 8.811537742614746, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8579161167144775, "num_tokens": 794715732.0, "step": 20826 }, { "epoch": 2.649408472204554, "ewc_loss": 0.07367409765720367, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003705300041474402, "grad_norm": 8.6412992477417, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8704239726066589, "num_tokens": 794754359.0, "step": 20827 }, { "epoch": 2.649535682483145, "ewc_loss": 0.07359971106052399, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037466894718818367, "grad_norm": 8.767563819885254, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8642368912696838, "num_tokens": 794795770.0, "step": 20828 }, { "epoch": 2.649662892761735, "ewc_loss": 0.07315799593925476, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003702518588397652, "grad_norm": 8.741543769836426, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8584432005882263, "num_tokens": 794835654.0, "step": 20829 }, { "epoch": 2.649790103040326, "ewc_loss": 0.07392038404941559, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037299285759218037, "grad_norm": 8.765203475952148, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.86972975730896, "num_tokens": 794872527.0, "step": 20830 }, { "epoch": 2.649917313318916, "ewc_loss": 0.0732181966304779, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037085387157276273, "grad_norm": 8.653009414672852, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8707271814346313, "num_tokens": 794918577.0, "step": 20831 }, { "epoch": 2.6500445235975065, "ewc_loss": 0.07340645045042038, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003727363655343652, "grad_norm": 8.7963228225708, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8781089782714844, "num_tokens": 794953566.0, "step": 20832 }, { "epoch": 2.650171733876097, "ewc_loss": 0.07306402921676636, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036931218346580863, "grad_norm": 8.697555541992188, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8742612600326538, "num_tokens": 794990547.0, "step": 20833 }, { "epoch": 2.6502989441546876, "ewc_loss": 0.07382124662399292, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037200149381533265, "grad_norm": 8.71076488494873, "learning_rate": 1e-06, "loss": 0.4084, "mean_token_accuracy": 0.8840079307556152, "num_tokens": 795028419.0, "step": 20834 }, { "epoch": 2.650426154433278, "ewc_loss": 0.07369885593652725, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003707776195369661, "grad_norm": 8.674935340881348, "learning_rate": 1e-06, "loss": 0.4101, "mean_token_accuracy": 0.8830280900001526, "num_tokens": 795064303.0, "step": 20835 }, { "epoch": 2.6505533647118686, "ewc_loss": 0.07396751642227173, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037346422323025763, "grad_norm": 8.842138290405273, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8438292145729065, "num_tokens": 795104280.0, "step": 20836 }, { "epoch": 2.650680574990459, "ewc_loss": 0.0735182836651802, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003689719014801085, "grad_norm": 8.699090957641602, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8585349321365356, "num_tokens": 795142547.0, "step": 20837 }, { "epoch": 2.6508077852690497, "ewc_loss": 0.07385312020778656, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037232026807032526, "grad_norm": 8.822502136230469, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8641155958175659, "num_tokens": 795186193.0, "step": 20838 }, { "epoch": 2.6509349955476402, "ewc_loss": 0.07353143393993378, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000369103392586112, "grad_norm": 8.636691093444824, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8852171897888184, "num_tokens": 795215453.0, "step": 20839 }, { "epoch": 2.6510622058262308, "ewc_loss": 0.07396688312292099, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037345787859521806, "grad_norm": 8.917768478393555, "learning_rate": 1e-06, "loss": 0.4778, "mean_token_accuracy": 0.8629426956176758, "num_tokens": 795252820.0, "step": 20840 }, { "epoch": 2.6511894161048213, "ewc_loss": 0.07321701943874359, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00036595924757421017, "grad_norm": 8.60603141784668, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8644946217536926, "num_tokens": 795290000.0, "step": 20841 }, { "epoch": 2.651316626383412, "ewc_loss": 0.07424154877662659, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037620452349074185, "grad_norm": 8.922837257385254, "learning_rate": 1e-06, "loss": 0.4038, "mean_token_accuracy": 0.8835355639457703, "num_tokens": 795330389.0, "step": 20842 }, { "epoch": 2.6514438366620023, "ewc_loss": 0.07303249835968018, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00036411406472325325, "grad_norm": 8.545477867126465, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8628087043762207, "num_tokens": 795366693.0, "step": 20843 }, { "epoch": 2.651571046940593, "ewc_loss": 0.07417193055152893, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037794976378791034, "grad_norm": 9.034452438354492, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8688646554946899, "num_tokens": 795405748.0, "step": 20844 }, { "epoch": 2.6516982572191834, "ewc_loss": 0.07296955585479736, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000363484607078135, "grad_norm": 8.437122344970703, "learning_rate": 1e-06, "loss": 0.414, "mean_token_accuracy": 0.8774409294128418, "num_tokens": 795446179.0, "step": 20845 }, { "epoch": 2.651825467497774, "ewc_loss": 0.07488793134689331, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003826683678198606, "grad_norm": 8.969152450561523, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8620936870574951, "num_tokens": 795488234.0, "step": 20846 }, { "epoch": 2.6519526777763645, "ewc_loss": 0.0728694349527359, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003649248101282865, "grad_norm": 8.623464584350586, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8642527461051941, "num_tokens": 795526508.0, "step": 20847 }, { "epoch": 2.652079888054955, "ewc_loss": 0.07452216744422913, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000379010773031041, "grad_norm": 8.805482864379883, "learning_rate": 1e-06, "loss": 0.4443, "mean_token_accuracy": 0.8738042116165161, "num_tokens": 795566299.0, "step": 20848 }, { "epoch": 2.6522070983335455, "ewc_loss": 0.07333050668239594, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003670940932352096, "grad_norm": 8.677740097045898, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8622724413871765, "num_tokens": 795603892.0, "step": 20849 }, { "epoch": 2.6523343086121356, "ewc_loss": 0.07413142919540405, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037510335096158087, "grad_norm": 8.818548202514648, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8759210109710693, "num_tokens": 795639622.0, "step": 20850 }, { "epoch": 2.6524615188907266, "ewc_loss": 0.07308299839496613, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003695018822327256, "grad_norm": 8.610350608825684, "learning_rate": 1e-06, "loss": 0.3889, "mean_token_accuracy": 0.8887633085250854, "num_tokens": 795677299.0, "step": 20851 }, { "epoch": 2.6525887291693167, "ewc_loss": 0.07429791986942291, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003767682646866888, "grad_norm": 8.908288955688477, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.866280198097229, "num_tokens": 795714304.0, "step": 20852 }, { "epoch": 2.6527159394479076, "ewc_loss": 0.07291637361049652, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036783565883524716, "grad_norm": 8.633041381835938, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8629243969917297, "num_tokens": 795752330.0, "step": 20853 }, { "epoch": 2.6528431497264977, "ewc_loss": 0.07367351651191711, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000375406991224736, "grad_norm": 8.779367446899414, "learning_rate": 1e-06, "loss": 0.5233, "mean_token_accuracy": 0.8457881212234497, "num_tokens": 795789503.0, "step": 20854 }, { "epoch": 2.6529703600050887, "ewc_loss": 0.07367579638957977, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037054705899208784, "grad_norm": 8.779455184936523, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8656955361366272, "num_tokens": 795828449.0, "step": 20855 }, { "epoch": 2.6530975702836788, "ewc_loss": 0.07386117428541183, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037240079836919904, "grad_norm": 8.632440567016602, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8679690361022949, "num_tokens": 795868454.0, "step": 20856 }, { "epoch": 2.6532247805622693, "ewc_loss": 0.07394950836896896, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037328412872739136, "grad_norm": 8.796966552734375, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8535779714584351, "num_tokens": 795904629.0, "step": 20857 }, { "epoch": 2.65335199084086, "ewc_loss": 0.07353898137807846, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00036917885881848633, "grad_norm": 8.690644264221191, "learning_rate": 1e-06, "loss": 0.5001, "mean_token_accuracy": 0.8533971905708313, "num_tokens": 795943197.0, "step": 20858 }, { "epoch": 2.6534792011194503, "ewc_loss": 0.07394179701805115, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037320706178434193, "grad_norm": 8.624883651733398, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8628961443901062, "num_tokens": 795984672.0, "step": 20859 }, { "epoch": 2.653606411398041, "ewc_loss": 0.07394851744174957, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037327426252886653, "grad_norm": 8.722967147827148, "learning_rate": 1e-06, "loss": 0.5214, "mean_token_accuracy": 0.84747314453125, "num_tokens": 796022599.0, "step": 20860 }, { "epoch": 2.6537336216766314, "ewc_loss": 0.07385869324207306, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037237603100948036, "grad_norm": 8.688197135925293, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.8653223514556885, "num_tokens": 796057798.0, "step": 20861 }, { "epoch": 2.653860831955222, "ewc_loss": 0.07396520674228668, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037344114389270544, "grad_norm": 8.655065536499023, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8647574186325073, "num_tokens": 796094928.0, "step": 20862 }, { "epoch": 2.6539880422338125, "ewc_loss": 0.07393315434455872, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003731205652002245, "grad_norm": 8.698769569396973, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8680115342140198, "num_tokens": 796132614.0, "step": 20863 }, { "epoch": 2.654115252512403, "ewc_loss": 0.07383539527654648, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037214302574284375, "grad_norm": 8.662810325622559, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8653119802474976, "num_tokens": 796171653.0, "step": 20864 }, { "epoch": 2.6542424627909935, "ewc_loss": 0.07405444979667664, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003743335837498307, "grad_norm": 8.688620567321777, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8604671955108643, "num_tokens": 796215218.0, "step": 20865 }, { "epoch": 2.654369673069584, "ewc_loss": 0.07396291196346283, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003734181809704751, "grad_norm": 8.69262409210205, "learning_rate": 1e-06, "loss": 0.422, "mean_token_accuracy": 0.879339337348938, "num_tokens": 796249165.0, "step": 20866 }, { "epoch": 2.6544968833481746, "ewc_loss": 0.07345448434352875, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000373216753359884, "grad_norm": 8.670222282409668, "learning_rate": 1e-06, "loss": 0.4787, "mean_token_accuracy": 0.8628256320953369, "num_tokens": 796283878.0, "step": 20867 }, { "epoch": 2.654624093626765, "ewc_loss": 0.07401169836521149, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037390601937659085, "grad_norm": 8.661600112915039, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8535768389701843, "num_tokens": 796330760.0, "step": 20868 }, { "epoch": 2.6547513039053556, "ewc_loss": 0.07400521636009216, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003738412051461637, "grad_norm": 8.662721633911133, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8540564179420471, "num_tokens": 796371547.0, "step": 20869 }, { "epoch": 2.654878514183946, "ewc_loss": 0.07352260500192642, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003738979285117239, "grad_norm": 8.724093437194824, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8517075181007385, "num_tokens": 796409893.0, "step": 20870 }, { "epoch": 2.6550057244625367, "ewc_loss": 0.07336622476577759, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037233412149362266, "grad_norm": 8.738584518432617, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8509306311607361, "num_tokens": 796442493.0, "step": 20871 }, { "epoch": 2.655132934741127, "ewc_loss": 0.07346486300230026, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003733205085154623, "grad_norm": 8.681386947631836, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8685315847396851, "num_tokens": 796480853.0, "step": 20872 }, { "epoch": 2.6552601450197177, "ewc_loss": 0.07389704138040543, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003727594739757478, "grad_norm": 8.69813060760498, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8710287809371948, "num_tokens": 796520057.0, "step": 20873 }, { "epoch": 2.6553873552983083, "ewc_loss": 0.07388218492269516, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003726108989212662, "grad_norm": 8.756933212280273, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8691285848617554, "num_tokens": 796549779.0, "step": 20874 }, { "epoch": 2.6555145655768984, "ewc_loss": 0.07376138120889664, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003714028571266681, "grad_norm": 8.597091674804688, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8701696991920471, "num_tokens": 796590923.0, "step": 20875 }, { "epoch": 2.6556417758554893, "ewc_loss": 0.07413709908723831, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003751600452233106, "grad_norm": 8.736132621765137, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8620543479919434, "num_tokens": 796629455.0, "step": 20876 }, { "epoch": 2.6557689861340794, "ewc_loss": 0.07368014752864838, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037059051101095974, "grad_norm": 8.608797073364258, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8618800640106201, "num_tokens": 796669950.0, "step": 20877 }, { "epoch": 2.6558961964126704, "ewc_loss": 0.0741659551858902, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037544863880611956, "grad_norm": 8.724566459655762, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8549004793167114, "num_tokens": 796712263.0, "step": 20878 }, { "epoch": 2.6560234066912605, "ewc_loss": 0.07387122511863708, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037250129389576614, "grad_norm": 8.65688419342041, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8660316467285156, "num_tokens": 796753025.0, "step": 20879 }, { "epoch": 2.6561506169698514, "ewc_loss": 0.07416726648807526, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003754616773221642, "grad_norm": 8.716056823730469, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.8786839842796326, "num_tokens": 796788810.0, "step": 20880 }, { "epoch": 2.6562778272484415, "ewc_loss": 0.07397644966840744, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037355354288592935, "grad_norm": 8.658893585205078, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8674639463424683, "num_tokens": 796826275.0, "step": 20881 }, { "epoch": 2.656405037527032, "ewc_loss": 0.07416088879108429, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037539793993346393, "grad_norm": 8.804317474365234, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.860512375831604, "num_tokens": 796859587.0, "step": 20882 }, { "epoch": 2.6565322478056226, "ewc_loss": 0.07366428524255753, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000370431924238801, "grad_norm": 8.56630802154541, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8719019889831543, "num_tokens": 796899440.0, "step": 20883 }, { "epoch": 2.656659458084213, "ewc_loss": 0.0739312469959259, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037798433913849294, "grad_norm": 8.810808181762695, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.855838418006897, "num_tokens": 796938703.0, "step": 20884 }, { "epoch": 2.6567866683628036, "ewc_loss": 0.07364457845687866, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000370234833098948, "grad_norm": 8.607583999633789, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8527498245239258, "num_tokens": 796982287.0, "step": 20885 }, { "epoch": 2.656913878641394, "ewc_loss": 0.07433629035949707, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037715191137976944, "grad_norm": 8.789928436279297, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8683241605758667, "num_tokens": 797017640.0, "step": 20886 }, { "epoch": 2.6570410889199847, "ewc_loss": 0.0736059844493866, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00036984894541092217, "grad_norm": 8.570344924926758, "learning_rate": 1e-06, "loss": 0.5096, "mean_token_accuracy": 0.8496090769767761, "num_tokens": 797054050.0, "step": 20887 }, { "epoch": 2.6571682991985752, "ewc_loss": 0.07438215613365173, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003776106459554285, "grad_norm": 8.780874252319336, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8713803291320801, "num_tokens": 797092429.0, "step": 20888 }, { "epoch": 2.6572955094771658, "ewc_loss": 0.0736858919262886, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003706479910761118, "grad_norm": 8.62575626373291, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8630850315093994, "num_tokens": 797125646.0, "step": 20889 }, { "epoch": 2.6574227197557563, "ewc_loss": 0.07436366379261017, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037742574932053685, "grad_norm": 8.759233474731445, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8702824115753174, "num_tokens": 797164560.0, "step": 20890 }, { "epoch": 2.657549930034347, "ewc_loss": 0.0735929012298584, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003721594694070518, "grad_norm": 8.684837341308594, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.8644390106201172, "num_tokens": 797194168.0, "step": 20891 }, { "epoch": 2.6576771403129373, "ewc_loss": 0.07361024618148804, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003747743903659284, "grad_norm": 8.730620384216309, "learning_rate": 1e-06, "loss": 0.4063, "mean_token_accuracy": 0.8864840865135193, "num_tokens": 797227045.0, "step": 20892 }, { "epoch": 2.657804350591528, "ewc_loss": 0.07339636981487274, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037263554986566305, "grad_norm": 8.641850471496582, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8808387517929077, "num_tokens": 797267607.0, "step": 20893 }, { "epoch": 2.6579315608701184, "ewc_loss": 0.07354536652565002, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037412557867355645, "grad_norm": 8.726393699645996, "learning_rate": 1e-06, "loss": 0.4967, "mean_token_accuracy": 0.8554399013519287, "num_tokens": 797309320.0, "step": 20894 }, { "epoch": 2.658058771148709, "ewc_loss": 0.07347743213176727, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037100474582985044, "grad_norm": 8.707807540893555, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8630355596542358, "num_tokens": 797348258.0, "step": 20895 }, { "epoch": 2.6581859814272994, "ewc_loss": 0.07389815151691437, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003727705334313214, "grad_norm": 8.666243553161621, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8652348518371582, "num_tokens": 797391563.0, "step": 20896 }, { "epoch": 2.65831319170589, "ewc_loss": 0.07337304949760437, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003724023699760437, "grad_norm": 8.704379081726074, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8742082118988037, "num_tokens": 797428314.0, "step": 20897 }, { "epoch": 2.6584404019844805, "ewc_loss": 0.07325354218482971, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037120727938599885, "grad_norm": 8.729936599731445, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8448700904846191, "num_tokens": 797464480.0, "step": 20898 }, { "epoch": 2.658567612263071, "ewc_loss": 0.07334291934967041, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037210105801932514, "grad_norm": 8.647833824157715, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8665584325790405, "num_tokens": 797504135.0, "step": 20899 }, { "epoch": 2.658694822541661, "ewc_loss": 0.07338996231555939, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003725714632309973, "grad_norm": 8.74756145477295, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.869015634059906, "num_tokens": 797544222.0, "step": 20900 }, { "epoch": 2.658822032820252, "ewc_loss": 0.07308086007833481, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036948046181350946, "grad_norm": 11.216928482055664, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8713235855102539, "num_tokens": 797583695.0, "step": 20901 }, { "epoch": 2.658949243098842, "ewc_loss": 0.07371864467859268, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000375858333427459, "grad_norm": 8.546180725097656, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8515521883964539, "num_tokens": 797625660.0, "step": 20902 }, { "epoch": 2.659076453377433, "ewc_loss": 0.07584945112466812, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00039716638275422156, "grad_norm": 9.246089935302734, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8643249273300171, "num_tokens": 797665807.0, "step": 20903 }, { "epoch": 2.6592036636560232, "ewc_loss": 0.07240140438079834, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036268593976274133, "grad_norm": 8.47122573852539, "learning_rate": 1e-06, "loss": 0.5205, "mean_token_accuracy": 0.8477235436439514, "num_tokens": 797707935.0, "step": 20904 }, { "epoch": 2.6593308739346138, "ewc_loss": 0.07618250697851181, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0004004969378001988, "grad_norm": 9.263630867004395, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.858696460723877, "num_tokens": 797745502.0, "step": 20905 }, { "epoch": 2.6594580842132043, "ewc_loss": 0.0731106698513031, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036977860145270824, "grad_norm": 8.630400657653809, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8648345470428467, "num_tokens": 797779412.0, "step": 20906 }, { "epoch": 2.659585294491795, "ewc_loss": 0.07566601037979126, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003904492186848074, "grad_norm": 9.103271484375, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8700233101844788, "num_tokens": 797816677.0, "step": 20907 }, { "epoch": 2.6597125047703853, "ewc_loss": 0.07349754869937897, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037364731542766094, "grad_norm": 8.7116060256958, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8480027318000793, "num_tokens": 797854260.0, "step": 20908 }, { "epoch": 2.659839715048976, "ewc_loss": 0.07450151443481445, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003836869727820158, "grad_norm": 8.979073524475098, "learning_rate": 1e-06, "loss": 0.5129, "mean_token_accuracy": 0.8545774221420288, "num_tokens": 797892349.0, "step": 20909 }, { "epoch": 2.6599669253275664, "ewc_loss": 0.07338633388280869, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003725352289620787, "grad_norm": 8.717845916748047, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8681787252426147, "num_tokens": 797928028.0, "step": 20910 }, { "epoch": 2.660094135606157, "ewc_loss": 0.07402117550373077, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000378883647499606, "grad_norm": 8.92550277709961, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8513084053993225, "num_tokens": 797963885.0, "step": 20911 }, { "epoch": 2.6602213458847475, "ewc_loss": 0.07332858443260193, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003719577507581562, "grad_norm": 8.729449272155762, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8517113924026489, "num_tokens": 798009842.0, "step": 20912 }, { "epoch": 2.660348556163338, "ewc_loss": 0.07360631227493286, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037473501288332045, "grad_norm": 8.784255981445312, "learning_rate": 1e-06, "loss": 0.419, "mean_token_accuracy": 0.8774784803390503, "num_tokens": 798043207.0, "step": 20913 }, { "epoch": 2.6604757664419285, "ewc_loss": 0.07349541783332825, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003736260696314275, "grad_norm": 8.728606224060059, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8806291818618774, "num_tokens": 798080887.0, "step": 20914 }, { "epoch": 2.660602976720519, "ewc_loss": 0.07342717796564102, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037294364301487803, "grad_norm": 8.70974349975586, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8749176263809204, "num_tokens": 798126393.0, "step": 20915 }, { "epoch": 2.6607301869991096, "ewc_loss": 0.07360236346721649, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003746955480892211, "grad_norm": 8.745335578918457, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8819646835327148, "num_tokens": 798166099.0, "step": 20916 }, { "epoch": 2.6608573972777, "ewc_loss": 0.07327896356582642, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037146153044886887, "grad_norm": 8.882617950439453, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8598350286483765, "num_tokens": 798202285.0, "step": 20917 }, { "epoch": 2.6609846075562906, "ewc_loss": 0.073124960064888, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003699214430525899, "grad_norm": 8.618926048278809, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.873340904712677, "num_tokens": 798243463.0, "step": 20918 }, { "epoch": 2.661111817834881, "ewc_loss": 0.07369442284107208, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003756161022465676, "grad_norm": 8.79431438446045, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8678731322288513, "num_tokens": 798275507.0, "step": 20919 }, { "epoch": 2.6612390281134717, "ewc_loss": 0.07310023903846741, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036967426422052085, "grad_norm": 8.645411491394043, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8843380808830261, "num_tokens": 798310536.0, "step": 20920 }, { "epoch": 2.661366238392062, "ewc_loss": 0.0737062394618988, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037573432200588286, "grad_norm": 8.790091514587402, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.86155104637146, "num_tokens": 798347874.0, "step": 20921 }, { "epoch": 2.6614934486706527, "ewc_loss": 0.07323945313692093, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003710663877427578, "grad_norm": 8.675760269165039, "learning_rate": 1e-06, "loss": 0.3886, "mean_token_accuracy": 0.8897649645805359, "num_tokens": 798381865.0, "step": 20922 }, { "epoch": 2.661620658949243, "ewc_loss": 0.07386696338653564, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037490014801733196, "grad_norm": 8.78732967376709, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8640788197517395, "num_tokens": 798423378.0, "step": 20923 }, { "epoch": 2.661747869227834, "ewc_loss": 0.07343000173568726, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003705304698087275, "grad_norm": 8.65386962890625, "learning_rate": 1e-06, "loss": 0.4498, "mean_token_accuracy": 0.8726762533187866, "num_tokens": 798460278.0, "step": 20924 }, { "epoch": 2.661875079506424, "ewc_loss": 0.07360587269067764, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037473058910109103, "grad_norm": 8.767908096313477, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8670405149459839, "num_tokens": 798496622.0, "step": 20925 }, { "epoch": 2.662002289785015, "ewc_loss": 0.07323826104402542, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000371054484276101, "grad_norm": 8.609503746032715, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8604127764701843, "num_tokens": 798533061.0, "step": 20926 }, { "epoch": 2.662129500063605, "ewc_loss": 0.07373650372028351, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000376036943634972, "grad_norm": 8.796730995178223, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8585360050201416, "num_tokens": 798573689.0, "step": 20927 }, { "epoch": 2.662256710342196, "ewc_loss": 0.07309764623641968, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003696483909152448, "grad_norm": 8.628087997436523, "learning_rate": 1e-06, "loss": 0.4303, "mean_token_accuracy": 0.8743442296981812, "num_tokens": 798614058.0, "step": 20928 }, { "epoch": 2.662383920620786, "ewc_loss": 0.07395301759243011, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037576069007627666, "grad_norm": 8.764873504638672, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8565031290054321, "num_tokens": 798657769.0, "step": 20929 }, { "epoch": 2.6625111308993765, "ewc_loss": 0.07350780069828033, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003713084734044969, "grad_norm": 8.619720458984375, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8694490194320679, "num_tokens": 798701084.0, "step": 20930 }, { "epoch": 2.662638341177967, "ewc_loss": 0.07374826073646545, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037615446490235627, "grad_norm": 8.786517143249512, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8722667098045349, "num_tokens": 798738029.0, "step": 20931 }, { "epoch": 2.6627655514565576, "ewc_loss": 0.07336042821407318, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037227614666335285, "grad_norm": 8.645224571228027, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8616988062858582, "num_tokens": 798773537.0, "step": 20932 }, { "epoch": 2.662892761735148, "ewc_loss": 0.07377539575099945, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003764258581213653, "grad_norm": 8.780637741088867, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8636236190795898, "num_tokens": 798810397.0, "step": 20933 }, { "epoch": 2.6630199720137386, "ewc_loss": 0.07333629578351974, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003720348177012056, "grad_norm": 8.623832702636719, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8622854351997375, "num_tokens": 798850796.0, "step": 20934 }, { "epoch": 2.663147182292329, "ewc_loss": 0.07411384582519531, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003773688804358244, "grad_norm": 8.77392292022705, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8659002780914307, "num_tokens": 798892379.0, "step": 20935 }, { "epoch": 2.6632743925709197, "ewc_loss": 0.07371443510055542, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037337481626309454, "grad_norm": 8.724871635437012, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.853173017501831, "num_tokens": 798929444.0, "step": 20936 }, { "epoch": 2.66340160284951, "ewc_loss": 0.07380741089582443, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003767459711525589, "grad_norm": 8.830962181091309, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8715949654579163, "num_tokens": 798964086.0, "step": 20937 }, { "epoch": 2.6635288131281007, "ewc_loss": 0.0733395665884018, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003720675013028085, "grad_norm": 8.660299301147461, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8732104897499084, "num_tokens": 799007408.0, "step": 20938 }, { "epoch": 2.6636560234066913, "ewc_loss": 0.07425273954868317, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003787579189520329, "grad_norm": 8.855571746826172, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8700140714645386, "num_tokens": 799045930.0, "step": 20939 }, { "epoch": 2.663783233685282, "ewc_loss": 0.07342074811458588, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000370437977835536, "grad_norm": 8.673877716064453, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8770096302032471, "num_tokens": 799083568.0, "step": 20940 }, { "epoch": 2.6639104439638723, "ewc_loss": 0.07370468974113464, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003757187514565885, "grad_norm": 8.831269264221191, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8627206087112427, "num_tokens": 799123681.0, "step": 20941 }, { "epoch": 2.664037654242463, "ewc_loss": 0.07330100238323212, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037168184644542634, "grad_norm": 8.726166725158691, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8616656064987183, "num_tokens": 799164623.0, "step": 20942 }, { "epoch": 2.6641648645210534, "ewc_loss": 0.07358462363481522, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003745181020349264, "grad_norm": 8.802318572998047, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8653637766838074, "num_tokens": 799200408.0, "step": 20943 }, { "epoch": 2.664292074799644, "ewc_loss": 0.07315957546234131, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037026763311587274, "grad_norm": 8.685612678527832, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8554330468177795, "num_tokens": 799245106.0, "step": 20944 }, { "epoch": 2.6644192850782344, "ewc_loss": 0.07366035878658295, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037527544191107154, "grad_norm": 8.807428359985352, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.8729977011680603, "num_tokens": 799290429.0, "step": 20945 }, { "epoch": 2.664546495356825, "ewc_loss": 0.07319605350494385, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003706324496306479, "grad_norm": 8.752513885498047, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8634927272796631, "num_tokens": 799322951.0, "step": 20946 }, { "epoch": 2.6646737056354155, "ewc_loss": 0.0734659805893898, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037333168438635767, "grad_norm": 8.767199516296387, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8629188537597656, "num_tokens": 799365498.0, "step": 20947 }, { "epoch": 2.6648009159140056, "ewc_loss": 0.07323974370956421, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037106929812580347, "grad_norm": 8.71453857421875, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.8608453869819641, "num_tokens": 799405894.0, "step": 20948 }, { "epoch": 2.6649281261925966, "ewc_loss": 0.07326607406139374, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000371332629583776, "grad_norm": 8.726511001586914, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8641805648803711, "num_tokens": 799439750.0, "step": 20949 }, { "epoch": 2.6650553364711866, "ewc_loss": 0.07337208837270737, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000372392765711993, "grad_norm": 8.75126838684082, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8573870062828064, "num_tokens": 799475170.0, "step": 20950 }, { "epoch": 2.6651825467497776, "ewc_loss": 0.07333050668239594, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003719769592862576, "grad_norm": 8.780766487121582, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.8530306220054626, "num_tokens": 799514231.0, "step": 20951 }, { "epoch": 2.6653097570283677, "ewc_loss": 0.07357297837734222, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003719602827914059, "grad_norm": 8.768494606018066, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8657816648483276, "num_tokens": 799549512.0, "step": 20952 }, { "epoch": 2.6654369673069587, "ewc_loss": 0.07359662652015686, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003721967223100364, "grad_norm": 8.666062355041504, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8605196475982666, "num_tokens": 799592144.0, "step": 20953 }, { "epoch": 2.6655641775855488, "ewc_loss": 0.07363587617874146, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003750306786969304, "grad_norm": 8.749835014343262, "learning_rate": 1e-06, "loss": 0.5088, "mean_token_accuracy": 0.8513649702072144, "num_tokens": 799634779.0, "step": 20954 }, { "epoch": 2.6656913878641393, "ewc_loss": 0.07319901883602142, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003706620482262224, "grad_norm": 8.65718936920166, "learning_rate": 1e-06, "loss": 0.4268, "mean_token_accuracy": 0.8776100277900696, "num_tokens": 799677548.0, "step": 20955 }, { "epoch": 2.66581859814273, "ewc_loss": 0.07368232309818268, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003754950885195285, "grad_norm": 8.796869277954102, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8488243818283081, "num_tokens": 799715679.0, "step": 20956 }, { "epoch": 2.6659458084213203, "ewc_loss": 0.07312923669815063, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003699642838910222, "grad_norm": 8.624011039733887, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8576563596725464, "num_tokens": 799755701.0, "step": 20957 }, { "epoch": 2.666073018699911, "ewc_loss": 0.07380063831806183, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.000376678304746747, "grad_norm": 8.775245666503906, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8607658743858337, "num_tokens": 799795151.0, "step": 20958 }, { "epoch": 2.6662002289785014, "ewc_loss": 0.07351045310497284, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003713350452017039, "grad_norm": 8.65612506866455, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8716472387313843, "num_tokens": 799832249.0, "step": 20959 }, { "epoch": 2.666327439257092, "ewc_loss": 0.07405953109264374, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037682574475184083, "grad_norm": 8.812515258789062, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8441591858863831, "num_tokens": 799867762.0, "step": 20960 }, { "epoch": 2.6664546495356825, "ewc_loss": 0.0732298493385315, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037097037420608103, "grad_norm": 8.608000755310059, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8623409867286682, "num_tokens": 799909012.0, "step": 20961 }, { "epoch": 2.666581859814273, "ewc_loss": 0.07393597066402435, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003780315746553242, "grad_norm": 8.8014497756958, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8625630736351013, "num_tokens": 799950316.0, "step": 20962 }, { "epoch": 2.6667090700928635, "ewc_loss": 0.07322169840335846, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.0003708888543769717, "grad_norm": 8.647843360900879, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8807058930397034, "num_tokens": 799990037.0, "step": 20963 }, { "epoch": 2.666836280371454, "ewc_loss": 0.07422463595867157, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037847686326131225, "grad_norm": 8.907050132751465, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8756685853004456, "num_tokens": 800023935.0, "step": 20964 }, { "epoch": 2.6669634906500446, "ewc_loss": 0.07311004400253296, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00036977234412916005, "grad_norm": 8.63960075378418, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8727507591247559, "num_tokens": 800066357.0, "step": 20965 }, { "epoch": 2.667090700928635, "ewc_loss": 0.07408755272626877, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003771060146391392, "grad_norm": 8.837248802185059, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8506468534469604, "num_tokens": 800100667.0, "step": 20966 }, { "epoch": 2.6672179112072256, "ewc_loss": 0.07338330894708633, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003700635570567101, "grad_norm": 8.628453254699707, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8702684044837952, "num_tokens": 800135101.0, "step": 20967 }, { "epoch": 2.667345121485816, "ewc_loss": 0.07409580796957016, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037718855310231447, "grad_norm": 8.808404922485352, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8808085918426514, "num_tokens": 800176308.0, "step": 20968 }, { "epoch": 2.6674723317644067, "ewc_loss": 0.07342381775379181, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003704686532728374, "grad_norm": 8.680817604064941, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8765645027160645, "num_tokens": 800205585.0, "step": 20969 }, { "epoch": 2.667599542042997, "ewc_loss": 0.07400194555521011, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037624992546625435, "grad_norm": 8.810285568237305, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8650518655776978, "num_tokens": 800241400.0, "step": 20970 }, { "epoch": 2.6677267523215877, "ewc_loss": 0.07350610196590424, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003712914767675102, "grad_norm": 8.706777572631836, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8735988140106201, "num_tokens": 800279060.0, "step": 20971 }, { "epoch": 2.6678539626001783, "ewc_loss": 0.0737387016415596, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003736174840014428, "grad_norm": 8.709559440612793, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8746231198310852, "num_tokens": 800318606.0, "step": 20972 }, { "epoch": 2.6679811728787683, "ewc_loss": 0.07357846200466156, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037201508530415595, "grad_norm": 8.730835914611816, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.865481972694397, "num_tokens": 800356795.0, "step": 20973 }, { "epoch": 2.6681083831573593, "ewc_loss": 0.07366704940795898, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003729010059032589, "grad_norm": 8.680811882019043, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8749146461486816, "num_tokens": 800393786.0, "step": 20974 }, { "epoch": 2.6682355934359494, "ewc_loss": 0.07373321056365967, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003735625359695405, "grad_norm": 8.681778907775879, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8629046082496643, "num_tokens": 800440153.0, "step": 20975 }, { "epoch": 2.6683628037145404, "ewc_loss": 0.07378882169723511, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037411871016956866, "grad_norm": 8.748106002807617, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.847282886505127, "num_tokens": 800476343.0, "step": 20976 }, { "epoch": 2.6684900139931305, "ewc_loss": 0.07378905266523361, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003741209802683443, "grad_norm": 8.69702434539795, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8788774013519287, "num_tokens": 800514227.0, "step": 20977 }, { "epoch": 2.668617224271721, "ewc_loss": 0.07367932051420212, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037302367854863405, "grad_norm": 8.718568801879883, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8644758462905884, "num_tokens": 800552357.0, "step": 20978 }, { "epoch": 2.6687444345503115, "ewc_loss": 0.07374091446399689, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037363963201642036, "grad_norm": 8.77652359008789, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8742403388023376, "num_tokens": 800587624.0, "step": 20979 }, { "epoch": 2.668871644828902, "ewc_loss": 0.07357583940029144, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037198886275291443, "grad_norm": 8.71618938446045, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8592698574066162, "num_tokens": 800631183.0, "step": 20980 }, { "epoch": 2.6689988551074926, "ewc_loss": 0.07377228140830994, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003739532839972526, "grad_norm": 8.670917510986328, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8691315054893494, "num_tokens": 800670334.0, "step": 20981 }, { "epoch": 2.669126065386083, "ewc_loss": 0.07363063097000122, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003725367714650929, "grad_norm": 8.802979469299316, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8626201748847961, "num_tokens": 800703500.0, "step": 20982 }, { "epoch": 2.6692532756646736, "ewc_loss": 0.07314319908618927, "ewc_loss_diag": 3.62396240234375e-05, "ewc_loss_parallel": 0.00037010383675806224, "grad_norm": 8.610732078552246, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8580288887023926, "num_tokens": 800742904.0, "step": 20983 }, { "epoch": 2.669380485943264, "ewc_loss": 0.07403784990310669, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037660892121493816, "grad_norm": 8.746488571166992, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8621745109558105, "num_tokens": 800777556.0, "step": 20984 }, { "epoch": 2.6695076962218547, "ewc_loss": 0.0734362006187439, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037059251917526126, "grad_norm": 8.81129264831543, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8559572100639343, "num_tokens": 800817532.0, "step": 20985 }, { "epoch": 2.669634906500445, "ewc_loss": 0.0735703632235527, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003719341184478253, "grad_norm": 8.658639907836914, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8751909136772156, "num_tokens": 800856567.0, "step": 20986 }, { "epoch": 2.6697621167790357, "ewc_loss": 0.07376639544963837, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003738944069482386, "grad_norm": 8.719454765319824, "learning_rate": 1e-06, "loss": 0.5315, "mean_token_accuracy": 0.8432109355926514, "num_tokens": 800901234.0, "step": 20987 }, { "epoch": 2.6698893270576263, "ewc_loss": 0.07356981933116913, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003719286760315299, "grad_norm": 8.66970443725586, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8626886606216431, "num_tokens": 800942647.0, "step": 20988 }, { "epoch": 2.670016537336217, "ewc_loss": 0.07385028153657913, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003747332957573235, "grad_norm": 8.716203689575195, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8728911876678467, "num_tokens": 800980194.0, "step": 20989 }, { "epoch": 2.6701437476148073, "ewc_loss": 0.07368221879005432, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037305266596376896, "grad_norm": 8.731451988220215, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8704270124435425, "num_tokens": 801016978.0, "step": 20990 }, { "epoch": 2.670270957893398, "ewc_loss": 0.07382206618785858, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037445107591338456, "grad_norm": 8.756437301635742, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8513728380203247, "num_tokens": 801051921.0, "step": 20991 }, { "epoch": 2.6703981681719884, "ewc_loss": 0.07370360195636749, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037326652090996504, "grad_norm": 8.744140625, "learning_rate": 1e-06, "loss": 0.5328, "mean_token_accuracy": 0.8399782776832581, "num_tokens": 801087189.0, "step": 20992 }, { "epoch": 2.670525378450579, "ewc_loss": 0.07375727593898773, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037380322464741766, "grad_norm": 8.752225875854492, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8715274930000305, "num_tokens": 801121574.0, "step": 20993 }, { "epoch": 2.6706525887291694, "ewc_loss": 0.0737658143043518, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003738886152859777, "grad_norm": 8.659590721130371, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8713382482528687, "num_tokens": 801160497.0, "step": 20994 }, { "epoch": 2.67077979900776, "ewc_loss": 0.07386445999145508, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037487511872313917, "grad_norm": 8.730690002441406, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8681536912918091, "num_tokens": 801198556.0, "step": 20995 }, { "epoch": 2.6709070092863505, "ewc_loss": 0.07382742315530777, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037450468516908586, "grad_norm": 8.71406364440918, "learning_rate": 1e-06, "loss": 0.3796, "mean_token_accuracy": 0.8920295238494873, "num_tokens": 801230616.0, "step": 20996 }, { "epoch": 2.671034219564941, "ewc_loss": 0.07378717511892319, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037410223740153015, "grad_norm": 8.773792266845703, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8643578290939331, "num_tokens": 801266582.0, "step": 20997 }, { "epoch": 2.671161429843531, "ewc_loss": 0.0736597329378128, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037282780976966023, "grad_norm": 8.751033782958984, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8775289058685303, "num_tokens": 801305127.0, "step": 20998 }, { "epoch": 2.671288640122122, "ewc_loss": 0.07378631830215454, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037409362266771495, "grad_norm": 8.655385971069336, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8699650168418884, "num_tokens": 801348284.0, "step": 20999 }, { "epoch": 2.671415850400712, "ewc_loss": 0.07378275692462921, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003740580577868968, "grad_norm": 8.65453815460205, "learning_rate": 1e-06, "loss": 0.3982, "mean_token_accuracy": 0.8868718147277832, "num_tokens": 801385298.0, "step": 21000 }, { "epoch": 2.671543060679303, "ewc_loss": 0.07372409105300903, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037347138277255, "grad_norm": 8.78292179107666, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8571751117706299, "num_tokens": 801417020.0, "step": 21001 }, { "epoch": 2.6716702709578932, "ewc_loss": 0.07358868420124054, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037211732706055045, "grad_norm": 8.654970169067383, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8590466976165771, "num_tokens": 801455328.0, "step": 21002 }, { "epoch": 2.6717974812364838, "ewc_loss": 0.07397396862506866, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037597017944790423, "grad_norm": 8.705223083496094, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8759282827377319, "num_tokens": 801494342.0, "step": 21003 }, { "epoch": 2.6719246915150743, "ewc_loss": 0.07364930212497711, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037272347253747284, "grad_norm": 8.681395530700684, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8693515062332153, "num_tokens": 801528182.0, "step": 21004 }, { "epoch": 2.672051901793665, "ewc_loss": 0.07389292120933533, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003751596959773451, "grad_norm": 8.69826602935791, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8647646307945251, "num_tokens": 801569424.0, "step": 21005 }, { "epoch": 2.6721791120722553, "ewc_loss": 0.07370655238628387, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003732960030902177, "grad_norm": 8.704444885253906, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8650782704353333, "num_tokens": 801608809.0, "step": 21006 }, { "epoch": 2.672306322350846, "ewc_loss": 0.07405668497085571, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037435590638779104, "grad_norm": 8.710413932800293, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8563443422317505, "num_tokens": 801651910.0, "step": 21007 }, { "epoch": 2.6724335326294364, "ewc_loss": 0.07368464767932892, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003730769967660308, "grad_norm": 8.728704452514648, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8654282689094543, "num_tokens": 801692961.0, "step": 21008 }, { "epoch": 2.672560742908027, "ewc_loss": 0.0736851692199707, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037308220635168254, "grad_norm": 8.683530807495117, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8536410927772522, "num_tokens": 801731278.0, "step": 21009 }, { "epoch": 2.6726879531866174, "ewc_loss": 0.07387778162956238, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037500832695513964, "grad_norm": 8.741110801696777, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8621220588684082, "num_tokens": 801769562.0, "step": 21010 }, { "epoch": 2.672815163465208, "ewc_loss": 0.07371433079242706, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003733737685251981, "grad_norm": 8.683537483215332, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.86231529712677, "num_tokens": 801811046.0, "step": 21011 }, { "epoch": 2.6729423737437985, "ewc_loss": 0.07385954260826111, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037482590414583683, "grad_norm": 8.809218406677246, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.842455267906189, "num_tokens": 801847833.0, "step": 21012 }, { "epoch": 2.673069584022389, "ewc_loss": 0.07361061125993729, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000372336566215381, "grad_norm": 8.616426467895508, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8596661686897278, "num_tokens": 801887819.0, "step": 21013 }, { "epoch": 2.6731967943009796, "ewc_loss": 0.07404869794845581, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003767174493987113, "grad_norm": 8.766460418701172, "learning_rate": 1e-06, "loss": 0.4265, "mean_token_accuracy": 0.8780828714370728, "num_tokens": 801922535.0, "step": 21014 }, { "epoch": 2.67332400457957, "ewc_loss": 0.07350293546915054, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037125981179997325, "grad_norm": 8.636263847351074, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8705809116363525, "num_tokens": 801958025.0, "step": 21015 }, { "epoch": 2.6734512148581606, "ewc_loss": 0.07399459183216095, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003761764382943511, "grad_norm": 8.736554145812988, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8690789937973022, "num_tokens": 801991291.0, "step": 21016 }, { "epoch": 2.673578425136751, "ewc_loss": 0.07353194057941437, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003715498314704746, "grad_norm": 8.677370071411133, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8676660060882568, "num_tokens": 802024952.0, "step": 21017 }, { "epoch": 2.6737056354153417, "ewc_loss": 0.07392340153455734, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000375464471289888, "grad_norm": 8.723956108093262, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8643137812614441, "num_tokens": 802065741.0, "step": 21018 }, { "epoch": 2.673832845693932, "ewc_loss": 0.07363274693489075, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037255798815749586, "grad_norm": 8.651164054870605, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8633559942245483, "num_tokens": 802107153.0, "step": 21019 }, { "epoch": 2.6739600559725227, "ewc_loss": 0.07379333674907684, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003741638211067766, "grad_norm": 8.673480033874512, "learning_rate": 1e-06, "loss": 0.4426, "mean_token_accuracy": 0.869975745677948, "num_tokens": 802148177.0, "step": 21020 }, { "epoch": 2.674087266251113, "ewc_loss": 0.07378312200307846, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003740616957657039, "grad_norm": 8.712967872619629, "learning_rate": 1e-06, "loss": 0.4544, "mean_token_accuracy": 0.8659259080886841, "num_tokens": 802186144.0, "step": 21021 }, { "epoch": 2.674214476529704, "ewc_loss": 0.0737524926662445, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003737554361578077, "grad_norm": 11.341306686401367, "learning_rate": 1e-06, "loss": 0.4194, "mean_token_accuracy": 0.8808674812316895, "num_tokens": 802224054.0, "step": 21022 }, { "epoch": 2.674341686808294, "ewc_loss": 0.07444630563259125, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038069349830038846, "grad_norm": 8.594610214233398, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8497954607009888, "num_tokens": 802264483.0, "step": 21023 }, { "epoch": 2.674468897086885, "ewc_loss": 0.07658768445253372, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004021073109470308, "grad_norm": 9.289522171020508, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8735049366950989, "num_tokens": 802302933.0, "step": 21024 }, { "epoch": 2.674596107365475, "ewc_loss": 0.07310998439788818, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036733029992319643, "grad_norm": 8.496296882629395, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8593448400497437, "num_tokens": 802347354.0, "step": 21025 }, { "epoch": 2.674723317644066, "ewc_loss": 0.07679278403520584, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004041583160869777, "grad_norm": 9.19316577911377, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.876021146774292, "num_tokens": 802392159.0, "step": 21026 }, { "epoch": 2.674850527922656, "ewc_loss": 0.07377146184444427, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037394504761323333, "grad_norm": 8.638411521911621, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8688345551490784, "num_tokens": 802432629.0, "step": 21027 }, { "epoch": 2.6749777382012465, "ewc_loss": 0.0756625086069107, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003928555815946311, "grad_norm": 9.02669906616211, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8758306503295898, "num_tokens": 802471776.0, "step": 21028 }, { "epoch": 2.675104948479837, "ewc_loss": 0.07419285178184509, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037815902032889426, "grad_norm": 8.816261291503906, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8582364320755005, "num_tokens": 802507314.0, "step": 21029 }, { "epoch": 2.6752321587584276, "ewc_loss": 0.0747876688838005, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038410714478231966, "grad_norm": 8.933332443237305, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8745671510696411, "num_tokens": 802550587.0, "step": 21030 }, { "epoch": 2.675359369037018, "ewc_loss": 0.07414117455482483, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037764222361147404, "grad_norm": 8.828667640686035, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.865211009979248, "num_tokens": 802591544.0, "step": 21031 }, { "epoch": 2.6754865793156086, "ewc_loss": 0.07431644201278687, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003793949435930699, "grad_norm": 8.839876174926758, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8739445209503174, "num_tokens": 802627336.0, "step": 21032 }, { "epoch": 2.675613789594199, "ewc_loss": 0.07413847744464874, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003776152152568102, "grad_norm": 8.804089546203613, "learning_rate": 1e-06, "loss": 0.4751, "mean_token_accuracy": 0.8659905791282654, "num_tokens": 802668358.0, "step": 21033 }, { "epoch": 2.6757409998727897, "ewc_loss": 0.0739343985915184, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000375574454665184, "grad_norm": 8.76343059539795, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8726645112037659, "num_tokens": 802708023.0, "step": 21034 }, { "epoch": 2.67586821015138, "ewc_loss": 0.07412964105606079, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037752691423520446, "grad_norm": 8.805980682373047, "learning_rate": 1e-06, "loss": 0.4087, "mean_token_accuracy": 0.8880435228347778, "num_tokens": 802745856.0, "step": 21035 }, { "epoch": 2.6759954204299707, "ewc_loss": 0.07372668385505676, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037349731428548694, "grad_norm": 8.728771209716797, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.877059817314148, "num_tokens": 802779032.0, "step": 21036 }, { "epoch": 2.6761226307085613, "ewc_loss": 0.07411327213048935, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037736320518888533, "grad_norm": 8.82640266418457, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8658677339553833, "num_tokens": 802815736.0, "step": 21037 }, { "epoch": 2.676249840987152, "ewc_loss": 0.07366624474525452, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003728929441422224, "grad_norm": 8.70761489868164, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8603324294090271, "num_tokens": 802854133.0, "step": 21038 }, { "epoch": 2.6763770512657423, "ewc_loss": 0.07411710172891617, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037740147672593594, "grad_norm": 8.811429023742676, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8729188442230225, "num_tokens": 802892969.0, "step": 21039 }, { "epoch": 2.676504261544333, "ewc_loss": 0.07371251285076141, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037335563683882356, "grad_norm": 8.7013578414917, "learning_rate": 1e-06, "loss": 0.4184, "mean_token_accuracy": 0.8814462423324585, "num_tokens": 802936998.0, "step": 21040 }, { "epoch": 2.6766314718229234, "ewc_loss": 0.07412195205688477, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003774499346036464, "grad_norm": 8.758988380432129, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8742854595184326, "num_tokens": 802979040.0, "step": 21041 }, { "epoch": 2.676758682101514, "ewc_loss": 0.07391048967838287, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037533536669798195, "grad_norm": 8.720763206481934, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8763410449028015, "num_tokens": 803016516.0, "step": 21042 }, { "epoch": 2.6768858923801044, "ewc_loss": 0.07405999302864075, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037683043046854436, "grad_norm": 8.777708053588867, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8645133376121521, "num_tokens": 803055566.0, "step": 21043 }, { "epoch": 2.677013102658695, "ewc_loss": 0.07390405237674713, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003752709599211812, "grad_norm": 8.709935188293457, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.869132936000824, "num_tokens": 803093683.0, "step": 21044 }, { "epoch": 2.6771403129372855, "ewc_loss": 0.07402835786342621, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003765140427276492, "grad_norm": 8.729591369628906, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8747611045837402, "num_tokens": 803133691.0, "step": 21045 }, { "epoch": 2.6772675232158756, "ewc_loss": 0.07402237504720688, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003764542343560606, "grad_norm": 8.75406265258789, "learning_rate": 1e-06, "loss": 0.4233, "mean_token_accuracy": 0.8758793473243713, "num_tokens": 803169870.0, "step": 21046 }, { "epoch": 2.6773947334944665, "ewc_loss": 0.0740009993314743, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000376240408513695, "grad_norm": 8.73951530456543, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.858661949634552, "num_tokens": 803209038.0, "step": 21047 }, { "epoch": 2.6775219437730566, "ewc_loss": 0.07400961220264435, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037632655585184693, "grad_norm": 8.788375854492188, "learning_rate": 1e-06, "loss": 0.4923, "mean_token_accuracy": 0.8522653579711914, "num_tokens": 803239725.0, "step": 21048 }, { "epoch": 2.6776491540516476, "ewc_loss": 0.07386423647403717, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037487284862436354, "grad_norm": 8.746003150939941, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8665541410446167, "num_tokens": 803273580.0, "step": 21049 }, { "epoch": 2.6777763643302377, "ewc_loss": 0.07403261214494705, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037655659252777696, "grad_norm": 8.71745777130127, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8728417158126831, "num_tokens": 803313583.0, "step": 21050 }, { "epoch": 2.6779035746088287, "ewc_loss": 0.07396833598613739, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037591383443214, "grad_norm": 8.763991355895996, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8739559054374695, "num_tokens": 803347517.0, "step": 21051 }, { "epoch": 2.6780307848874187, "ewc_loss": 0.07389001548290253, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003751305921468884, "grad_norm": 8.653031349182129, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8531860113143921, "num_tokens": 803388183.0, "step": 21052 }, { "epoch": 2.6781579951660093, "ewc_loss": 0.07432200014591217, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037945041549392045, "grad_norm": 11.399450302124023, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8678896427154541, "num_tokens": 803428377.0, "step": 21053 }, { "epoch": 2.6782852054446, "ewc_loss": 0.07485450804233551, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000384775543352589, "grad_norm": 8.651577949523926, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8769540190696716, "num_tokens": 803460930.0, "step": 21054 }, { "epoch": 2.6784124157231903, "ewc_loss": 0.0772315040230751, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00040854551480151713, "grad_norm": 9.365335464477539, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8757429122924805, "num_tokens": 803497971.0, "step": 21055 }, { "epoch": 2.678539626001781, "ewc_loss": 0.07347230613231659, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003709535812959075, "grad_norm": 8.58590316772461, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.8815475106239319, "num_tokens": 803535514.0, "step": 21056 }, { "epoch": 2.6786668362803714, "ewc_loss": 0.07723391056060791, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00040856958366930485, "grad_norm": 9.374967575073242, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8611710667610168, "num_tokens": 803580767.0, "step": 21057 }, { "epoch": 2.678794046558962, "ewc_loss": 0.07390987128019333, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037532916758209467, "grad_norm": 8.660273551940918, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8682315349578857, "num_tokens": 803619168.0, "step": 21058 }, { "epoch": 2.6789212568375524, "ewc_loss": 0.07618703693151474, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003981008194386959, "grad_norm": 9.1456298828125, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8518227338790894, "num_tokens": 803654785.0, "step": 21059 }, { "epoch": 2.679048467116143, "ewc_loss": 0.07412064075469971, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003774368960876018, "grad_norm": 8.763528823852539, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8755199909210205, "num_tokens": 803692505.0, "step": 21060 }, { "epoch": 2.6791756773947335, "ewc_loss": 0.07531049847602844, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038933547330088913, "grad_norm": 8.925705909729004, "learning_rate": 1e-06, "loss": 0.4324, "mean_token_accuracy": 0.8738917708396912, "num_tokens": 803732592.0, "step": 21061 }, { "epoch": 2.679302887673324, "ewc_loss": 0.07430549710988998, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003792854258790612, "grad_norm": 8.833373069763184, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8549923896789551, "num_tokens": 803767601.0, "step": 21062 }, { "epoch": 2.6794300979519146, "ewc_loss": 0.07451239228248596, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003813543589785695, "grad_norm": 8.838409423828125, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8584482669830322, "num_tokens": 803804329.0, "step": 21063 }, { "epoch": 2.679557308230505, "ewc_loss": 0.07447357475757599, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038096625939942896, "grad_norm": 8.856569290161133, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8624534606933594, "num_tokens": 803846182.0, "step": 21064 }, { "epoch": 2.6796845185090956, "ewc_loss": 0.07403138279914856, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037654433981515467, "grad_norm": 8.823102951049805, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8615736961364746, "num_tokens": 803883561.0, "step": 21065 }, { "epoch": 2.679811728787686, "ewc_loss": 0.07424449920654297, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000378675467800349, "grad_norm": 8.867883682250977, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8860811591148376, "num_tokens": 803916860.0, "step": 21066 }, { "epoch": 2.6799389390662767, "ewc_loss": 0.0738515630364418, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037474610144272447, "grad_norm": 8.746804237365723, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8673450350761414, "num_tokens": 803951949.0, "step": 21067 }, { "epoch": 2.680066149344867, "ewc_loss": 0.07404281198978424, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037665863055735826, "grad_norm": 8.770813941955566, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8608018159866333, "num_tokens": 803992094.0, "step": 21068 }, { "epoch": 2.6801933596234577, "ewc_loss": 0.07392479479312897, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037547844112850726, "grad_norm": 8.757173538208008, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8652482628822327, "num_tokens": 804034058.0, "step": 21069 }, { "epoch": 2.6803205699020483, "ewc_loss": 0.07399841398000717, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003762145934160799, "grad_norm": 8.724279403686523, "learning_rate": 1e-06, "loss": 0.4895, "mean_token_accuracy": 0.8585894703865051, "num_tokens": 804072722.0, "step": 21070 }, { "epoch": 2.6804477801806383, "ewc_loss": 0.07404892146587372, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037671971949748695, "grad_norm": 8.735668182373047, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8737572431564331, "num_tokens": 804114253.0, "step": 21071 }, { "epoch": 2.6805749904592293, "ewc_loss": 0.0740322470664978, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003765529254451394, "grad_norm": 8.78097152709961, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8691607117652893, "num_tokens": 804154877.0, "step": 21072 }, { "epoch": 2.6807022007378194, "ewc_loss": 0.07395823299884796, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037581281503662467, "grad_norm": 8.773555755615234, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.86170494556427, "num_tokens": 804191060.0, "step": 21073 }, { "epoch": 2.6808294110164104, "ewc_loss": 0.07393371313810349, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003755676152650267, "grad_norm": 8.772332191467285, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8585258722305298, "num_tokens": 804231574.0, "step": 21074 }, { "epoch": 2.6809566212950005, "ewc_loss": 0.07380867004394531, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003743171691894531, "grad_norm": 8.771139144897461, "learning_rate": 1e-06, "loss": 0.5239, "mean_token_accuracy": 0.8456403017044067, "num_tokens": 804264027.0, "step": 21075 }, { "epoch": 2.681083831573591, "ewc_loss": 0.0739331841468811, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003755622892640531, "grad_norm": 8.76551342010498, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8686790466308594, "num_tokens": 804302906.0, "step": 21076 }, { "epoch": 2.6812110418521815, "ewc_loss": 0.07398073375225067, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003760378167498857, "grad_norm": 8.743968963623047, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8611627221107483, "num_tokens": 804339107.0, "step": 21077 }, { "epoch": 2.681338252130772, "ewc_loss": 0.07407190650701523, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003769495524466038, "grad_norm": 8.820786476135254, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8708125352859497, "num_tokens": 804374292.0, "step": 21078 }, { "epoch": 2.6814654624093626, "ewc_loss": 0.07374106347560883, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037364105810411274, "grad_norm": 8.678120613098145, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8690171241760254, "num_tokens": 804420287.0, "step": 21079 }, { "epoch": 2.681592672687953, "ewc_loss": 0.07425042986869812, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003787347231991589, "grad_norm": 8.775358200073242, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8719803094863892, "num_tokens": 804453977.0, "step": 21080 }, { "epoch": 2.6817198829665436, "ewc_loss": 0.073736771941185, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037359818816185, "grad_norm": 8.691791534423828, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8506462574005127, "num_tokens": 804494958.0, "step": 21081 }, { "epoch": 2.681847093245134, "ewc_loss": 0.07431960850954056, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003794265503529459, "grad_norm": 8.853477478027344, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8468912839889526, "num_tokens": 804532209.0, "step": 21082 }, { "epoch": 2.6819743035237247, "ewc_loss": 0.07370403409004211, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003732707700692117, "grad_norm": 8.633489608764648, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8681323528289795, "num_tokens": 804572937.0, "step": 21083 }, { "epoch": 2.682101513802315, "ewc_loss": 0.07439526170492172, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038018307532183826, "grad_norm": 8.82738208770752, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.8657367825508118, "num_tokens": 804610263.0, "step": 21084 }, { "epoch": 2.6822287240809057, "ewc_loss": 0.07368600368499756, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037309047183953226, "grad_norm": 8.60180950164795, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8703552484512329, "num_tokens": 804645516.0, "step": 21085 }, { "epoch": 2.6823559343594963, "ewc_loss": 0.07456521689891815, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003818825935013592, "grad_norm": 8.840936660766602, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8783662915229797, "num_tokens": 804684540.0, "step": 21086 }, { "epoch": 2.682483144638087, "ewc_loss": 0.07375122606754303, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037374268868006766, "grad_norm": 8.686673164367676, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.8686317205429077, "num_tokens": 804723030.0, "step": 21087 }, { "epoch": 2.6826103549166773, "ewc_loss": 0.07444217056035995, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003806521708611399, "grad_norm": 8.802088737487793, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8752960562705994, "num_tokens": 804759202.0, "step": 21088 }, { "epoch": 2.682737565195268, "ewc_loss": 0.07379195094108582, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037414993857964873, "grad_norm": 8.71355152130127, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8639020919799805, "num_tokens": 804803199.0, "step": 21089 }, { "epoch": 2.6828647754738584, "ewc_loss": 0.07430265843868256, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037925702054053545, "grad_norm": 8.863362312316895, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8630837798118591, "num_tokens": 804835675.0, "step": 21090 }, { "epoch": 2.682991985752449, "ewc_loss": 0.07374295592308044, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003736600046977401, "grad_norm": 8.645254135131836, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8533151745796204, "num_tokens": 804876583.0, "step": 21091 }, { "epoch": 2.6831191960310394, "ewc_loss": 0.0742814764380455, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003790452319663018, "grad_norm": 8.885828971862793, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8559468984603882, "num_tokens": 804906110.0, "step": 21092 }, { "epoch": 2.68324640630963, "ewc_loss": 0.07357935607433319, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003720239910762757, "grad_norm": 8.604745864868164, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8582208156585693, "num_tokens": 804942196.0, "step": 21093 }, { "epoch": 2.6833736165882205, "ewc_loss": 0.07447253167629242, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038095578202046454, "grad_norm": 8.83247184753418, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8655813932418823, "num_tokens": 804978760.0, "step": 21094 }, { "epoch": 2.683500826866811, "ewc_loss": 0.07343120872974396, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037054254789836705, "grad_norm": 8.72586727142334, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.858220100402832, "num_tokens": 805014212.0, "step": 21095 }, { "epoch": 2.683628037145401, "ewc_loss": 0.07413972169160843, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003776277008000761, "grad_norm": 8.730195045471191, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8900787830352783, "num_tokens": 805050528.0, "step": 21096 }, { "epoch": 2.683755247423992, "ewc_loss": 0.07378556579351425, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003740861138794571, "grad_norm": 8.683242797851562, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8687101602554321, "num_tokens": 805087355.0, "step": 21097 }, { "epoch": 2.683882457702582, "ewc_loss": 0.07393951714038849, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037562567740678787, "grad_norm": 8.769758224487305, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8525117635726929, "num_tokens": 805117837.0, "step": 21098 }, { "epoch": 2.684009667981173, "ewc_loss": 0.0738389790058136, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037462028558366, "grad_norm": 8.65149211883545, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8654965162277222, "num_tokens": 805157734.0, "step": 21099 }, { "epoch": 2.684136878259763, "ewc_loss": 0.07419584691524506, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003781889099627733, "grad_norm": 8.774313926696777, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8734199404716492, "num_tokens": 805190657.0, "step": 21100 }, { "epoch": 2.6842640885383537, "ewc_loss": 0.07370491325855255, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003732795885298401, "grad_norm": 8.655025482177734, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8577936887741089, "num_tokens": 805229654.0, "step": 21101 }, { "epoch": 2.6843912988169443, "ewc_loss": 0.07422824949026108, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037851298111490905, "grad_norm": 8.75375747680664, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8661068081855774, "num_tokens": 805269199.0, "step": 21102 }, { "epoch": 2.684518509095535, "ewc_loss": 0.07376786321401596, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037390910438261926, "grad_norm": 8.685212135314941, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8605657815933228, "num_tokens": 805308918.0, "step": 21103 }, { "epoch": 2.6846457193741253, "ewc_loss": 0.07433441281318665, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003771332267206162, "grad_norm": 8.713699340820312, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8621379733085632, "num_tokens": 805345014.0, "step": 21104 }, { "epoch": 2.684772929652716, "ewc_loss": 0.07400760054588318, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037630650331266224, "grad_norm": 8.703845977783203, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8798496723175049, "num_tokens": 805383674.0, "step": 21105 }, { "epoch": 2.6849001399313064, "ewc_loss": 0.07390691339969635, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037529956898652017, "grad_norm": 8.789166450500488, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8696383237838745, "num_tokens": 805419765.0, "step": 21106 }, { "epoch": 2.685027350209897, "ewc_loss": 0.07389728724956512, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037520332261919975, "grad_norm": 8.754539489746094, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8662389516830444, "num_tokens": 805461346.0, "step": 21107 }, { "epoch": 2.6851545604884874, "ewc_loss": 0.07396561652421951, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037588662235066295, "grad_norm": 8.753880500793457, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8738001585006714, "num_tokens": 805494519.0, "step": 21108 }, { "epoch": 2.685281770767078, "ewc_loss": 0.07378602027893066, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003740906831808388, "grad_norm": 8.769492149353027, "learning_rate": 1e-06, "loss": 0.5022, "mean_token_accuracy": 0.8549588918685913, "num_tokens": 805530542.0, "step": 21109 }, { "epoch": 2.6854089810456685, "ewc_loss": 0.0737491101026535, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003737215884029865, "grad_norm": 8.708202362060547, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8629355430603027, "num_tokens": 805564531.0, "step": 21110 }, { "epoch": 2.685536191324259, "ewc_loss": 0.07384680211544037, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003746984584722668, "grad_norm": 8.768296241760254, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8623351454734802, "num_tokens": 805606011.0, "step": 21111 }, { "epoch": 2.6856634016028496, "ewc_loss": 0.0736735612154007, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003729660529643297, "grad_norm": 8.743470191955566, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.8610848188400269, "num_tokens": 805643078.0, "step": 21112 }, { "epoch": 2.68579061188144, "ewc_loss": 0.07373271882534027, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037355764652602375, "grad_norm": 8.698480606079102, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8706543445587158, "num_tokens": 805678554.0, "step": 21113 }, { "epoch": 2.6859178221600306, "ewc_loss": 0.07364647090435028, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037269521271809936, "grad_norm": 8.782548904418945, "learning_rate": 1e-06, "loss": 0.4272, "mean_token_accuracy": 0.8759188652038574, "num_tokens": 805713736.0, "step": 21114 }, { "epoch": 2.686045032438621, "ewc_loss": 0.07351206243038177, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000371351110516116, "grad_norm": 8.704425811767578, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8606210947036743, "num_tokens": 805751798.0, "step": 21115 }, { "epoch": 2.6861722427172117, "ewc_loss": 0.07371506094932556, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003733811026904732, "grad_norm": 8.710029602050781, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8743876814842224, "num_tokens": 805789205.0, "step": 21116 }, { "epoch": 2.686299452995802, "ewc_loss": 0.07367987930774689, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003730292373802513, "grad_norm": 8.746183395385742, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.8691019415855408, "num_tokens": 805824910.0, "step": 21117 }, { "epoch": 2.6864266632743927, "ewc_loss": 0.07362085580825806, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037243906990624964, "grad_norm": 8.711105346679688, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8629082441329956, "num_tokens": 805861335.0, "step": 21118 }, { "epoch": 2.686553873552983, "ewc_loss": 0.07384620606899261, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037469257949851453, "grad_norm": 9.35103702545166, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8627119064331055, "num_tokens": 805901488.0, "step": 21119 }, { "epoch": 2.686681083831574, "ewc_loss": 0.07275403290987015, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036377081414684653, "grad_norm": 8.556661605834961, "learning_rate": 1e-06, "loss": 0.578, "mean_token_accuracy": 0.8283262252807617, "num_tokens": 805941336.0, "step": 21120 }, { "epoch": 2.686808294110164, "ewc_loss": 0.0745098814368248, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003813292714767158, "grad_norm": 8.916108131408691, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.873833179473877, "num_tokens": 805974222.0, "step": 21121 }, { "epoch": 2.686935504388755, "ewc_loss": 0.07284267246723175, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003646571421995759, "grad_norm": 8.471054077148438, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.862335741519928, "num_tokens": 806014942.0, "step": 21122 }, { "epoch": 2.687062714667345, "ewc_loss": 0.07473474740982056, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003835779498331249, "grad_norm": 8.971757888793945, "learning_rate": 1e-06, "loss": 0.4229, "mean_token_accuracy": 0.8768646121025085, "num_tokens": 806048894.0, "step": 21123 }, { "epoch": 2.687189924945936, "ewc_loss": 0.07309653609991074, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003671958402264863, "grad_norm": 8.534675598144531, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.8646940588951111, "num_tokens": 806089714.0, "step": 21124 }, { "epoch": 2.687317135224526, "ewc_loss": 0.07478535175323486, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003840839781332761, "grad_norm": 8.935728073120117, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8539438247680664, "num_tokens": 806128974.0, "step": 21125 }, { "epoch": 2.6874443455031165, "ewc_loss": 0.07339587807655334, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003701892856042832, "grad_norm": 8.663167953491211, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8639136552810669, "num_tokens": 806169857.0, "step": 21126 }, { "epoch": 2.687571555781707, "ewc_loss": 0.07436511665582657, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037988164694979787, "grad_norm": 8.909605026245117, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8679419159889221, "num_tokens": 806202826.0, "step": 21127 }, { "epoch": 2.6876987660602976, "ewc_loss": 0.07335913181304932, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003698218206409365, "grad_norm": 8.629295349121094, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8639289140701294, "num_tokens": 806242850.0, "step": 21128 }, { "epoch": 2.687825976338888, "ewc_loss": 0.07440148293972015, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003802453284151852, "grad_norm": 8.905701637268066, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8684923648834229, "num_tokens": 806279024.0, "step": 21129 }, { "epoch": 2.6879531866174786, "ewc_loss": 0.07337610423564911, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003699914668686688, "grad_norm": 8.712545394897461, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8508700728416443, "num_tokens": 806313669.0, "step": 21130 }, { "epoch": 2.688080396896069, "ewc_loss": 0.07395496964454651, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037578018964268267, "grad_norm": 8.85408878326416, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8694657683372498, "num_tokens": 806353128.0, "step": 21131 }, { "epoch": 2.6882076071746597, "ewc_loss": 0.07341214269399643, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000370351888705045, "grad_norm": 8.753702163696289, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8668098449707031, "num_tokens": 806385044.0, "step": 21132 }, { "epoch": 2.68833481745325, "ewc_loss": 0.07368733733892441, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003731038304977119, "grad_norm": 8.771142959594727, "learning_rate": 1e-06, "loss": 0.5054, "mean_token_accuracy": 0.8567994832992554, "num_tokens": 806422157.0, "step": 21133 }, { "epoch": 2.6884620277318407, "ewc_loss": 0.07352250069379807, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037145547685213387, "grad_norm": 8.688811302185059, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8659752011299133, "num_tokens": 806463180.0, "step": 21134 }, { "epoch": 2.6885892380104313, "ewc_loss": 0.0736405998468399, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037263642298057675, "grad_norm": 9.410940170288086, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8614667057991028, "num_tokens": 806502031.0, "step": 21135 }, { "epoch": 2.688716448289022, "ewc_loss": 0.07274811714887619, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000363711646059528, "grad_norm": 8.56911849975586, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8686680793762207, "num_tokens": 806539585.0, "step": 21136 }, { "epoch": 2.6888436585676123, "ewc_loss": 0.07459371536970139, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000382167607313022, "grad_norm": 8.975693702697754, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8839023113250732, "num_tokens": 806573147.0, "step": 21137 }, { "epoch": 2.688970868846203, "ewc_loss": 0.07279005646705627, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036413100315257907, "grad_norm": 8.549760818481445, "learning_rate": 1e-06, "loss": 0.4246, "mean_token_accuracy": 0.8777987957000732, "num_tokens": 806609785.0, "step": 21138 }, { "epoch": 2.6890980791247934, "ewc_loss": 0.0746288150548935, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003825185995083302, "grad_norm": 8.930691719055176, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8724498748779297, "num_tokens": 806647709.0, "step": 21139 }, { "epoch": 2.689225289403384, "ewc_loss": 0.07310711592435837, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036730163265019655, "grad_norm": 8.612167358398438, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8707497119903564, "num_tokens": 806684448.0, "step": 21140 }, { "epoch": 2.6893524996819744, "ewc_loss": 0.07436900585889816, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037992052966728806, "grad_norm": 8.874542236328125, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.870714545249939, "num_tokens": 806725326.0, "step": 21141 }, { "epoch": 2.689479709960565, "ewc_loss": 0.07340946793556213, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037032514228485525, "grad_norm": 8.678618431091309, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8645689487457275, "num_tokens": 806759797.0, "step": 21142 }, { "epoch": 2.6896069202391555, "ewc_loss": 0.07413686811923981, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037759917904622853, "grad_norm": 8.832473754882812, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8737083673477173, "num_tokens": 806799986.0, "step": 21143 }, { "epoch": 2.6897341305177456, "ewc_loss": 0.0735846683382988, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037207716377452016, "grad_norm": 8.728239059448242, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8616142272949219, "num_tokens": 806843055.0, "step": 21144 }, { "epoch": 2.6898613407963365, "ewc_loss": 0.07393257319927216, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037555614835582674, "grad_norm": 8.813648223876953, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8800528049468994, "num_tokens": 806875897.0, "step": 21145 }, { "epoch": 2.6899885510749266, "ewc_loss": 0.07359284162521362, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003721588582266122, "grad_norm": 8.716405868530273, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8738366961479187, "num_tokens": 806915583.0, "step": 21146 }, { "epoch": 2.6901157613535176, "ewc_loss": 0.07388842105865479, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037511473055928946, "grad_norm": 8.85939884185791, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8708149790763855, "num_tokens": 806953851.0, "step": 21147 }, { "epoch": 2.6902429716321077, "ewc_loss": 0.07351119816303253, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003713424375746399, "grad_norm": 8.633012771606445, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8621097803115845, "num_tokens": 806988519.0, "step": 21148 }, { "epoch": 2.6903701819106987, "ewc_loss": 0.07400261610746384, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037625664845108986, "grad_norm": 8.809168815612793, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8690947890281677, "num_tokens": 807029612.0, "step": 21149 }, { "epoch": 2.6904973921892887, "ewc_loss": 0.07344508916139603, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003706813440658152, "grad_norm": 8.719572067260742, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8509100079536438, "num_tokens": 807064009.0, "step": 21150 }, { "epoch": 2.6906246024678793, "ewc_loss": 0.07382196187973022, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037445005727931857, "grad_norm": 8.696961402893066, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8638961315155029, "num_tokens": 807104694.0, "step": 21151 }, { "epoch": 2.69075181274647, "ewc_loss": 0.07365783303976059, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000372808804968372, "grad_norm": 8.818887710571289, "learning_rate": 1e-06, "loss": 0.5423, "mean_token_accuracy": 0.8427684307098389, "num_tokens": 807142500.0, "step": 21152 }, { "epoch": 2.6908790230250603, "ewc_loss": 0.07357373833656311, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037196784978732467, "grad_norm": 8.688632011413574, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8622392416000366, "num_tokens": 807172123.0, "step": 21153 }, { "epoch": 2.691006233303651, "ewc_loss": 0.07399839162826538, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037621433148160577, "grad_norm": 8.752922058105469, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8682554960250854, "num_tokens": 807206422.0, "step": 21154 }, { "epoch": 2.6911334435822414, "ewc_loss": 0.07363680005073547, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003725984424818307, "grad_norm": 8.770612716674805, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8518803119659424, "num_tokens": 807240867.0, "step": 21155 }, { "epoch": 2.691260653860832, "ewc_loss": 0.07390165328979492, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003752470074687153, "grad_norm": 8.66907024383545, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8762943744659424, "num_tokens": 807275838.0, "step": 21156 }, { "epoch": 2.6913878641394224, "ewc_loss": 0.07385948300361633, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003748253220692277, "grad_norm": 8.71833324432373, "learning_rate": 1e-06, "loss": 0.4294, "mean_token_accuracy": 0.8739163875579834, "num_tokens": 807315164.0, "step": 21157 }, { "epoch": 2.691515074418013, "ewc_loss": 0.07380741089582443, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003743045963346958, "grad_norm": 8.700288772583008, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8584515452384949, "num_tokens": 807358016.0, "step": 21158 }, { "epoch": 2.6916422846966035, "ewc_loss": 0.07374103367328644, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003736407670658082, "grad_norm": 8.737208366394043, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8654707670211792, "num_tokens": 807402751.0, "step": 21159 }, { "epoch": 2.691769494975194, "ewc_loss": 0.07383204996585846, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000374551018467173, "grad_norm": 8.725984573364258, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8698311448097229, "num_tokens": 807442761.0, "step": 21160 }, { "epoch": 2.6918967052537845, "ewc_loss": 0.07375144213438034, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037374490057118237, "grad_norm": 8.752824783325195, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8723882436752319, "num_tokens": 807478550.0, "step": 21161 }, { "epoch": 2.692023915532375, "ewc_loss": 0.0737498551607132, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037372903898358345, "grad_norm": 8.725582122802734, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8751488924026489, "num_tokens": 807522756.0, "step": 21162 }, { "epoch": 2.6921511258109656, "ewc_loss": 0.07376714050769806, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003739018284250051, "grad_norm": 8.68494701385498, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8708308935165405, "num_tokens": 807561701.0, "step": 21163 }, { "epoch": 2.692278336089556, "ewc_loss": 0.07391422986984253, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003753727942239493, "grad_norm": 8.765031814575195, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8648412227630615, "num_tokens": 807604058.0, "step": 21164 }, { "epoch": 2.6924055463681467, "ewc_loss": 0.07360385358333588, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037226895801723003, "grad_norm": 8.733633995056152, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8721323013305664, "num_tokens": 807637344.0, "step": 21165 }, { "epoch": 2.692532756646737, "ewc_loss": 0.07385209202766418, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003747513401322067, "grad_norm": 8.690046310424805, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8584734201431274, "num_tokens": 807676867.0, "step": 21166 }, { "epoch": 2.6926599669253277, "ewc_loss": 0.07392770051956177, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037550742854364216, "grad_norm": 8.765737533569336, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8690850734710693, "num_tokens": 807707183.0, "step": 21167 }, { "epoch": 2.6927871772039182, "ewc_loss": 0.073746457695961, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037369501660577953, "grad_norm": 8.720507621765137, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.8563058376312256, "num_tokens": 807737617.0, "step": 21168 }, { "epoch": 2.6929143874825083, "ewc_loss": 0.07399005442857742, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037613100721500814, "grad_norm": 8.798395156860352, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.858784556388855, "num_tokens": 807774141.0, "step": 21169 }, { "epoch": 2.6930415977610993, "ewc_loss": 0.073702372610569, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037325420998968184, "grad_norm": 8.898609161376953, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8721126317977905, "num_tokens": 807813375.0, "step": 21170 }, { "epoch": 2.6931688080396894, "ewc_loss": 0.0736183375120163, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003724138077814132, "grad_norm": 8.739120483398438, "learning_rate": 1e-06, "loss": 0.5061, "mean_token_accuracy": 0.8524342775344849, "num_tokens": 807852317.0, "step": 21171 }, { "epoch": 2.6932960183182804, "ewc_loss": 0.07381607592105865, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037439126754179597, "grad_norm": 8.83033275604248, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8673566579818726, "num_tokens": 807895266.0, "step": 21172 }, { "epoch": 2.6934232285968704, "ewc_loss": 0.0735229104757309, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037145958049222827, "grad_norm": 8.701788902282715, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8676970601081848, "num_tokens": 807938600.0, "step": 21173 }, { "epoch": 2.693550438875461, "ewc_loss": 0.07386644184589386, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037489485112018883, "grad_norm": 8.82638168334961, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8858479261398315, "num_tokens": 807974329.0, "step": 21174 }, { "epoch": 2.6936776491540515, "ewc_loss": 0.07347895205020905, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037101993802934885, "grad_norm": 8.691943168640137, "learning_rate": 1e-06, "loss": 0.4276, "mean_token_accuracy": 0.8779937028884888, "num_tokens": 808015666.0, "step": 21175 }, { "epoch": 2.693804859432642, "ewc_loss": 0.07390640676021576, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037529453402385116, "grad_norm": 8.803576469421387, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8783232569694519, "num_tokens": 808050176.0, "step": 21176 }, { "epoch": 2.6939320697112326, "ewc_loss": 0.0734759047627449, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037098952452652156, "grad_norm": 8.740680694580078, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8587600588798523, "num_tokens": 808091910.0, "step": 21177 }, { "epoch": 2.694059279989823, "ewc_loss": 0.07386208325624466, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003748512826859951, "grad_norm": 8.819597244262695, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8627251982688904, "num_tokens": 808128646.0, "step": 21178 }, { "epoch": 2.6941864902684136, "ewc_loss": 0.0734664797782898, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037089528632350266, "grad_norm": 8.709766387939453, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8701116442680359, "num_tokens": 808166544.0, "step": 21179 }, { "epoch": 2.694313700547004, "ewc_loss": 0.07371494919061661, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003733799676410854, "grad_norm": 8.853254318237305, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8553637266159058, "num_tokens": 808200051.0, "step": 21180 }, { "epoch": 2.6944409108255947, "ewc_loss": 0.0734274834394455, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037050529499538243, "grad_norm": 8.776626586914062, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8682913184165955, "num_tokens": 808239251.0, "step": 21181 }, { "epoch": 2.694568121104185, "ewc_loss": 0.0736662894487381, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037289332249201834, "grad_norm": 8.744356155395508, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8615535497665405, "num_tokens": 808280639.0, "step": 21182 }, { "epoch": 2.6946953313827757, "ewc_loss": 0.07342009246349335, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003704314003698528, "grad_norm": 8.728033065795898, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.874363124370575, "num_tokens": 808316141.0, "step": 21183 }, { "epoch": 2.6948225416613663, "ewc_loss": 0.07365866005420685, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037281704135239124, "grad_norm": 8.782869338989258, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8650228977203369, "num_tokens": 808359172.0, "step": 21184 }, { "epoch": 2.694949751939957, "ewc_loss": 0.07349589467048645, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037118943873792887, "grad_norm": 8.733282089233398, "learning_rate": 1e-06, "loss": 0.5608, "mean_token_accuracy": 0.837620198726654, "num_tokens": 808397823.0, "step": 21185 }, { "epoch": 2.6950769622185473, "ewc_loss": 0.07372159510850906, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003734464116860181, "grad_norm": 8.80773639678955, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8759901523590088, "num_tokens": 808435355.0, "step": 21186 }, { "epoch": 2.695204172497138, "ewc_loss": 0.07348012179136276, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037103169597685337, "grad_norm": 8.76289176940918, "learning_rate": 1e-06, "loss": 0.4386, "mean_token_accuracy": 0.8726183176040649, "num_tokens": 808470792.0, "step": 21187 }, { "epoch": 2.6953313827757284, "ewc_loss": 0.07362881302833557, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037251863977871835, "grad_norm": 8.79261302947998, "learning_rate": 1e-06, "loss": 0.4072, "mean_token_accuracy": 0.8838831186294556, "num_tokens": 808504591.0, "step": 21188 }, { "epoch": 2.695458593054319, "ewc_loss": 0.0735711082816124, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003719415399245918, "grad_norm": 8.765484809875488, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8783116340637207, "num_tokens": 808537167.0, "step": 21189 }, { "epoch": 2.6955858033329094, "ewc_loss": 0.07362857460975647, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003725162532646209, "grad_norm": 8.74929428100586, "learning_rate": 1e-06, "loss": 0.4162, "mean_token_accuracy": 0.8818085193634033, "num_tokens": 808572055.0, "step": 21190 }, { "epoch": 2.6957130136115, "ewc_loss": 0.07366327196359634, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003728632000274956, "grad_norm": 8.720420837402344, "learning_rate": 1e-06, "loss": 0.4136, "mean_token_accuracy": 0.8793075084686279, "num_tokens": 808605239.0, "step": 21191 }, { "epoch": 2.6958402238900905, "ewc_loss": 0.07383190840482712, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037454956327565014, "grad_norm": 8.87368106842041, "learning_rate": 1e-06, "loss": 0.5309, "mean_token_accuracy": 0.843538761138916, "num_tokens": 808643227.0, "step": 21192 }, { "epoch": 2.695967434168681, "ewc_loss": 0.07336625456809998, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003698930668178946, "grad_norm": 8.611310005187988, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8594947457313538, "num_tokens": 808683855.0, "step": 21193 }, { "epoch": 2.696094644447271, "ewc_loss": 0.07415194809436798, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003777499950956553, "grad_norm": 8.764626502990723, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8724265098571777, "num_tokens": 808723467.0, "step": 21194 }, { "epoch": 2.696221854725862, "ewc_loss": 0.0734388679265976, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037061917828395963, "grad_norm": 8.724621772766113, "learning_rate": 1e-06, "loss": 0.5071, "mean_token_accuracy": 0.8546789288520813, "num_tokens": 808759805.0, "step": 21195 }, { "epoch": 2.696349065004452, "ewc_loss": 0.07401563972234726, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003763868589885533, "grad_norm": 8.81949520111084, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8503122925758362, "num_tokens": 808802024.0, "step": 21196 }, { "epoch": 2.696476275283043, "ewc_loss": 0.07355701923370361, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003718006773851812, "grad_norm": 8.725566864013672, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8567966222763062, "num_tokens": 808837086.0, "step": 21197 }, { "epoch": 2.696603485561633, "ewc_loss": 0.073906309902668, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003752935735974461, "grad_norm": 8.737872123718262, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.865900456905365, "num_tokens": 808875261.0, "step": 21198 }, { "epoch": 2.6967306958402237, "ewc_loss": 0.073793426156044, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003741646942216903, "grad_norm": 8.704767227172852, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.87291419506073, "num_tokens": 808913044.0, "step": 21199 }, { "epoch": 2.6968579061188143, "ewc_loss": 0.0738307386636734, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003745378344319761, "grad_norm": 8.769622802734375, "learning_rate": 1e-06, "loss": 0.4389, "mean_token_accuracy": 0.8749247789382935, "num_tokens": 808952374.0, "step": 21200 }, { "epoch": 2.696985116397405, "ewc_loss": 0.0737726166844368, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003739566309377551, "grad_norm": 8.725698471069336, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8666421175003052, "num_tokens": 808994397.0, "step": 21201 }, { "epoch": 2.6971123266759953, "ewc_loss": 0.07392625510692596, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037549305125139654, "grad_norm": 8.791810989379883, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8700544834136963, "num_tokens": 809036401.0, "step": 21202 }, { "epoch": 2.697239536954586, "ewc_loss": 0.07377102971076965, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037394079845398664, "grad_norm": 8.692934036254883, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.871637225151062, "num_tokens": 809073073.0, "step": 21203 }, { "epoch": 2.6973667472331764, "ewc_loss": 0.07402244955301285, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037645496195182204, "grad_norm": 8.760403633117676, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8662283420562744, "num_tokens": 809117348.0, "step": 21204 }, { "epoch": 2.697493957511767, "ewc_loss": 0.07378250360488892, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003740554675459862, "grad_norm": 8.740860939025879, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8802974820137024, "num_tokens": 809151357.0, "step": 21205 }, { "epoch": 2.6976211677903574, "ewc_loss": 0.07388998568058014, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037513027200475335, "grad_norm": 8.76225471496582, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8611605763435364, "num_tokens": 809189000.0, "step": 21206 }, { "epoch": 2.697748378068948, "ewc_loss": 0.07383869588375092, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037461737520061433, "grad_norm": 8.701160430908203, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8689363598823547, "num_tokens": 809227380.0, "step": 21207 }, { "epoch": 2.6978755883475385, "ewc_loss": 0.07404354959726334, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037666596472263336, "grad_norm": 8.775062561035156, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8663820028305054, "num_tokens": 809265131.0, "step": 21208 }, { "epoch": 2.698002798626129, "ewc_loss": 0.07375522702932358, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003737827355507761, "grad_norm": 8.730010986328125, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8652043342590332, "num_tokens": 809297190.0, "step": 21209 }, { "epoch": 2.6981300089047195, "ewc_loss": 0.0739893913269043, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003761244297493249, "grad_norm": 8.74133014678955, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8742542266845703, "num_tokens": 809336774.0, "step": 21210 }, { "epoch": 2.69825721918331, "ewc_loss": 0.07380731403827667, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037430363590829074, "grad_norm": 8.722729682922363, "learning_rate": 1e-06, "loss": 0.5117, "mean_token_accuracy": 0.8489236831665039, "num_tokens": 809376612.0, "step": 21211 }, { "epoch": 2.6983844294619006, "ewc_loss": 0.07394863665103912, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003757168014999479, "grad_norm": 8.72443675994873, "learning_rate": 1e-06, "loss": 0.3834, "mean_token_accuracy": 0.8896706104278564, "num_tokens": 809416058.0, "step": 21212 }, { "epoch": 2.698511639740491, "ewc_loss": 0.07391376048326492, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003753680794034153, "grad_norm": 8.741618156433105, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8788479566574097, "num_tokens": 809449580.0, "step": 21213 }, { "epoch": 2.6986388500190817, "ewc_loss": 0.07395558059215546, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003757863014470786, "grad_norm": 8.78188419342041, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8693360090255737, "num_tokens": 809484983.0, "step": 21214 }, { "epoch": 2.698766060297672, "ewc_loss": 0.07398197799921036, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003760502440854907, "grad_norm": 8.705655097961426, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8703613877296448, "num_tokens": 809521086.0, "step": 21215 }, { "epoch": 2.6988932705762627, "ewc_loss": 0.07406194508075714, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003768499300349504, "grad_norm": 8.758065223693848, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8649140000343323, "num_tokens": 809562475.0, "step": 21216 }, { "epoch": 2.699020480854853, "ewc_loss": 0.07423523813486099, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037614142638631165, "grad_norm": 8.776577949523926, "learning_rate": 1e-06, "loss": 0.5076, "mean_token_accuracy": 0.8532440662384033, "num_tokens": 809596686.0, "step": 21217 }, { "epoch": 2.6991476911334438, "ewc_loss": 0.07398897409439087, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037612023879773915, "grad_norm": 8.751209259033203, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8689681887626648, "num_tokens": 809636614.0, "step": 21218 }, { "epoch": 2.699274901412034, "ewc_loss": 0.07396900653839111, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037592058652080595, "grad_norm": 8.849865913391113, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.8547937273979187, "num_tokens": 809674542.0, "step": 21219 }, { "epoch": 2.699402111690625, "ewc_loss": 0.07369142770767212, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037314469227567315, "grad_norm": 8.665058135986328, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8838536739349365, "num_tokens": 809716726.0, "step": 21220 }, { "epoch": 2.699529321969215, "ewc_loss": 0.07417729496955872, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000378003460355103, "grad_norm": 8.827160835266113, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8764827847480774, "num_tokens": 809756790.0, "step": 21221 }, { "epoch": 2.699656532247806, "ewc_loss": 0.07360519468784332, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037228240398690104, "grad_norm": 8.65928840637207, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8639692068099976, "num_tokens": 809794616.0, "step": 21222 }, { "epoch": 2.699783742526396, "ewc_loss": 0.07431981712579727, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003794286458287388, "grad_norm": 8.766824722290039, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8733834624290466, "num_tokens": 809836662.0, "step": 21223 }, { "epoch": 2.6999109528049865, "ewc_loss": 0.07388933002948761, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037512375274673104, "grad_norm": 8.72231674194336, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8632736206054688, "num_tokens": 809873812.0, "step": 21224 }, { "epoch": 2.700038163083577, "ewc_loss": 0.0741642564535141, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003778729878831655, "grad_norm": 8.768580436706543, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.866844892501831, "num_tokens": 809913139.0, "step": 21225 }, { "epoch": 2.7001653733621676, "ewc_loss": 0.0739343911409378, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037557442556135356, "grad_norm": 8.752594947814941, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8682012557983398, "num_tokens": 809947741.0, "step": 21226 }, { "epoch": 2.700292583640758, "ewc_loss": 0.0739673301577568, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037590376450680196, "grad_norm": 8.68617057800293, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8578224182128906, "num_tokens": 809987120.0, "step": 21227 }, { "epoch": 2.7004197939193486, "ewc_loss": 0.0741538256406784, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037776867975480855, "grad_norm": 8.856827735900879, "learning_rate": 1e-06, "loss": 0.4875, "mean_token_accuracy": 0.8588094711303711, "num_tokens": 810026972.0, "step": 21228 }, { "epoch": 2.700547004197939, "ewc_loss": 0.07368550449609756, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003730855241883546, "grad_norm": 8.647360801696777, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8655422925949097, "num_tokens": 810062767.0, "step": 21229 }, { "epoch": 2.7006742144765297, "ewc_loss": 0.07456529140472412, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000381883408408612, "grad_norm": 8.95290470123291, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8748148083686829, "num_tokens": 810096930.0, "step": 21230 }, { "epoch": 2.70080142475512, "ewc_loss": 0.07358843088150024, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003721147950273007, "grad_norm": 8.672018051147461, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8527045249938965, "num_tokens": 810134600.0, "step": 21231 }, { "epoch": 2.7009286350337107, "ewc_loss": 0.07474707067012787, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038370120455510914, "grad_norm": 8.934196472167969, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8689789772033691, "num_tokens": 810169535.0, "step": 21232 }, { "epoch": 2.7010558453123013, "ewc_loss": 0.07348404824733734, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003710709570441395, "grad_norm": 8.67512321472168, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8553634881973267, "num_tokens": 810205288.0, "step": 21233 }, { "epoch": 2.701183055590892, "ewc_loss": 0.07476947456598282, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003839252167381346, "grad_norm": 8.9988374710083, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8637998700141907, "num_tokens": 810242868.0, "step": 21234 }, { "epoch": 2.7013102658694823, "ewc_loss": 0.07343664765357971, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003705969429574907, "grad_norm": 8.576579093933105, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8543486595153809, "num_tokens": 810286539.0, "step": 21235 }, { "epoch": 2.701437476148073, "ewc_loss": 0.07487707585096359, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000385001243557781, "grad_norm": 9.031682014465332, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8693605661392212, "num_tokens": 810331598.0, "step": 21236 }, { "epoch": 2.7015646864266634, "ewc_loss": 0.0736057311296463, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000369846384273842, "grad_norm": 8.626479148864746, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.874975323677063, "num_tokens": 810369501.0, "step": 21237 }, { "epoch": 2.701691896705254, "ewc_loss": 0.0747581496834755, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038381197373382747, "grad_norm": 8.916138648986816, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8634862303733826, "num_tokens": 810410727.0, "step": 21238 }, { "epoch": 2.7018191069838444, "ewc_loss": 0.07360969483852386, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037232745671644807, "grad_norm": 8.718172073364258, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8584327101707458, "num_tokens": 810450315.0, "step": 21239 }, { "epoch": 2.701946317262435, "ewc_loss": 0.07447899878025055, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038102042162790895, "grad_norm": 8.776870727539062, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8573411703109741, "num_tokens": 810493972.0, "step": 21240 }, { "epoch": 2.7020735275410255, "ewc_loss": 0.07387455552816391, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037497602170333266, "grad_norm": 8.714780807495117, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.878516435623169, "num_tokens": 810526038.0, "step": 21241 }, { "epoch": 2.7022007378196156, "ewc_loss": 0.07428251951932907, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003790556511376053, "grad_norm": 8.891688346862793, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8706164360046387, "num_tokens": 810560772.0, "step": 21242 }, { "epoch": 2.7023279480982065, "ewc_loss": 0.07371982932090759, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037342877476476133, "grad_norm": 8.661065101623535, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8729816675186157, "num_tokens": 810598322.0, "step": 21243 }, { "epoch": 2.7024551583767966, "ewc_loss": 0.0744629055261612, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003808595356531441, "grad_norm": 8.882193565368652, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8866319060325623, "num_tokens": 810638652.0, "step": 21244 }, { "epoch": 2.7025823686553876, "ewc_loss": 0.07371088117361069, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037333928048610687, "grad_norm": 8.669696807861328, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8643984198570251, "num_tokens": 810680318.0, "step": 21245 }, { "epoch": 2.7027095789339777, "ewc_loss": 0.07424155622720718, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003786460147239268, "grad_norm": 8.780930519104004, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8697500824928284, "num_tokens": 810715001.0, "step": 21246 }, { "epoch": 2.7028367892125686, "ewc_loss": 0.07389994710683823, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003752299235202372, "grad_norm": 8.806321144104004, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8574143648147583, "num_tokens": 810748897.0, "step": 21247 }, { "epoch": 2.7029639994911587, "ewc_loss": 0.07402943074703217, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037652478204108775, "grad_norm": 8.756375312805176, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8608306646347046, "num_tokens": 810788954.0, "step": 21248 }, { "epoch": 2.7030912097697493, "ewc_loss": 0.07409035414457321, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037713401252403855, "grad_norm": 8.733222007751465, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8693954944610596, "num_tokens": 810828897.0, "step": 21249 }, { "epoch": 2.70321842004834, "ewc_loss": 0.07398689538240433, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003760994295589626, "grad_norm": 8.722734451293945, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8840513825416565, "num_tokens": 810866698.0, "step": 21250 }, { "epoch": 2.7033456303269303, "ewc_loss": 0.07409379631280899, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037716844235546887, "grad_norm": 8.790474891662598, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8589798808097839, "num_tokens": 810903140.0, "step": 21251 }, { "epoch": 2.703472840605521, "ewc_loss": 0.07389368116855621, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003751672920770943, "grad_norm": 8.691814422607422, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8694144487380981, "num_tokens": 810940664.0, "step": 21252 }, { "epoch": 2.7036000508841114, "ewc_loss": 0.07426275312900543, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003788579779211432, "grad_norm": 8.793927192687988, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8642255067825317, "num_tokens": 810981818.0, "step": 21253 }, { "epoch": 2.703727261162702, "ewc_loss": 0.07391895353794098, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003754200297407806, "grad_norm": 8.69166374206543, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8662272691726685, "num_tokens": 811019897.0, "step": 21254 }, { "epoch": 2.7038544714412924, "ewc_loss": 0.07443797588348389, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038061023224145174, "grad_norm": 8.843897819519043, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8697784543037415, "num_tokens": 811056310.0, "step": 21255 }, { "epoch": 2.703981681719883, "ewc_loss": 0.07379559427499771, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000374186405679211, "grad_norm": 8.755008697509766, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.856844425201416, "num_tokens": 811091505.0, "step": 21256 }, { "epoch": 2.7041088919984735, "ewc_loss": 0.07429748773574829, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037920536124147475, "grad_norm": 8.766789436340332, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8794600963592529, "num_tokens": 811134861.0, "step": 21257 }, { "epoch": 2.704236102277064, "ewc_loss": 0.07388348877429962, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037506537046283484, "grad_norm": 8.745966911315918, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8566396832466125, "num_tokens": 811170001.0, "step": 21258 }, { "epoch": 2.7043633125556545, "ewc_loss": 0.07412348687648773, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000377465330529958, "grad_norm": 8.776344299316406, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8715284466743469, "num_tokens": 811204203.0, "step": 21259 }, { "epoch": 2.704490522834245, "ewc_loss": 0.07405048608779907, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003767353482544422, "grad_norm": 8.72195816040039, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.869745135307312, "num_tokens": 811244598.0, "step": 21260 }, { "epoch": 2.7046177331128356, "ewc_loss": 0.07411174476146698, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037734792567789555, "grad_norm": 8.734830856323242, "learning_rate": 1e-06, "loss": 0.4325, "mean_token_accuracy": 0.8769298791885376, "num_tokens": 811283352.0, "step": 21261 }, { "epoch": 2.704744943391426, "ewc_loss": 0.07413462549448013, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037757671088911593, "grad_norm": 8.791418075561523, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8544666171073914, "num_tokens": 811317605.0, "step": 21262 }, { "epoch": 2.7048721536700167, "ewc_loss": 0.0739501416683197, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003757319354917854, "grad_norm": 8.708560943603516, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.881315290927887, "num_tokens": 811352117.0, "step": 21263 }, { "epoch": 2.704999363948607, "ewc_loss": 0.07421234250068665, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037835384136997163, "grad_norm": 8.76253890991211, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.86011803150177, "num_tokens": 811386118.0, "step": 21264 }, { "epoch": 2.7051265742271977, "ewc_loss": 0.07400578260421753, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003762883134186268, "grad_norm": 8.740262985229492, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8758020997047424, "num_tokens": 811424479.0, "step": 21265 }, { "epoch": 2.7052537845057882, "ewc_loss": 0.07407931983470917, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037702362169511616, "grad_norm": 8.755650520324707, "learning_rate": 1e-06, "loss": 0.5002, "mean_token_accuracy": 0.8542394638061523, "num_tokens": 811464683.0, "step": 21266 }, { "epoch": 2.7053809947843783, "ewc_loss": 0.07407963275909424, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003770267649088055, "grad_norm": 8.70151424407959, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8827700018882751, "num_tokens": 811504801.0, "step": 21267 }, { "epoch": 2.7055082050629693, "ewc_loss": 0.07408536225557327, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037708409945480525, "grad_norm": 8.758461952209473, "learning_rate": 1e-06, "loss": 0.4891, "mean_token_accuracy": 0.857826828956604, "num_tokens": 811546278.0, "step": 21268 }, { "epoch": 2.7056354153415594, "ewc_loss": 0.07401403784751892, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037637079367414117, "grad_norm": 8.741073608398438, "learning_rate": 1e-06, "loss": 0.4988, "mean_token_accuracy": 0.8528305888175964, "num_tokens": 811587019.0, "step": 21269 }, { "epoch": 2.7057626256201504, "ewc_loss": 0.07414686679840088, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037769917980767787, "grad_norm": 8.777009010314941, "learning_rate": 1e-06, "loss": 0.5327, "mean_token_accuracy": 0.8425019979476929, "num_tokens": 811619843.0, "step": 21270 }, { "epoch": 2.7058898358987404, "ewc_loss": 0.07426241040229797, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037641319795511663, "grad_norm": 8.780001640319824, "learning_rate": 1e-06, "loss": 0.4396, "mean_token_accuracy": 0.8723717927932739, "num_tokens": 811655797.0, "step": 21271 }, { "epoch": 2.706017046177331, "ewc_loss": 0.07423676550388336, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003785981680266559, "grad_norm": 8.797368049621582, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8567238450050354, "num_tokens": 811691626.0, "step": 21272 }, { "epoch": 2.7061442564559215, "ewc_loss": 0.07393287122249603, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003755592042580247, "grad_norm": 8.697982788085938, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8687394857406616, "num_tokens": 811726046.0, "step": 21273 }, { "epoch": 2.706271466734512, "ewc_loss": 0.07430081069469452, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037923859781585634, "grad_norm": 8.827622413635254, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8573348522186279, "num_tokens": 811765686.0, "step": 21274 }, { "epoch": 2.7063986770131025, "ewc_loss": 0.07390090823173523, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003752395568881184, "grad_norm": 8.759378433227539, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8727033138275146, "num_tokens": 811801143.0, "step": 21275 }, { "epoch": 2.706525887291693, "ewc_loss": 0.07411309331655502, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000377361400751397, "grad_norm": 8.810858726501465, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.873992383480072, "num_tokens": 811834807.0, "step": 21276 }, { "epoch": 2.7066530975702836, "ewc_loss": 0.07403998076915741, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037418887950479984, "grad_norm": 8.763126373291016, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.884729266166687, "num_tokens": 811865676.0, "step": 21277 }, { "epoch": 2.706780307848874, "ewc_loss": 0.07426978647708893, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003764868888538331, "grad_norm": 8.814579010009766, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8629047870635986, "num_tokens": 811901259.0, "step": 21278 }, { "epoch": 2.7069075181274647, "ewc_loss": 0.07401520758867264, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003739411476999521, "grad_norm": 8.695393562316895, "learning_rate": 1e-06, "loss": 0.5155, "mean_token_accuracy": 0.8521825671195984, "num_tokens": 811937768.0, "step": 21279 }, { "epoch": 2.707034728406055, "ewc_loss": 0.07414913922548294, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037772185169160366, "grad_norm": 8.827972412109375, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8658770322799683, "num_tokens": 811967805.0, "step": 21280 }, { "epoch": 2.7071619386846457, "ewc_loss": 0.07375660538673401, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037379650166258216, "grad_norm": 8.673087120056152, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8623684048652649, "num_tokens": 812007468.0, "step": 21281 }, { "epoch": 2.7072891489632362, "ewc_loss": 0.07426629960536957, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003788934845943004, "grad_norm": 8.778729438781738, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8676499128341675, "num_tokens": 812048127.0, "step": 21282 }, { "epoch": 2.7074163592418268, "ewc_loss": 0.07384496927261353, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037468012305907905, "grad_norm": 8.742025375366211, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8663427233695984, "num_tokens": 812083904.0, "step": 21283 }, { "epoch": 2.7075435695204173, "ewc_loss": 0.07409977912902832, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003772282798308879, "grad_norm": 8.757851600646973, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8701266646385193, "num_tokens": 812119643.0, "step": 21284 }, { "epoch": 2.707670779799008, "ewc_loss": 0.07383055984973907, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003745360590983182, "grad_norm": 8.739557266235352, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8647319078445435, "num_tokens": 812155338.0, "step": 21285 }, { "epoch": 2.7077979900775984, "ewc_loss": 0.07402318716049194, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037646235432475805, "grad_norm": 8.780561447143555, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8665971755981445, "num_tokens": 812193075.0, "step": 21286 }, { "epoch": 2.707925200356189, "ewc_loss": 0.07380077242851257, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037423823960125446, "grad_norm": 8.741289138793945, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.8754931688308716, "num_tokens": 812228163.0, "step": 21287 }, { "epoch": 2.7080524106347794, "ewc_loss": 0.07418462634086609, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037563536898232996, "grad_norm": 8.812381744384766, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.86086106300354, "num_tokens": 812268565.0, "step": 21288 }, { "epoch": 2.70817962091337, "ewc_loss": 0.07413837313652039, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003751727635972202, "grad_norm": 8.760498046875, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8718048334121704, "num_tokens": 812307292.0, "step": 21289 }, { "epoch": 2.7083068311919605, "ewc_loss": 0.07417874038219452, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000375576491933316, "grad_norm": 8.761523246765137, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8763493299484253, "num_tokens": 812346180.0, "step": 21290 }, { "epoch": 2.708434041470551, "ewc_loss": 0.07382434606552124, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037447389331646264, "grad_norm": 8.78690242767334, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8630588054656982, "num_tokens": 812376033.0, "step": 21291 }, { "epoch": 2.708561251749141, "ewc_loss": 0.0737842321395874, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003740728134289384, "grad_norm": 8.73314094543457, "learning_rate": 1e-06, "loss": 0.4073, "mean_token_accuracy": 0.8842753171920776, "num_tokens": 812407925.0, "step": 21292 }, { "epoch": 2.708688462027732, "ewc_loss": 0.07391573488712311, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037538778269663453, "grad_norm": 8.825055122375488, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8711682558059692, "num_tokens": 812452354.0, "step": 21293 }, { "epoch": 2.708815672306322, "ewc_loss": 0.07392068952322006, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003729959425982088, "grad_norm": 8.788893699645996, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8651450276374817, "num_tokens": 812490015.0, "step": 21294 }, { "epoch": 2.708942882584913, "ewc_loss": 0.07380060106515884, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003742364642675966, "grad_norm": 8.77078628540039, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8633202910423279, "num_tokens": 812529919.0, "step": 21295 }, { "epoch": 2.709070092863503, "ewc_loss": 0.0737466961145401, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000373697403119877, "grad_norm": 8.8147611618042, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8794472217559814, "num_tokens": 812568541.0, "step": 21296 }, { "epoch": 2.7091973031420937, "ewc_loss": 0.07363247871398926, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003725552523974329, "grad_norm": 8.714550018310547, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8581815361976624, "num_tokens": 812612098.0, "step": 21297 }, { "epoch": 2.7093245134206843, "ewc_loss": 0.07370688766241074, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037329935003072023, "grad_norm": 8.750896453857422, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8620237112045288, "num_tokens": 812648843.0, "step": 21298 }, { "epoch": 2.709451723699275, "ewc_loss": 0.07373778522014618, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003736083453986794, "grad_norm": 8.712223052978516, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8680800795555115, "num_tokens": 812692080.0, "step": 21299 }, { "epoch": 2.7095789339778653, "ewc_loss": 0.07385426759719849, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003747731971088797, "grad_norm": 8.767762184143066, "learning_rate": 1e-06, "loss": 0.4282, "mean_token_accuracy": 0.8806130886077881, "num_tokens": 812729441.0, "step": 21300 }, { "epoch": 2.709706144256456, "ewc_loss": 0.07377558946609497, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037398640415631235, "grad_norm": 8.749671936035156, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.860753059387207, "num_tokens": 812768908.0, "step": 21301 }, { "epoch": 2.7098333545350464, "ewc_loss": 0.07395428419113159, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037577335024252534, "grad_norm": 8.790051460266113, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8650898933410645, "num_tokens": 812808384.0, "step": 21302 }, { "epoch": 2.709960564813637, "ewc_loss": 0.07372218370437622, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003734523488674313, "grad_norm": 8.735397338867188, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8588967323303223, "num_tokens": 812847658.0, "step": 21303 }, { "epoch": 2.7100877750922274, "ewc_loss": 0.07399870455265045, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037621756200678647, "grad_norm": 8.82453441619873, "learning_rate": 1e-06, "loss": 0.4524, "mean_token_accuracy": 0.867422342300415, "num_tokens": 812889394.0, "step": 21304 }, { "epoch": 2.710214985370818, "ewc_loss": 0.07367601990699768, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003729907039087266, "grad_norm": 8.735518455505371, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8524616956710815, "num_tokens": 812924202.0, "step": 21305 }, { "epoch": 2.7103421956494085, "ewc_loss": 0.0740046501159668, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003762769920285791, "grad_norm": 8.868959426879883, "learning_rate": 1e-06, "loss": 0.3978, "mean_token_accuracy": 0.8851317167282104, "num_tokens": 812954391.0, "step": 21306 }, { "epoch": 2.710469405927999, "ewc_loss": 0.07351556420326233, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000371386093320325, "grad_norm": 8.69973087310791, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8565757870674133, "num_tokens": 812989498.0, "step": 21307 }, { "epoch": 2.7105966162065895, "ewc_loss": 0.07418636232614517, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003780940896831453, "grad_norm": 8.854759216308594, "learning_rate": 1e-06, "loss": 0.4379, "mean_token_accuracy": 0.8727962970733643, "num_tokens": 813030968.0, "step": 21308 }, { "epoch": 2.71072382648518, "ewc_loss": 0.0734727680683136, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003709581505972892, "grad_norm": 8.631860733032227, "learning_rate": 1e-06, "loss": 0.3904, "mean_token_accuracy": 0.8898640275001526, "num_tokens": 813065843.0, "step": 21309 }, { "epoch": 2.7108510367637706, "ewc_loss": 0.07423542439937592, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000378584663849324, "grad_norm": 8.792424201965332, "learning_rate": 1e-06, "loss": 0.4428, "mean_token_accuracy": 0.8714384436607361, "num_tokens": 813107410.0, "step": 21310 }, { "epoch": 2.710978247042361, "ewc_loss": 0.07361658662557602, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003723963163793087, "grad_norm": 8.631430625915527, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8740718364715576, "num_tokens": 813155959.0, "step": 21311 }, { "epoch": 2.7111054573209517, "ewc_loss": 0.07428069412708282, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037903746124356985, "grad_norm": 8.843685150146484, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8733648657798767, "num_tokens": 813194133.0, "step": 21312 }, { "epoch": 2.711232667599542, "ewc_loss": 0.07357755303382874, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003720060340128839, "grad_norm": 8.657346725463867, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8549401760101318, "num_tokens": 813231015.0, "step": 21313 }, { "epoch": 2.7113598778781327, "ewc_loss": 0.07440997660160065, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003803302824962884, "grad_norm": 8.825318336486816, "learning_rate": 1e-06, "loss": 0.4047, "mean_token_accuracy": 0.8820028901100159, "num_tokens": 813271497.0, "step": 21314 }, { "epoch": 2.711487088156723, "ewc_loss": 0.0737428069114685, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037365849129855633, "grad_norm": 8.664816856384277, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.867396891117096, "num_tokens": 813305758.0, "step": 21315 }, { "epoch": 2.7116142984353138, "ewc_loss": 0.07436935603618622, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003799240803346038, "grad_norm": 8.807374000549316, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8647477030754089, "num_tokens": 813345958.0, "step": 21316 }, { "epoch": 2.711741508713904, "ewc_loss": 0.07390530407428741, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037528350367210805, "grad_norm": 8.731623649597168, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8694630861282349, "num_tokens": 813381630.0, "step": 21317 }, { "epoch": 2.711868718992495, "ewc_loss": 0.07425443828105927, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037877479917369783, "grad_norm": 8.783730506896973, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8661399483680725, "num_tokens": 813421320.0, "step": 21318 }, { "epoch": 2.711995929271085, "ewc_loss": 0.07400635629892349, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037629404687322676, "grad_norm": 8.74578857421875, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8625627756118774, "num_tokens": 813458323.0, "step": 21319 }, { "epoch": 2.712123139549676, "ewc_loss": 0.07397964596748352, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037602687370963395, "grad_norm": 8.716623306274414, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.8486783504486084, "num_tokens": 813501436.0, "step": 21320 }, { "epoch": 2.712250349828266, "ewc_loss": 0.07408730685710907, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037710348260588944, "grad_norm": 8.686546325683594, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8677241802215576, "num_tokens": 813545942.0, "step": 21321 }, { "epoch": 2.7123775601068565, "ewc_loss": 0.07409948110580444, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003772253112401813, "grad_norm": 8.818902969360352, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8606057167053223, "num_tokens": 813582202.0, "step": 21322 }, { "epoch": 2.712504770385447, "ewc_loss": 0.07386603951454163, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003748908347915858, "grad_norm": 8.638346672058105, "learning_rate": 1e-06, "loss": 0.5089, "mean_token_accuracy": 0.8519047498703003, "num_tokens": 813621711.0, "step": 21323 }, { "epoch": 2.7126319806640375, "ewc_loss": 0.07438269257545471, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003800574049819261, "grad_norm": 8.76901626586914, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8607380986213684, "num_tokens": 813665404.0, "step": 21324 }, { "epoch": 2.712759190942628, "ewc_loss": 0.07385550439357758, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037478553713299334, "grad_norm": 8.67723560333252, "learning_rate": 1e-06, "loss": 0.4341, "mean_token_accuracy": 0.8731861114501953, "num_tokens": 813697470.0, "step": 21325 }, { "epoch": 2.7128864012212186, "ewc_loss": 0.07471684366464615, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003809574991464615, "grad_norm": 8.842168807983398, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8754136562347412, "num_tokens": 813733308.0, "step": 21326 }, { "epoch": 2.713013611499809, "ewc_loss": 0.07375016063451767, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037373206578195095, "grad_norm": 8.66391372680664, "learning_rate": 1e-06, "loss": 0.5041, "mean_token_accuracy": 0.8540189266204834, "num_tokens": 813770881.0, "step": 21327 }, { "epoch": 2.7131408217783997, "ewc_loss": 0.07448389381170273, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038106940337456763, "grad_norm": 8.841203689575195, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8720232248306274, "num_tokens": 813806495.0, "step": 21328 }, { "epoch": 2.71326803205699, "ewc_loss": 0.07375034689903259, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003737339866347611, "grad_norm": 8.66793155670166, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8714378476142883, "num_tokens": 813842408.0, "step": 21329 }, { "epoch": 2.7133952423355807, "ewc_loss": 0.07434231787919998, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003796536475419998, "grad_norm": 8.80196762084961, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8632797598838806, "num_tokens": 813883041.0, "step": 21330 }, { "epoch": 2.7135224526141712, "ewc_loss": 0.07377889752388, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003740194661077112, "grad_norm": 8.662155151367188, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8631221055984497, "num_tokens": 813922901.0, "step": 21331 }, { "epoch": 2.7136496628927618, "ewc_loss": 0.07437915354967117, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003800220147240907, "grad_norm": 8.792770385742188, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8687113523483276, "num_tokens": 813961948.0, "step": 21332 }, { "epoch": 2.7137768731713523, "ewc_loss": 0.07375818490982056, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003738122759386897, "grad_norm": 8.661555290222168, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.881253182888031, "num_tokens": 813999194.0, "step": 21333 }, { "epoch": 2.713904083449943, "ewc_loss": 0.07455772906541824, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038180776755325496, "grad_norm": 8.8223876953125, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.8569092750549316, "num_tokens": 814038986.0, "step": 21334 }, { "epoch": 2.7140312937285334, "ewc_loss": 0.07382170855998993, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003744475543498993, "grad_norm": 8.649881362915039, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8814266324043274, "num_tokens": 814074351.0, "step": 21335 }, { "epoch": 2.714158504007124, "ewc_loss": 0.07448673248291016, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038109783781692386, "grad_norm": 8.849420547485352, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8677825331687927, "num_tokens": 814109255.0, "step": 21336 }, { "epoch": 2.7142857142857144, "ewc_loss": 0.07381206005811691, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003743510751519352, "grad_norm": 8.686734199523926, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8620926737785339, "num_tokens": 814146601.0, "step": 21337 }, { "epoch": 2.714412924564305, "ewc_loss": 0.07449072599411011, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003811377682723105, "grad_norm": 8.824463844299316, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.872322678565979, "num_tokens": 814181299.0, "step": 21338 }, { "epoch": 2.7145401348428955, "ewc_loss": 0.07382167875766754, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003744472051039338, "grad_norm": 8.680876731872559, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8765217661857605, "num_tokens": 814218178.0, "step": 21339 }, { "epoch": 2.7146673451214856, "ewc_loss": 0.07434152811765671, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037964576040394604, "grad_norm": 8.782783508300781, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8692627549171448, "num_tokens": 814254161.0, "step": 21340 }, { "epoch": 2.7147945554000765, "ewc_loss": 0.07400958240032196, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003763263230212033, "grad_norm": 8.71835994720459, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8665516376495361, "num_tokens": 814290164.0, "step": 21341 }, { "epoch": 2.7149217656786666, "ewc_loss": 0.07419541478157043, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003781846025958657, "grad_norm": 8.786467552185059, "learning_rate": 1e-06, "loss": 0.5294, "mean_token_accuracy": 0.8467624187469482, "num_tokens": 814334459.0, "step": 21342 }, { "epoch": 2.7150489759572576, "ewc_loss": 0.07395508885383606, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003757813828997314, "grad_norm": 8.710561752319336, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8585770130157471, "num_tokens": 814374291.0, "step": 21343 }, { "epoch": 2.7151761862358477, "ewc_loss": 0.07406766712665558, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003769071481656283, "grad_norm": 8.719501495361328, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.862978458404541, "num_tokens": 814413196.0, "step": 21344 }, { "epoch": 2.7153033965144386, "ewc_loss": 0.07407617568969727, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003769922477658838, "grad_norm": 8.78294849395752, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8645393252372742, "num_tokens": 814455312.0, "step": 21345 }, { "epoch": 2.7154306067930287, "ewc_loss": 0.0739489495754242, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037572000292129815, "grad_norm": 8.657787322998047, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8630359172821045, "num_tokens": 814496586.0, "step": 21346 }, { "epoch": 2.7155578170716193, "ewc_loss": 0.07422872632741928, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003785177250392735, "grad_norm": 8.765411376953125, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8735259175300598, "num_tokens": 814531566.0, "step": 21347 }, { "epoch": 2.71568502735021, "ewc_loss": 0.07392697036266327, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003755001234821975, "grad_norm": 8.769915580749512, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8736684918403625, "num_tokens": 814566168.0, "step": 21348 }, { "epoch": 2.7158122376288003, "ewc_loss": 0.07402320206165314, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037646249984391034, "grad_norm": 8.72724723815918, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8673003911972046, "num_tokens": 814604927.0, "step": 21349 }, { "epoch": 2.715939447907391, "ewc_loss": 0.07409107685089111, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003771412593778223, "grad_norm": 8.732254981994629, "learning_rate": 1e-06, "loss": 0.4165, "mean_token_accuracy": 0.8818425536155701, "num_tokens": 814641401.0, "step": 21350 }, { "epoch": 2.7160666581859814, "ewc_loss": 0.07401260733604431, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003763565036933869, "grad_norm": 8.708394050598145, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.872984766960144, "num_tokens": 814674252.0, "step": 21351 }, { "epoch": 2.716193868464572, "ewc_loss": 0.07423144578933716, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037854493712075055, "grad_norm": 8.799724578857422, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8662059307098389, "num_tokens": 814704965.0, "step": 21352 }, { "epoch": 2.7163210787431624, "ewc_loss": 0.07396093010902405, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003758397651836276, "grad_norm": 8.721696853637695, "learning_rate": 1e-06, "loss": 0.5501, "mean_token_accuracy": 0.8374664783477783, "num_tokens": 814743740.0, "step": 21353 }, { "epoch": 2.716448289021753, "ewc_loss": 0.07427096366882324, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037894013803452253, "grad_norm": 8.765241622924805, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8731676340103149, "num_tokens": 814784598.0, "step": 21354 }, { "epoch": 2.7165754993003435, "ewc_loss": 0.07407747209072113, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003770052280742675, "grad_norm": 8.706750869750977, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8611863255500793, "num_tokens": 814821437.0, "step": 21355 }, { "epoch": 2.716702709578934, "ewc_loss": 0.07425078749656677, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003787383029703051, "grad_norm": 8.795260429382324, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8647679090499878, "num_tokens": 814861638.0, "step": 21356 }, { "epoch": 2.7168299198575245, "ewc_loss": 0.07401489466428757, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037637940840795636, "grad_norm": 8.735587120056152, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8673744201660156, "num_tokens": 814893799.0, "step": 21357 }, { "epoch": 2.716957130136115, "ewc_loss": 0.07419485598802567, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003781790437642485, "grad_norm": 8.754170417785645, "learning_rate": 1e-06, "loss": 0.4217, "mean_token_accuracy": 0.8755491971969604, "num_tokens": 814930726.0, "step": 21358 }, { "epoch": 2.7170843404147056, "ewc_loss": 0.07407434284687042, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003769739414565265, "grad_norm": 8.754278182983398, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8660407066345215, "num_tokens": 814970041.0, "step": 21359 }, { "epoch": 2.717211550693296, "ewc_loss": 0.07405231893062592, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037675362545996904, "grad_norm": 8.773452758789062, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8698833584785461, "num_tokens": 815008587.0, "step": 21360 }, { "epoch": 2.7173387609718866, "ewc_loss": 0.07396925985813141, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003759230603463948, "grad_norm": 8.771899223327637, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8534772396087646, "num_tokens": 815047659.0, "step": 21361 }, { "epoch": 2.717465971250477, "ewc_loss": 0.07400248944759369, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037625536788254976, "grad_norm": 8.731438636779785, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.867308497428894, "num_tokens": 815082631.0, "step": 21362 }, { "epoch": 2.7175931815290677, "ewc_loss": 0.0741608664393425, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003778391401283443, "grad_norm": 8.752930641174316, "learning_rate": 1e-06, "loss": 0.4025, "mean_token_accuracy": 0.8846899271011353, "num_tokens": 815117568.0, "step": 21363 }, { "epoch": 2.7177203918076582, "ewc_loss": 0.07405252754688263, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037675577914342284, "grad_norm": 8.69906234741211, "learning_rate": 1e-06, "loss": 0.4539, "mean_token_accuracy": 0.8664376735687256, "num_tokens": 815155019.0, "step": 21364 }, { "epoch": 2.7178476020862483, "ewc_loss": 0.07423597574234009, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037859019357711077, "grad_norm": 8.765621185302734, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.865737795829773, "num_tokens": 815195386.0, "step": 21365 }, { "epoch": 2.7179748123648393, "ewc_loss": 0.07404781877994537, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037670868914574385, "grad_norm": 8.713363647460938, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8709793090820312, "num_tokens": 815228304.0, "step": 21366 }, { "epoch": 2.7181020226434294, "ewc_loss": 0.07423152029514313, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003785457229241729, "grad_norm": 8.764531135559082, "learning_rate": 1e-06, "loss": 0.3944, "mean_token_accuracy": 0.8856309652328491, "num_tokens": 815260111.0, "step": 21367 }, { "epoch": 2.7182292329220203, "ewc_loss": 0.07400240004062653, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037625443655997515, "grad_norm": 8.675421714782715, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8652088642120361, "num_tokens": 815297272.0, "step": 21368 }, { "epoch": 2.7183564432006104, "ewc_loss": 0.07430350035429001, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037926548975519836, "grad_norm": 8.819218635559082, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.865938663482666, "num_tokens": 815336016.0, "step": 21369 }, { "epoch": 2.718483653479201, "ewc_loss": 0.0738484114408493, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003747146110981703, "grad_norm": 8.693854331970215, "learning_rate": 1e-06, "loss": 0.5417, "mean_token_accuracy": 0.8398752212524414, "num_tokens": 815373940.0, "step": 21370 }, { "epoch": 2.7186108637577915, "ewc_loss": 0.07445095479488373, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038074006442911923, "grad_norm": 8.8324556350708, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8552302122116089, "num_tokens": 815410836.0, "step": 21371 }, { "epoch": 2.718738074036382, "ewc_loss": 0.0738036036491394, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037426649942062795, "grad_norm": 8.710930824279785, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8725085258483887, "num_tokens": 815449577.0, "step": 21372 }, { "epoch": 2.7188652843149725, "ewc_loss": 0.0743737518787384, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037996802711859345, "grad_norm": 8.805388450622559, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8689326047897339, "num_tokens": 815485869.0, "step": 21373 }, { "epoch": 2.718992494593563, "ewc_loss": 0.07390797138214111, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003753101627808064, "grad_norm": 8.751389503479004, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8475362062454224, "num_tokens": 815523614.0, "step": 21374 }, { "epoch": 2.7191197048721536, "ewc_loss": 0.07413908839225769, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000377621385268867, "grad_norm": 8.830066680908203, "learning_rate": 1e-06, "loss": 0.5213, "mean_token_accuracy": 0.8459019660949707, "num_tokens": 815561835.0, "step": 21375 }, { "epoch": 2.719246915150744, "ewc_loss": 0.07384121417999268, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003746426082216203, "grad_norm": 8.659320831298828, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8499332666397095, "num_tokens": 815605596.0, "step": 21376 }, { "epoch": 2.7193741254293347, "ewc_loss": 0.07430892437696457, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037931971019133925, "grad_norm": 8.816946983337402, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8598166704177856, "num_tokens": 815638720.0, "step": 21377 }, { "epoch": 2.719501335707925, "ewc_loss": 0.07380189001560211, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003742493281606585, "grad_norm": 8.733760833740234, "learning_rate": 1e-06, "loss": 0.4112, "mean_token_accuracy": 0.881812334060669, "num_tokens": 815673996.0, "step": 21378 }, { "epoch": 2.7196285459865157, "ewc_loss": 0.07417711615562439, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003780016559176147, "grad_norm": 8.817806243896484, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.853279709815979, "num_tokens": 815716571.0, "step": 21379 }, { "epoch": 2.7197557562651062, "ewc_loss": 0.0738641619682312, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037487203371711075, "grad_norm": 8.75338363647461, "learning_rate": 1e-06, "loss": 0.4355, "mean_token_accuracy": 0.8726791739463806, "num_tokens": 815748973.0, "step": 21380 }, { "epoch": 2.7198829665436968, "ewc_loss": 0.07397763431072235, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003760068502742797, "grad_norm": 8.708711624145508, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8685963153839111, "num_tokens": 815794754.0, "step": 21381 }, { "epoch": 2.7200101768222873, "ewc_loss": 0.07412559539079666, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037748643080703914, "grad_norm": 8.788301467895508, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8687574863433838, "num_tokens": 815831051.0, "step": 21382 }, { "epoch": 2.720137387100878, "ewc_loss": 0.07383235543966293, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037455401616171, "grad_norm": 8.718291282653809, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8674769997596741, "num_tokens": 815874291.0, "step": 21383 }, { "epoch": 2.7202645973794684, "ewc_loss": 0.07416024804115295, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003778329992201179, "grad_norm": 8.870747566223145, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8629345893859863, "num_tokens": 815912484.0, "step": 21384 }, { "epoch": 2.720391807658059, "ewc_loss": 0.07377272844314575, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000373957707779482, "grad_norm": 8.684427261352539, "learning_rate": 1e-06, "loss": 0.5193, "mean_token_accuracy": 0.8511875867843628, "num_tokens": 815948626.0, "step": 21385 }, { "epoch": 2.7205190179366494, "ewc_loss": 0.07425950467586517, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037882549804635346, "grad_norm": 8.840117454528809, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8766109943389893, "num_tokens": 815983154.0, "step": 21386 }, { "epoch": 2.72064622821524, "ewc_loss": 0.07367652654647827, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037299570976756513, "grad_norm": 8.724775314331055, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8688216209411621, "num_tokens": 816017524.0, "step": 21387 }, { "epoch": 2.7207734384938305, "ewc_loss": 0.07428163290023804, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003790468326769769, "grad_norm": 8.799074172973633, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8619731664657593, "num_tokens": 816055204.0, "step": 21388 }, { "epoch": 2.720900648772421, "ewc_loss": 0.07368043065071106, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003730347380042076, "grad_norm": 8.68536376953125, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.874678373336792, "num_tokens": 816086671.0, "step": 21389 }, { "epoch": 2.721027859051011, "ewc_loss": 0.07418584078550339, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037808888009749353, "grad_norm": 8.875207901000977, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8467037677764893, "num_tokens": 816114173.0, "step": 21390 }, { "epoch": 2.721155069329602, "ewc_loss": 0.07367511093616486, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037298156530596316, "grad_norm": 8.62662124633789, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8789845705032349, "num_tokens": 816156434.0, "step": 21391 }, { "epoch": 2.721282279608192, "ewc_loss": 0.0744217187166214, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003804476873483509, "grad_norm": 8.880779266357422, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8605703115463257, "num_tokens": 816191037.0, "step": 21392 }, { "epoch": 2.721409489886783, "ewc_loss": 0.07365727424621582, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003728032170329243, "grad_norm": 8.6421537399292, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.8805273175239563, "num_tokens": 816227031.0, "step": 21393 }, { "epoch": 2.721536700165373, "ewc_loss": 0.07436251640319824, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037985562812536955, "grad_norm": 8.818900108337402, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8702202439308167, "num_tokens": 816264308.0, "step": 21394 }, { "epoch": 2.7216639104439637, "ewc_loss": 0.07362811267375946, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003725115966517478, "grad_norm": 8.689764976501465, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8639435172080994, "num_tokens": 816300955.0, "step": 21395 }, { "epoch": 2.7217911207225542, "ewc_loss": 0.074119433760643, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003774247597903013, "grad_norm": 8.808860778808594, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8662371635437012, "num_tokens": 816332881.0, "step": 21396 }, { "epoch": 2.7219183310011448, "ewc_loss": 0.0737309679389, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037354015512391925, "grad_norm": 8.733274459838867, "learning_rate": 1e-06, "loss": 0.4024, "mean_token_accuracy": 0.8860411643981934, "num_tokens": 816367446.0, "step": 21397 }, { "epoch": 2.7220455412797353, "ewc_loss": 0.07398930191993713, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037612346932291985, "grad_norm": 8.779833793640137, "learning_rate": 1e-06, "loss": 0.513, "mean_token_accuracy": 0.850006103515625, "num_tokens": 816407286.0, "step": 21398 }, { "epoch": 2.722172751558326, "ewc_loss": 0.07384993880987167, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003747298615053296, "grad_norm": 8.704362869262695, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8767664432525635, "num_tokens": 816445277.0, "step": 21399 }, { "epoch": 2.7222999618369164, "ewc_loss": 0.07405112683773041, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037674172199331224, "grad_norm": 8.759153366088867, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.873762845993042, "num_tokens": 816481638.0, "step": 21400 }, { "epoch": 2.722427172115507, "ewc_loss": 0.07375113666057587, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003737417864613235, "grad_norm": 8.699150085449219, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8684961795806885, "num_tokens": 816521386.0, "step": 21401 }, { "epoch": 2.7225543823940974, "ewc_loss": 0.07404570281505585, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037668750155717134, "grad_norm": 8.826361656188965, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8783315420150757, "num_tokens": 816554760.0, "step": 21402 }, { "epoch": 2.722681592672688, "ewc_loss": 0.07371601462364197, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037339061964303255, "grad_norm": 8.714995384216309, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8744632005691528, "num_tokens": 816595195.0, "step": 21403 }, { "epoch": 2.7228088029512785, "ewc_loss": 0.0738803893327713, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003750343166757375, "grad_norm": 8.707242012023926, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8696194887161255, "num_tokens": 816633046.0, "step": 21404 }, { "epoch": 2.722936013229869, "ewc_loss": 0.07390792667865753, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037530975532718003, "grad_norm": 8.772497177124023, "learning_rate": 1e-06, "loss": 0.4198, "mean_token_accuracy": 0.8815215229988098, "num_tokens": 816675283.0, "step": 21405 }, { "epoch": 2.7230632235084595, "ewc_loss": 0.07373972237110138, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037362767034210265, "grad_norm": 8.714834213256836, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8585962653160095, "num_tokens": 816716499.0, "step": 21406 }, { "epoch": 2.72319043378705, "ewc_loss": 0.0739474892616272, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037570539279840887, "grad_norm": 8.685790061950684, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8609649538993835, "num_tokens": 816761825.0, "step": 21407 }, { "epoch": 2.7233176440656406, "ewc_loss": 0.0740463063120842, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003742521221283823, "grad_norm": 8.754548072814941, "learning_rate": 1e-06, "loss": 0.4335, "mean_token_accuracy": 0.8754016757011414, "num_tokens": 816792080.0, "step": 21408 }, { "epoch": 2.723444854344231, "ewc_loss": 0.07391971349716187, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003754276258405298, "grad_norm": 8.737614631652832, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8562318086624146, "num_tokens": 816832121.0, "step": 21409 }, { "epoch": 2.7235720646228216, "ewc_loss": 0.07391177117824554, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003753481723833829, "grad_norm": 8.701807022094727, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8600287437438965, "num_tokens": 816874032.0, "step": 21410 }, { "epoch": 2.723699274901412, "ewc_loss": 0.07404172420501709, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037664768751710653, "grad_norm": 8.763473510742188, "learning_rate": 1e-06, "loss": 0.5203, "mean_token_accuracy": 0.8484474420547485, "num_tokens": 816920471.0, "step": 21411 }, { "epoch": 2.7238264851800027, "ewc_loss": 0.07373715937137604, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003736020880751312, "grad_norm": 8.6964750289917, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8675006031990051, "num_tokens": 816958042.0, "step": 21412 }, { "epoch": 2.723953695458593, "ewc_loss": 0.0739409327507019, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037563982186838984, "grad_norm": 8.758288383483887, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8582509160041809, "num_tokens": 816992158.0, "step": 21413 }, { "epoch": 2.7240809057371838, "ewc_loss": 0.07397262752056122, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003759567625820637, "grad_norm": 8.753593444824219, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.854878306388855, "num_tokens": 817032552.0, "step": 21414 }, { "epoch": 2.724208116015774, "ewc_loss": 0.07389377057552338, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000375168165192008, "grad_norm": 8.70146369934082, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8653467893600464, "num_tokens": 817072240.0, "step": 21415 }, { "epoch": 2.724335326294365, "ewc_loss": 0.07402781397104263, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003765086003113538, "grad_norm": 8.842162132263184, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.857326090335846, "num_tokens": 817104741.0, "step": 21416 }, { "epoch": 2.724462536572955, "ewc_loss": 0.07358501851558685, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003720806271303445, "grad_norm": 8.683176040649414, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8549269437789917, "num_tokens": 817143876.0, "step": 21417 }, { "epoch": 2.724589746851546, "ewc_loss": 0.07413686066865921, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037759909173473716, "grad_norm": 8.764311790466309, "learning_rate": 1e-06, "loss": 0.384, "mean_token_accuracy": 0.8894737958908081, "num_tokens": 817183647.0, "step": 21418 }, { "epoch": 2.724716957130136, "ewc_loss": 0.07352860271930695, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003715165366884321, "grad_norm": 8.721542358398438, "learning_rate": 1e-06, "loss": 0.4805, "mean_token_accuracy": 0.8616376519203186, "num_tokens": 817214520.0, "step": 21419 }, { "epoch": 2.7248441674087265, "ewc_loss": 0.0740768238902092, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003769987088162452, "grad_norm": 8.777853965759277, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8571276664733887, "num_tokens": 817256936.0, "step": 21420 }, { "epoch": 2.724971377687317, "ewc_loss": 0.07361341267824173, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037236459320411086, "grad_norm": 8.676802635192871, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8680628538131714, "num_tokens": 817292840.0, "step": 21421 }, { "epoch": 2.7250985879659075, "ewc_loss": 0.07409188151359558, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003771493211388588, "grad_norm": 8.80327033996582, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8626712560653687, "num_tokens": 817332872.0, "step": 21422 }, { "epoch": 2.725225798244498, "ewc_loss": 0.07367432862520218, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003729737363755703, "grad_norm": 8.69482421875, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8736388087272644, "num_tokens": 817367668.0, "step": 21423 }, { "epoch": 2.7253530085230886, "ewc_loss": 0.07403506338596344, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003765810979530215, "grad_norm": 8.787120819091797, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8854961395263672, "num_tokens": 817407726.0, "step": 21424 }, { "epoch": 2.725480218801679, "ewc_loss": 0.07369988411664963, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037322932621464133, "grad_norm": 8.708277702331543, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8772638440132141, "num_tokens": 817441003.0, "step": 21425 }, { "epoch": 2.7256074290802697, "ewc_loss": 0.07411584258079529, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003773889038711786, "grad_norm": 8.823539733886719, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.873009979724884, "num_tokens": 817482587.0, "step": 21426 }, { "epoch": 2.72573463935886, "ewc_loss": 0.07367484271526337, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003729789168573916, "grad_norm": 8.700849533081055, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8710241317749023, "num_tokens": 817519727.0, "step": 21427 }, { "epoch": 2.7258618496374507, "ewc_loss": 0.07406022399663925, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037683270056732, "grad_norm": 8.850417137145996, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8634836673736572, "num_tokens": 817553809.0, "step": 21428 }, { "epoch": 2.7259890599160412, "ewc_loss": 0.07362937927246094, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003725242568179965, "grad_norm": 8.886005401611328, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8506515026092529, "num_tokens": 817588893.0, "step": 21429 }, { "epoch": 2.7261162701946318, "ewc_loss": 0.07367964088916779, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037302690907381475, "grad_norm": 9.08310604095459, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8745816946029663, "num_tokens": 817627789.0, "step": 21430 }, { "epoch": 2.7262434804732223, "ewc_loss": 0.07332783937454224, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003695088380482048, "grad_norm": 9.033955574035645, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8559138774871826, "num_tokens": 817670048.0, "step": 21431 }, { "epoch": 2.726370690751813, "ewc_loss": 0.07301963865756989, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036642682971432805, "grad_norm": 8.658377647399902, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8725320100784302, "num_tokens": 817707108.0, "step": 21432 }, { "epoch": 2.7264979010304033, "ewc_loss": 0.07381591200828552, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003743895795196295, "grad_norm": 8.886192321777344, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8640600442886353, "num_tokens": 817748126.0, "step": 21433 }, { "epoch": 2.726625111308994, "ewc_loss": 0.07304343581199646, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003666648408398032, "grad_norm": 8.838640213012695, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8689582943916321, "num_tokens": 817781578.0, "step": 21434 }, { "epoch": 2.7267523215875844, "ewc_loss": 0.07350608706474304, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037129136035218835, "grad_norm": 8.787782669067383, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8727720379829407, "num_tokens": 817822143.0, "step": 21435 }, { "epoch": 2.726879531866175, "ewc_loss": 0.07333160191774368, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003695464984048158, "grad_norm": 8.779862403869629, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8769229650497437, "num_tokens": 817858456.0, "step": 21436 }, { "epoch": 2.7270067421447655, "ewc_loss": 0.07330925762653351, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036932301009073853, "grad_norm": 8.75583267211914, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8744680285453796, "num_tokens": 817892863.0, "step": 21437 }, { "epoch": 2.7271339524233555, "ewc_loss": 0.0735338032245636, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003715684579219669, "grad_norm": 8.842581748962402, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.870254635810852, "num_tokens": 817931120.0, "step": 21438 }, { "epoch": 2.7272611627019465, "ewc_loss": 0.07320934534072876, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00036832396290265024, "grad_norm": 8.609257698059082, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8685226440429688, "num_tokens": 817963076.0, "step": 21439 }, { "epoch": 2.7273883729805366, "ewc_loss": 0.07383345067501068, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037456501740962267, "grad_norm": 8.796158790588379, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8670175075531006, "num_tokens": 818003844.0, "step": 21440 }, { "epoch": 2.7275155832591276, "ewc_loss": 0.073331817984581, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003695486520882696, "grad_norm": 8.654196739196777, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8692100048065186, "num_tokens": 818048533.0, "step": 21441 }, { "epoch": 2.7276427935377177, "ewc_loss": 0.07405180484056473, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037674850318580866, "grad_norm": 8.78743839263916, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.864824116230011, "num_tokens": 818083290.0, "step": 21442 }, { "epoch": 2.7277700038163086, "ewc_loss": 0.07341194152832031, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003703499387484044, "grad_norm": 8.680994987487793, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8766851425170898, "num_tokens": 818120037.0, "step": 21443 }, { "epoch": 2.7278972140948987, "ewc_loss": 0.07397010177373886, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003759314713533968, "grad_norm": 8.685737609863281, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.866389274597168, "num_tokens": 818164122.0, "step": 21444 }, { "epoch": 2.7280244243734892, "ewc_loss": 0.0738997757434845, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003752281772904098, "grad_norm": 8.75362491607666, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8692790865898132, "num_tokens": 818198065.0, "step": 21445 }, { "epoch": 2.7281516346520798, "ewc_loss": 0.07389555871486664, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037518603494390845, "grad_norm": 8.721282958984375, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.868607223033905, "num_tokens": 818236843.0, "step": 21446 }, { "epoch": 2.7282788449306703, "ewc_loss": 0.07401042431592941, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003763347049243748, "grad_norm": 8.759939193725586, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8687519431114197, "num_tokens": 818271300.0, "step": 21447 }, { "epoch": 2.728406055209261, "ewc_loss": 0.07385789602994919, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003748094313777983, "grad_norm": 8.719588279724121, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.8521008491516113, "num_tokens": 818311799.0, "step": 21448 }, { "epoch": 2.7285332654878514, "ewc_loss": 0.07409274578094482, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003771578776650131, "grad_norm": 8.741168022155762, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8731474876403809, "num_tokens": 818346662.0, "step": 21449 }, { "epoch": 2.728660475766442, "ewc_loss": 0.07401973009109497, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037642777897417545, "grad_norm": 8.720856666564941, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8714310526847839, "num_tokens": 818384465.0, "step": 21450 }, { "epoch": 2.7287876860450324, "ewc_loss": 0.07408377528190613, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037706823786720634, "grad_norm": 8.749969482421875, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8641912937164307, "num_tokens": 818425625.0, "step": 21451 }, { "epoch": 2.728914896323623, "ewc_loss": 0.0739668533205986, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037589899147860706, "grad_norm": 8.68789005279541, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8727132081985474, "num_tokens": 818458544.0, "step": 21452 }, { "epoch": 2.7290421066022135, "ewc_loss": 0.07413336634635925, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003775641380343586, "grad_norm": 8.777915000915527, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8626468181610107, "num_tokens": 818503507.0, "step": 21453 }, { "epoch": 2.729169316880804, "ewc_loss": 0.073849618434906, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003747266891878098, "grad_norm": 8.650055885314941, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8731768131256104, "num_tokens": 818545411.0, "step": 21454 }, { "epoch": 2.7292965271593945, "ewc_loss": 0.07431599497795105, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003793904324993491, "grad_norm": 8.791913032531738, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8634013533592224, "num_tokens": 818582100.0, "step": 21455 }, { "epoch": 2.729423737437985, "ewc_loss": 0.07377363741397858, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003739668463822454, "grad_norm": 8.702444076538086, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8646380305290222, "num_tokens": 818615549.0, "step": 21456 }, { "epoch": 2.7295509477165756, "ewc_loss": 0.0742504820227623, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037873530527576804, "grad_norm": 8.799070358276367, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8718528747558594, "num_tokens": 818652476.0, "step": 21457 }, { "epoch": 2.729678157995166, "ewc_loss": 0.07395100593566895, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037574057932943106, "grad_norm": 8.662389755249023, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8710841536521912, "num_tokens": 818692435.0, "step": 21458 }, { "epoch": 2.7298053682737566, "ewc_loss": 0.07426191866397858, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000378849683329463, "grad_norm": 9.650681495666504, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.8660162687301636, "num_tokens": 818733206.0, "step": 21459 }, { "epoch": 2.729932578552347, "ewc_loss": 0.07315458357334137, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003677763161249459, "grad_norm": 8.492551803588867, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8692927956581116, "num_tokens": 818771717.0, "step": 21460 }, { "epoch": 2.7300597888309377, "ewc_loss": 0.0756928026676178, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003931584651581943, "grad_norm": 9.063244819641113, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8660647869110107, "num_tokens": 818806999.0, "step": 21461 }, { "epoch": 2.7301869991095282, "ewc_loss": 0.07307989150285721, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003670293663162738, "grad_norm": 8.539758682250977, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8735739588737488, "num_tokens": 818846226.0, "step": 21462 }, { "epoch": 2.7303142093881183, "ewc_loss": 0.07561290264129639, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003923595359083265, "grad_norm": 9.04259204864502, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8726485967636108, "num_tokens": 818885667.0, "step": 21463 }, { "epoch": 2.7304414196667093, "ewc_loss": 0.07354575395584106, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037168795824982226, "grad_norm": 8.668730735778809, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8745009899139404, "num_tokens": 818924009.0, "step": 21464 }, { "epoch": 2.7305686299452994, "ewc_loss": 0.07496942579746246, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038592470809817314, "grad_norm": 8.919734001159668, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8741040229797363, "num_tokens": 818962614.0, "step": 21465 }, { "epoch": 2.7306958402238903, "ewc_loss": 0.07380922883749008, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003743227571249008, "grad_norm": 8.747889518737793, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8692400455474854, "num_tokens": 819004486.0, "step": 21466 }, { "epoch": 2.7308230505024804, "ewc_loss": 0.07455170154571533, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038174749352037907, "grad_norm": 8.878704071044922, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8608867526054382, "num_tokens": 819048364.0, "step": 21467 }, { "epoch": 2.730950260781071, "ewc_loss": 0.07402768731117249, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003765073779504746, "grad_norm": 8.738565444946289, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8639724254608154, "num_tokens": 819086250.0, "step": 21468 }, { "epoch": 2.7310774710596615, "ewc_loss": 0.07426723837852478, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037890285602770746, "grad_norm": 8.830452919006348, "learning_rate": 1e-06, "loss": 0.4269, "mean_token_accuracy": 0.8795610666275024, "num_tokens": 819126705.0, "step": 21469 }, { "epoch": 2.731204681338252, "ewc_loss": 0.07416115701198578, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037784199230372906, "grad_norm": 8.781232833862305, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8682711720466614, "num_tokens": 819165967.0, "step": 21470 }, { "epoch": 2.7313318916168425, "ewc_loss": 0.07427854835987091, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003790159826166928, "grad_norm": 8.809919357299805, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8644581437110901, "num_tokens": 819208066.0, "step": 21471 }, { "epoch": 2.731459101895433, "ewc_loss": 0.0742088034749031, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037831850931979716, "grad_norm": 8.873337745666504, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8663067817687988, "num_tokens": 819246269.0, "step": 21472 }, { "epoch": 2.7315863121740236, "ewc_loss": 0.07389919459819794, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037522241473197937, "grad_norm": 8.746673583984375, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8696079850196838, "num_tokens": 819286027.0, "step": 21473 }, { "epoch": 2.731713522452614, "ewc_loss": 0.07421067357063293, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003783372521866113, "grad_norm": 8.841904640197754, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8756415247917175, "num_tokens": 819319994.0, "step": 21474 }, { "epoch": 2.7318407327312046, "ewc_loss": 0.07394047826528549, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037563525256700814, "grad_norm": 8.650208473205566, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8629448413848877, "num_tokens": 819360113.0, "step": 21475 }, { "epoch": 2.731967943009795, "ewc_loss": 0.07452894747257233, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003815199015662074, "grad_norm": 8.87363052368164, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8747163414955139, "num_tokens": 819394575.0, "step": 21476 }, { "epoch": 2.7320951532883857, "ewc_loss": 0.07392065227031708, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003754369681701064, "grad_norm": 8.742928504943848, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8613204956054688, "num_tokens": 819434545.0, "step": 21477 }, { "epoch": 2.7322223635669762, "ewc_loss": 0.0744611918926239, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038084242260083556, "grad_norm": 8.84861946105957, "learning_rate": 1e-06, "loss": 0.4288, "mean_token_accuracy": 0.8767021894454956, "num_tokens": 819470699.0, "step": 21478 }, { "epoch": 2.7323495738455668, "ewc_loss": 0.07379300892353058, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003741605905815959, "grad_norm": 8.676019668579102, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.861280083656311, "num_tokens": 819514447.0, "step": 21479 }, { "epoch": 2.7324767841241573, "ewc_loss": 0.07453012466430664, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038153171772137284, "grad_norm": 8.815787315368652, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8615646362304688, "num_tokens": 819553434.0, "step": 21480 }, { "epoch": 2.732603994402748, "ewc_loss": 0.07394376397132874, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003756681107915938, "grad_norm": 8.707354545593262, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8601098656654358, "num_tokens": 819591594.0, "step": 21481 }, { "epoch": 2.7327312046813383, "ewc_loss": 0.07432650029659271, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037949546822346747, "grad_norm": 8.778928756713867, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8642752170562744, "num_tokens": 819634530.0, "step": 21482 }, { "epoch": 2.732858414959929, "ewc_loss": 0.07413861900568008, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037761664134450257, "grad_norm": 8.769143104553223, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.857589602470398, "num_tokens": 819669358.0, "step": 21483 }, { "epoch": 2.7329856252385194, "ewc_loss": 0.07433606684207916, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037959113251417875, "grad_norm": 8.745515823364258, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8657931089401245, "num_tokens": 819711336.0, "step": 21484 }, { "epoch": 2.73311283551711, "ewc_loss": 0.0743502825498581, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037973333382979035, "grad_norm": 8.800930976867676, "learning_rate": 1e-06, "loss": 0.4865, "mean_token_accuracy": 0.8569461107254028, "num_tokens": 819746973.0, "step": 21485 }, { "epoch": 2.7332400457957005, "ewc_loss": 0.07425001263618469, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003787305613514036, "grad_norm": 8.807028770446777, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8770874738693237, "num_tokens": 819780305.0, "step": 21486 }, { "epoch": 2.733367256074291, "ewc_loss": 0.07435154914855957, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037974599399603903, "grad_norm": 8.823709487915039, "learning_rate": 1e-06, "loss": 0.4307, "mean_token_accuracy": 0.8754391074180603, "num_tokens": 819817326.0, "step": 21487 }, { "epoch": 2.733494466352881, "ewc_loss": 0.07420939952135086, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037832444650121033, "grad_norm": 8.782549858093262, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.860862672328949, "num_tokens": 819857597.0, "step": 21488 }, { "epoch": 2.733621676631472, "ewc_loss": 0.0743342861533165, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037957332096993923, "grad_norm": 8.81667709350586, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8686294555664062, "num_tokens": 819893471.0, "step": 21489 }, { "epoch": 2.733748886910062, "ewc_loss": 0.07408073544502258, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003770377952605486, "grad_norm": 8.70151424407959, "learning_rate": 1e-06, "loss": 0.522, "mean_token_accuracy": 0.8496277928352356, "num_tokens": 819934175.0, "step": 21490 }, { "epoch": 2.733876097188653, "ewc_loss": 0.07457989454269409, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038202942232601345, "grad_norm": 8.880924224853516, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8756074905395508, "num_tokens": 819971451.0, "step": 21491 }, { "epoch": 2.734003307467243, "ewc_loss": 0.07401859760284424, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037641648668795824, "grad_norm": 8.691357612609863, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8627065420150757, "num_tokens": 820011566.0, "step": 21492 }, { "epoch": 2.7341305177458337, "ewc_loss": 0.07459321618080139, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038216260145418346, "grad_norm": 8.826481819152832, "learning_rate": 1e-06, "loss": 0.5416, "mean_token_accuracy": 0.8423657417297363, "num_tokens": 820047944.0, "step": 21493 }, { "epoch": 2.7342577280244242, "ewc_loss": 0.07408896088600159, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003771201299969107, "grad_norm": 8.830443382263184, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8544187545776367, "num_tokens": 820083416.0, "step": 21494 }, { "epoch": 2.7343849383030148, "ewc_loss": 0.07440581917762756, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003802886640187353, "grad_norm": 8.796406745910645, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.858979344367981, "num_tokens": 820122476.0, "step": 21495 }, { "epoch": 2.7345121485816053, "ewc_loss": 0.07416927814483643, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003779232210945338, "grad_norm": 8.742131233215332, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8715234398841858, "num_tokens": 820159866.0, "step": 21496 }, { "epoch": 2.734639358860196, "ewc_loss": 0.07425379753112793, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037876839633099735, "grad_norm": 8.770423889160156, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8674303293228149, "num_tokens": 820202710.0, "step": 21497 }, { "epoch": 2.7347665691387864, "ewc_loss": 0.07421191781759262, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037834965041838586, "grad_norm": 8.715303421020508, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8651162385940552, "num_tokens": 820239156.0, "step": 21498 }, { "epoch": 2.734893779417377, "ewc_loss": 0.07435202598571777, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000379750708816573, "grad_norm": 8.801772117614746, "learning_rate": 1e-06, "loss": 0.3946, "mean_token_accuracy": 0.8846980333328247, "num_tokens": 820274010.0, "step": 21499 }, { "epoch": 2.7350209896959674, "ewc_loss": 0.07405710220336914, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037680144305340946, "grad_norm": 8.759236335754395, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8659325838088989, "num_tokens": 820319171.0, "step": 21500 }, { "epoch": 2.735148199974558, "ewc_loss": 0.07438011467456818, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003800316189881414, "grad_norm": 8.840836524963379, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.8521600365638733, "num_tokens": 820355038.0, "step": 21501 }, { "epoch": 2.7352754102531485, "ewc_loss": 0.07413789629936218, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037760945269837976, "grad_norm": 8.669442176818848, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8628697395324707, "num_tokens": 820390973.0, "step": 21502 }, { "epoch": 2.735402620531739, "ewc_loss": 0.07479086518287659, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038169772597029805, "grad_norm": 9.17708683013916, "learning_rate": 1e-06, "loss": 0.4811, "mean_token_accuracy": 0.8602030277252197, "num_tokens": 820424543.0, "step": 21503 }, { "epoch": 2.7355298308103295, "ewc_loss": 0.07366886734962463, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000372919108485803, "grad_norm": 8.613518714904785, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8665337562561035, "num_tokens": 820462702.0, "step": 21504 }, { "epoch": 2.73565704108892, "ewc_loss": 0.0752168744802475, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000388399203075096, "grad_norm": 8.924626350402832, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8770337104797363, "num_tokens": 820498840.0, "step": 21505 }, { "epoch": 2.7357842513675106, "ewc_loss": 0.07360542565584183, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003722847322933376, "grad_norm": 8.600533485412598, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8605489730834961, "num_tokens": 820534083.0, "step": 21506 }, { "epoch": 2.735911461646101, "ewc_loss": 0.07511377334594727, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003873682289849967, "grad_norm": 8.917693138122559, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8695055842399597, "num_tokens": 820572479.0, "step": 21507 }, { "epoch": 2.7360386719246916, "ewc_loss": 0.07382119446992874, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037444240297190845, "grad_norm": 8.62069034576416, "learning_rate": 1e-06, "loss": 0.4964, "mean_token_accuracy": 0.8541699051856995, "num_tokens": 820610283.0, "step": 21508 }, { "epoch": 2.736165882203282, "ewc_loss": 0.07502437382936478, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003864741884171963, "grad_norm": 8.933236122131348, "learning_rate": 1e-06, "loss": 0.4137, "mean_token_accuracy": 0.8835625648498535, "num_tokens": 820647580.0, "step": 21509 }, { "epoch": 2.7362930924818727, "ewc_loss": 0.07391293346881866, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003753597557079047, "grad_norm": 8.59560489654541, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8725875020027161, "num_tokens": 820686774.0, "step": 21510 }, { "epoch": 2.7364203027604628, "ewc_loss": 0.07505759596824646, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003868064668495208, "grad_norm": 8.927927017211914, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8761404156684875, "num_tokens": 820724635.0, "step": 21511 }, { "epoch": 2.7365475130390537, "ewc_loss": 0.07379350066184998, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003741654218174517, "grad_norm": 8.652782440185547, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8744087815284729, "num_tokens": 820761467.0, "step": 21512 }, { "epoch": 2.736674723317644, "ewc_loss": 0.07501006126403809, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003863310848828405, "grad_norm": 8.98957633972168, "learning_rate": 1e-06, "loss": 0.5034, "mean_token_accuracy": 0.8534624576568604, "num_tokens": 820801001.0, "step": 21513 }, { "epoch": 2.736801933596235, "ewc_loss": 0.07383187860250473, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003745492431335151, "grad_norm": 8.688863754272461, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8616881370544434, "num_tokens": 820841028.0, "step": 21514 }, { "epoch": 2.736929143874825, "ewc_loss": 0.07491485029459, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038537898217327893, "grad_norm": 8.881641387939453, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8753585815429688, "num_tokens": 820881613.0, "step": 21515 }, { "epoch": 2.737056354153416, "ewc_loss": 0.07405120134353638, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003767424786929041, "grad_norm": 8.698323249816895, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8753321766853333, "num_tokens": 820918205.0, "step": 21516 }, { "epoch": 2.737183564432006, "ewc_loss": 0.0747065395116806, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003832958173006773, "grad_norm": 8.902013778686523, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8603575825691223, "num_tokens": 820950751.0, "step": 21517 }, { "epoch": 2.7373107747105965, "ewc_loss": 0.07411427795886993, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037737327511422336, "grad_norm": 8.73149299621582, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8683630228042603, "num_tokens": 820987385.0, "step": 21518 }, { "epoch": 2.737437984989187, "ewc_loss": 0.07461970299482346, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038242750451900065, "grad_norm": 8.86495304107666, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8592528104782104, "num_tokens": 821026273.0, "step": 21519 }, { "epoch": 2.7375651952677775, "ewc_loss": 0.07410978525876999, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003773283096961677, "grad_norm": 8.681914329528809, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8576839566230774, "num_tokens": 821067978.0, "step": 21520 }, { "epoch": 2.737692405546368, "ewc_loss": 0.07468359917402267, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003830664500128478, "grad_norm": 8.820213317871094, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8713713884353638, "num_tokens": 821111925.0, "step": 21521 }, { "epoch": 2.7378196158249586, "ewc_loss": 0.0742136538028717, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037836702540516853, "grad_norm": 8.761873245239258, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8693124651908875, "num_tokens": 821151019.0, "step": 21522 }, { "epoch": 2.737946826103549, "ewc_loss": 0.07449020445346832, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003811325295828283, "grad_norm": 8.805000305175781, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.872947096824646, "num_tokens": 821185520.0, "step": 21523 }, { "epoch": 2.7380740363821396, "ewc_loss": 0.07433658838272095, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003795964003074914, "grad_norm": 8.766336441040039, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8677974939346313, "num_tokens": 821224273.0, "step": 21524 }, { "epoch": 2.73820124666073, "ewc_loss": 0.07444809377193451, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038071139715611935, "grad_norm": 8.778314590454102, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.873388409614563, "num_tokens": 821263541.0, "step": 21525 }, { "epoch": 2.7383284569393207, "ewc_loss": 0.07454134523868561, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003816438838839531, "grad_norm": 8.849451065063477, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8555167317390442, "num_tokens": 821304053.0, "step": 21526 }, { "epoch": 2.7384556672179112, "ewc_loss": 0.07430226355791092, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037925312062725425, "grad_norm": 8.762284278869629, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8671725988388062, "num_tokens": 821339710.0, "step": 21527 }, { "epoch": 2.7385828774965018, "ewc_loss": 0.07459668815135956, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003821973514277488, "grad_norm": 8.824299812316895, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.866142213344574, "num_tokens": 821378377.0, "step": 21528 }, { "epoch": 2.7387100877750923, "ewc_loss": 0.07419134676456451, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037814394454471767, "grad_norm": 8.71710205078125, "learning_rate": 1e-06, "loss": 0.435, "mean_token_accuracy": 0.8769707679748535, "num_tokens": 821417434.0, "step": 21529 }, { "epoch": 2.738837298053683, "ewc_loss": 0.07459479570388794, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000382178375730291, "grad_norm": 8.807097434997559, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8631008863449097, "num_tokens": 821464323.0, "step": 21530 }, { "epoch": 2.7389645083322733, "ewc_loss": 0.07429550588130951, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003791855415329337, "grad_norm": 8.714960098266602, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.863899827003479, "num_tokens": 821509891.0, "step": 21531 }, { "epoch": 2.739091718610864, "ewc_loss": 0.07468471676111221, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003830776549875736, "grad_norm": 8.816851615905762, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8565289974212646, "num_tokens": 821557309.0, "step": 21532 }, { "epoch": 2.7392189288894544, "ewc_loss": 0.07428253442049026, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037905582576058805, "grad_norm": 8.833751678466797, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8628426790237427, "num_tokens": 821597751.0, "step": 21533 }, { "epoch": 2.739346139168045, "ewc_loss": 0.07445599883794785, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003807904722634703, "grad_norm": 8.793601036071777, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8792489767074585, "num_tokens": 821637961.0, "step": 21534 }, { "epoch": 2.7394733494466355, "ewc_loss": 0.07455284893512726, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000381758960429579, "grad_norm": 8.870028495788574, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8503764271736145, "num_tokens": 821684728.0, "step": 21535 }, { "epoch": 2.7396005597252255, "ewc_loss": 0.07417459785938263, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003779764811042696, "grad_norm": 8.778860092163086, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.872157096862793, "num_tokens": 821715359.0, "step": 21536 }, { "epoch": 2.7397277700038165, "ewc_loss": 0.07458353787660599, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003820658312179148, "grad_norm": 8.82132625579834, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8619852066040039, "num_tokens": 821755540.0, "step": 21537 }, { "epoch": 2.7398549802824066, "ewc_loss": 0.07430823147296906, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003793127543758601, "grad_norm": 8.809146881103516, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8672822713851929, "num_tokens": 821788506.0, "step": 21538 }, { "epoch": 2.7399821905609976, "ewc_loss": 0.07438743114471436, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038010472781024873, "grad_norm": 8.803346633911133, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8649522066116333, "num_tokens": 821826555.0, "step": 21539 }, { "epoch": 2.7401094008395877, "ewc_loss": 0.07446415722370148, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038087202119641006, "grad_norm": 8.857739448547363, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8558946847915649, "num_tokens": 821866160.0, "step": 21540 }, { "epoch": 2.7402366111181786, "ewc_loss": 0.07418166100978851, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003780471161007881, "grad_norm": 8.813224792480469, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8725890517234802, "num_tokens": 821905595.0, "step": 21541 }, { "epoch": 2.7403638213967687, "ewc_loss": 0.0743366926908493, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037959744804538786, "grad_norm": 8.814896583557129, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.8568434715270996, "num_tokens": 821944769.0, "step": 21542 }, { "epoch": 2.7404910316753592, "ewc_loss": 0.07430589199066162, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037928944220766425, "grad_norm": 8.820480346679688, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8739148378372192, "num_tokens": 821988889.0, "step": 21543 }, { "epoch": 2.7406182419539498, "ewc_loss": 0.07440018653869629, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038023231900297105, "grad_norm": 8.841041564941406, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8694683313369751, "num_tokens": 822025343.0, "step": 21544 }, { "epoch": 2.7407454522325403, "ewc_loss": 0.07431275397539139, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003793580108322203, "grad_norm": 8.799788475036621, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.854947566986084, "num_tokens": 822067651.0, "step": 21545 }, { "epoch": 2.740872662511131, "ewc_loss": 0.07437252998352051, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037995577440597117, "grad_norm": 8.847180366516113, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8576664924621582, "num_tokens": 822102885.0, "step": 21546 }, { "epoch": 2.7409998727897213, "ewc_loss": 0.07429565489292145, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037918699672445655, "grad_norm": 8.823695182800293, "learning_rate": 1e-06, "loss": 0.4941, "mean_token_accuracy": 0.8579634428024292, "num_tokens": 822142515.0, "step": 21547 }, { "epoch": 2.741127083068312, "ewc_loss": 0.07452985644340515, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038152901106514037, "grad_norm": 8.938289642333984, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8630841374397278, "num_tokens": 822180745.0, "step": 21548 }, { "epoch": 2.7412542933469024, "ewc_loss": 0.07407689839601517, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003769994655158371, "grad_norm": 8.772688865661621, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8760193586349487, "num_tokens": 822215117.0, "step": 21549 }, { "epoch": 2.741381503625493, "ewc_loss": 0.07454471290111542, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003816776443272829, "grad_norm": 8.79284954071045, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8578152656555176, "num_tokens": 822258883.0, "step": 21550 }, { "epoch": 2.7415087139040835, "ewc_loss": 0.07415413856506348, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037777185207232833, "grad_norm": 8.751551628112793, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8633502125740051, "num_tokens": 822295254.0, "step": 21551 }, { "epoch": 2.741635924182674, "ewc_loss": 0.07462473213672638, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038247782504186034, "grad_norm": 8.883282661437988, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8694455623626709, "num_tokens": 822335769.0, "step": 21552 }, { "epoch": 2.7417631344612645, "ewc_loss": 0.07417141646146774, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037794464151374996, "grad_norm": 8.713481903076172, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8758283853530884, "num_tokens": 822376873.0, "step": 21553 }, { "epoch": 2.741890344739855, "ewc_loss": 0.07473554462194443, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038358592428267, "grad_norm": 8.903421401977539, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8618658185005188, "num_tokens": 822419868.0, "step": 21554 }, { "epoch": 2.7420175550184456, "ewc_loss": 0.07404446601867676, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003766750742215663, "grad_norm": 8.734749794006348, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8658827543258667, "num_tokens": 822456757.0, "step": 21555 }, { "epoch": 2.742144765297036, "ewc_loss": 0.07481448352336884, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038437533657997847, "grad_norm": 8.87455940246582, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8804178237915039, "num_tokens": 822492772.0, "step": 21556 }, { "epoch": 2.7422719755756266, "ewc_loss": 0.07416744530200958, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037790497299283743, "grad_norm": 8.842535972595215, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.849104106426239, "num_tokens": 822535400.0, "step": 21557 }, { "epoch": 2.742399185854217, "ewc_loss": 0.07451900839805603, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003814205701928586, "grad_norm": 8.866230964660645, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.8724492788314819, "num_tokens": 822568313.0, "step": 21558 }, { "epoch": 2.7425263961328077, "ewc_loss": 0.07423798739910126, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037861039163544774, "grad_norm": 8.893510818481445, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8718662261962891, "num_tokens": 822608487.0, "step": 21559 }, { "epoch": 2.742653606411398, "ewc_loss": 0.0741138905286789, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037736937520094216, "grad_norm": 8.7478609085083, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8666992783546448, "num_tokens": 822647938.0, "step": 21560 }, { "epoch": 2.7427808166899883, "ewc_loss": 0.07453782111406326, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003816086973529309, "grad_norm": 8.862074851989746, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8631695508956909, "num_tokens": 822685974.0, "step": 21561 }, { "epoch": 2.7429080269685793, "ewc_loss": 0.07409663498401642, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037719678948633373, "grad_norm": 8.77985668182373, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.870123028755188, "num_tokens": 822719166.0, "step": 21562 }, { "epoch": 2.7430352372471694, "ewc_loss": 0.07445456087589264, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038077603676356375, "grad_norm": 8.861869812011719, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8558031320571899, "num_tokens": 822758364.0, "step": 21563 }, { "epoch": 2.7431624475257603, "ewc_loss": 0.07410506159067154, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003772811032831669, "grad_norm": 8.714661598205566, "learning_rate": 1e-06, "loss": 0.437, "mean_token_accuracy": 0.8747488260269165, "num_tokens": 822798002.0, "step": 21564 }, { "epoch": 2.7432896578043504, "ewc_loss": 0.07452119886875153, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038144245627336204, "grad_norm": 8.884405136108398, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8663054704666138, "num_tokens": 822837958.0, "step": 21565 }, { "epoch": 2.743416868082941, "ewc_loss": 0.07392999529838562, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037553044967353344, "grad_norm": 8.792099952697754, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8662674427032471, "num_tokens": 822873076.0, "step": 21566 }, { "epoch": 2.7435440783615315, "ewc_loss": 0.07450654357671738, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038129588938318193, "grad_norm": 8.892755508422852, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8656986951828003, "num_tokens": 822904693.0, "step": 21567 }, { "epoch": 2.743671288640122, "ewc_loss": 0.07394629716873169, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003756934602279216, "grad_norm": 8.798789978027344, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.8614277839660645, "num_tokens": 822941192.0, "step": 21568 }, { "epoch": 2.7437984989187125, "ewc_loss": 0.07436828315258026, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037991328281350434, "grad_norm": 8.822275161743164, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8536050319671631, "num_tokens": 822982166.0, "step": 21569 }, { "epoch": 2.743925709197303, "ewc_loss": 0.07421614229679108, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003783919382840395, "grad_norm": 8.81950569152832, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8598520755767822, "num_tokens": 823021630.0, "step": 21570 }, { "epoch": 2.7440529194758936, "ewc_loss": 0.07426422834396362, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037887273356318474, "grad_norm": 8.801689147949219, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.866100013256073, "num_tokens": 823062133.0, "step": 21571 }, { "epoch": 2.744180129754484, "ewc_loss": 0.07495900243520737, "ewc_loss_diag": 3.719329833984375e-05, "ewc_loss_parallel": 0.0003784962755162269, "grad_norm": 9.046523094177246, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8619877099990845, "num_tokens": 823105163.0, "step": 21572 }, { "epoch": 2.7443073400330746, "ewc_loss": 0.07377076148986816, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003739380626939237, "grad_norm": 8.734929084777832, "learning_rate": 1e-06, "loss": 0.5316, "mean_token_accuracy": 0.8468673229217529, "num_tokens": 823146878.0, "step": 21573 }, { "epoch": 2.744434550311665, "ewc_loss": 0.07466554641723633, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003828859480563551, "grad_norm": 8.885884284973145, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8832509517669678, "num_tokens": 823182216.0, "step": 21574 }, { "epoch": 2.7445617605902557, "ewc_loss": 0.07383795827627182, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037461004103533924, "grad_norm": 8.686685562133789, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8735347390174866, "num_tokens": 823227838.0, "step": 21575 }, { "epoch": 2.7446889708688462, "ewc_loss": 0.07513788342475891, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003827264590654522, "grad_norm": 9.294547080993652, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8579826354980469, "num_tokens": 823268609.0, "step": 21576 }, { "epoch": 2.7448161811474368, "ewc_loss": 0.07343046367168427, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037053506821393967, "grad_norm": 8.623006820678711, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8762290477752686, "num_tokens": 823309426.0, "step": 21577 }, { "epoch": 2.7449433914260273, "ewc_loss": 0.07519566267728806, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038818709435872734, "grad_norm": 9.034971237182617, "learning_rate": 1e-06, "loss": 0.4817, "mean_token_accuracy": 0.8600532412528992, "num_tokens": 823346049.0, "step": 21578 }, { "epoch": 2.745070601704618, "ewc_loss": 0.0734630674123764, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003708611475303769, "grad_norm": 8.688941955566406, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.8598607778549194, "num_tokens": 823378915.0, "step": 21579 }, { "epoch": 2.7451978119832083, "ewc_loss": 0.07506047189235687, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038683522143401206, "grad_norm": 8.994544982910156, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8572438955307007, "num_tokens": 823422985.0, "step": 21580 }, { "epoch": 2.745325022261799, "ewc_loss": 0.07383260875940323, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037455654819495976, "grad_norm": 8.682064056396484, "learning_rate": 1e-06, "loss": 0.5102, "mean_token_accuracy": 0.8527902960777283, "num_tokens": 823460703.0, "step": 21581 }, { "epoch": 2.7454522325403894, "ewc_loss": 0.07489994168281555, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003852298541460186, "grad_norm": 9.33725643157959, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8679332733154297, "num_tokens": 823499651.0, "step": 21582 }, { "epoch": 2.74557944281898, "ewc_loss": 0.07354587316513062, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037168918061070144, "grad_norm": 8.674372673034668, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8661210536956787, "num_tokens": 823537044.0, "step": 21583 }, { "epoch": 2.7457066530975704, "ewc_loss": 0.07524581253528595, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038868861156515777, "grad_norm": 9.04642105102539, "learning_rate": 1e-06, "loss": 0.4155, "mean_token_accuracy": 0.8797929286956787, "num_tokens": 823571832.0, "step": 21584 }, { "epoch": 2.745833863376161, "ewc_loss": 0.07370048761367798, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003732353507075459, "grad_norm": 8.647543907165527, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8722584247589111, "num_tokens": 823609852.0, "step": 21585 }, { "epoch": 2.745961073654751, "ewc_loss": 0.075227290391922, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038850336568430066, "grad_norm": 9.071205139160156, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8689870834350586, "num_tokens": 823644419.0, "step": 21586 }, { "epoch": 2.746088283933342, "ewc_loss": 0.07380074262619019, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003742379485629499, "grad_norm": 8.731804847717285, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8710032105445862, "num_tokens": 823682451.0, "step": 21587 }, { "epoch": 2.746215494211932, "ewc_loss": 0.07493923604488373, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038562287227250636, "grad_norm": 8.995418548583984, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.877516508102417, "num_tokens": 823723051.0, "step": 21588 }, { "epoch": 2.746342704490523, "ewc_loss": 0.07399208098649979, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003761512925848365, "grad_norm": 8.745305061340332, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8624638319015503, "num_tokens": 823761353.0, "step": 21589 }, { "epoch": 2.746469914769113, "ewc_loss": 0.0748690813779831, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003849212371278554, "grad_norm": 8.955018043518066, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8514928221702576, "num_tokens": 823800281.0, "step": 21590 }, { "epoch": 2.7465971250477037, "ewc_loss": 0.07415211200714111, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037775159580633044, "grad_norm": 8.814355850219727, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8800327777862549, "num_tokens": 823843255.0, "step": 21591 }, { "epoch": 2.7467243353262942, "ewc_loss": 0.0745558887720108, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038178934482857585, "grad_norm": 8.888875961303711, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8630058765411377, "num_tokens": 823880736.0, "step": 21592 }, { "epoch": 2.7468515456048848, "ewc_loss": 0.0741214007139206, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003774444339796901, "grad_norm": 8.802956581115723, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8646535873413086, "num_tokens": 823923232.0, "step": 21593 }, { "epoch": 2.7469787558834753, "ewc_loss": 0.07440653443336487, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003802958526648581, "grad_norm": 8.951695442199707, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8705090284347534, "num_tokens": 823957488.0, "step": 21594 }, { "epoch": 2.747105966162066, "ewc_loss": 0.07400916516780853, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003763221320696175, "grad_norm": 8.72557258605957, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.862385630607605, "num_tokens": 823999568.0, "step": 21595 }, { "epoch": 2.7472331764406563, "ewc_loss": 0.07458969950675964, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003821274440269917, "grad_norm": 8.954707145690918, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8773486614227295, "num_tokens": 824036947.0, "step": 21596 }, { "epoch": 2.747360386719247, "ewc_loss": 0.07394137978553772, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003756442747544497, "grad_norm": 8.798829078674316, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8780461549758911, "num_tokens": 824067966.0, "step": 21597 }, { "epoch": 2.7474875969978374, "ewc_loss": 0.07446178793907166, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003808483888860792, "grad_norm": 8.789250373840332, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8637310266494751, "num_tokens": 824112216.0, "step": 21598 }, { "epoch": 2.747614807276428, "ewc_loss": 0.07424744963645935, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037870494998060167, "grad_norm": 8.895281791687012, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8762984871864319, "num_tokens": 824148749.0, "step": 21599 }, { "epoch": 2.7477420175550185, "ewc_loss": 0.07400529086589813, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003762833948712796, "grad_norm": 8.765543937683105, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8647235035896301, "num_tokens": 824186685.0, "step": 21600 }, { "epoch": 2.747869227833609, "ewc_loss": 0.07444356381893158, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003806661406997591, "grad_norm": 8.870793342590332, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8600324392318726, "num_tokens": 824223716.0, "step": 21601 }, { "epoch": 2.7479964381121995, "ewc_loss": 0.07398320734500885, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003760625550057739, "grad_norm": 8.854500770568848, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8653350472450256, "num_tokens": 824264627.0, "step": 21602 }, { "epoch": 2.74812364839079, "ewc_loss": 0.0742051899433136, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003782823623623699, "grad_norm": 8.796745300292969, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8666004538536072, "num_tokens": 824301901.0, "step": 21603 }, { "epoch": 2.7482508586693806, "ewc_loss": 0.0741354376077652, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037758483085781336, "grad_norm": 8.741800308227539, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8707998991012573, "num_tokens": 824344621.0, "step": 21604 }, { "epoch": 2.748378068947971, "ewc_loss": 0.07429829239845276, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037921336479485035, "grad_norm": 8.89493465423584, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.872749388217926, "num_tokens": 824385850.0, "step": 21605 }, { "epoch": 2.7485052792265616, "ewc_loss": 0.07386873662471771, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037491778493858874, "grad_norm": 8.71996021270752, "learning_rate": 1e-06, "loss": 0.3752, "mean_token_accuracy": 0.8927148580551147, "num_tokens": 824425545.0, "step": 21606 }, { "epoch": 2.748632489505152, "ewc_loss": 0.0745636597275734, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003818670811597258, "grad_norm": 8.834959030151367, "learning_rate": 1e-06, "loss": 0.4129, "mean_token_accuracy": 0.8828797936439514, "num_tokens": 824468611.0, "step": 21607 }, { "epoch": 2.7487596997837427, "ewc_loss": 0.07382126152515411, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037444307236000896, "grad_norm": 8.756274223327637, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8711454272270203, "num_tokens": 824512161.0, "step": 21608 }, { "epoch": 2.7488869100623328, "ewc_loss": 0.0743573009967804, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003798034449573606, "grad_norm": 8.802983283996582, "learning_rate": 1e-06, "loss": 0.4238, "mean_token_accuracy": 0.8797045946121216, "num_tokens": 824555502.0, "step": 21609 }, { "epoch": 2.7490141203409237, "ewc_loss": 0.07422808557748795, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000378511322196573, "grad_norm": 8.829219818115234, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8609640598297119, "num_tokens": 824596277.0, "step": 21610 }, { "epoch": 2.749141330619514, "ewc_loss": 0.07420146465301514, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003782451094593853, "grad_norm": 8.894278526306152, "learning_rate": 1e-06, "loss": 0.5018, "mean_token_accuracy": 0.8537598252296448, "num_tokens": 824633704.0, "step": 21611 }, { "epoch": 2.749268540898105, "ewc_loss": 0.07412952929735184, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003775257500819862, "grad_norm": 8.789393424987793, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8701473474502563, "num_tokens": 824673584.0, "step": 21612 }, { "epoch": 2.749395751176695, "ewc_loss": 0.07429172098636627, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037914764834567904, "grad_norm": 8.836329460144043, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8587099313735962, "num_tokens": 824712508.0, "step": 21613 }, { "epoch": 2.749522961455286, "ewc_loss": 0.07409022748470306, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000377132702851668, "grad_norm": 8.749412536621094, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8775394558906555, "num_tokens": 824754353.0, "step": 21614 }, { "epoch": 2.749650171733876, "ewc_loss": 0.0744093805551529, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003803242288995534, "grad_norm": 8.787773132324219, "learning_rate": 1e-06, "loss": 0.5107, "mean_token_accuracy": 0.8496357202529907, "num_tokens": 824796333.0, "step": 21615 }, { "epoch": 2.7497773820124665, "ewc_loss": 0.07423137128353119, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037854420952498913, "grad_norm": 8.84253978729248, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.860213041305542, "num_tokens": 824832048.0, "step": 21616 }, { "epoch": 2.749904592291057, "ewc_loss": 0.07425321638584137, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037876260466873646, "grad_norm": 8.842732429504395, "learning_rate": 1e-06, "loss": 0.4872, "mean_token_accuracy": 0.8602993488311768, "num_tokens": 824871889.0, "step": 21617 }, { "epoch": 2.7500318025696475, "ewc_loss": 0.07435006648302078, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037973112193867564, "grad_norm": 8.876091957092285, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8767179250717163, "num_tokens": 824912547.0, "step": 21618 }, { "epoch": 2.750159012848238, "ewc_loss": 0.07420170307159424, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003782475250773132, "grad_norm": 8.809324264526367, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8476352691650391, "num_tokens": 824955626.0, "step": 21619 }, { "epoch": 2.7502862231268286, "ewc_loss": 0.07443653047084808, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003805957967415452, "grad_norm": 8.938096046447754, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8749738335609436, "num_tokens": 824991813.0, "step": 21620 }, { "epoch": 2.750413433405419, "ewc_loss": 0.07393147051334381, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000375545205315575, "grad_norm": 8.817997932434082, "learning_rate": 1e-06, "loss": 0.4257, "mean_token_accuracy": 0.8745968341827393, "num_tokens": 825024356.0, "step": 21621 }, { "epoch": 2.7505406436840096, "ewc_loss": 0.07451477646827698, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003813782532233745, "grad_norm": 8.913405418395996, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8634185791015625, "num_tokens": 825060316.0, "step": 21622 }, { "epoch": 2.7506678539626, "ewc_loss": 0.07402297109365463, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003764601715374738, "grad_norm": 8.83081340789795, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8597413301467896, "num_tokens": 825096044.0, "step": 21623 }, { "epoch": 2.7507950642411907, "ewc_loss": 0.0743284821510315, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037951525882817805, "grad_norm": 8.94495677947998, "learning_rate": 1e-06, "loss": 0.4618, "mean_token_accuracy": 0.8642602562904358, "num_tokens": 825123999.0, "step": 21624 }, { "epoch": 2.750922274519781, "ewc_loss": 0.0740717351436615, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037694783532060683, "grad_norm": 8.793956756591797, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.8559795618057251, "num_tokens": 825170155.0, "step": 21625 }, { "epoch": 2.7510494847983717, "ewc_loss": 0.07460218667984009, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003822523867711425, "grad_norm": 8.995266914367676, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8643289804458618, "num_tokens": 825213388.0, "step": 21626 }, { "epoch": 2.7511766950769623, "ewc_loss": 0.07386896014213562, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003749200841411948, "grad_norm": 8.749654769897461, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8724616169929504, "num_tokens": 825247171.0, "step": 21627 }, { "epoch": 2.751303905355553, "ewc_loss": 0.07468268275260925, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038305731141008437, "grad_norm": 9.011786460876465, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8667829632759094, "num_tokens": 825283271.0, "step": 21628 }, { "epoch": 2.7514311156341433, "ewc_loss": 0.07388676702976227, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003750981704797596, "grad_norm": 8.77097225189209, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8658062219619751, "num_tokens": 825318976.0, "step": 21629 }, { "epoch": 2.751558325912734, "ewc_loss": 0.07460571825504303, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038228760240599513, "grad_norm": 8.92839527130127, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8659257888793945, "num_tokens": 825358019.0, "step": 21630 }, { "epoch": 2.7516855361913244, "ewc_loss": 0.07405830174684525, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037681349203921854, "grad_norm": 8.823511123657227, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8840428590774536, "num_tokens": 825394025.0, "step": 21631 }, { "epoch": 2.751812746469915, "ewc_loss": 0.07451234757900238, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038135398062877357, "grad_norm": 8.849833488464355, "learning_rate": 1e-06, "loss": 0.4377, "mean_token_accuracy": 0.8781813979148865, "num_tokens": 825433021.0, "step": 21632 }, { "epoch": 2.7519399567485054, "ewc_loss": 0.07427969574928284, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037902744952589273, "grad_norm": 8.8334379196167, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8660677671432495, "num_tokens": 825477026.0, "step": 21633 }, { "epoch": 2.7520671670270955, "ewc_loss": 0.07426398247480392, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037887028884142637, "grad_norm": 8.832799911499023, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8719480037689209, "num_tokens": 825511054.0, "step": 21634 }, { "epoch": 2.7521943773056865, "ewc_loss": 0.07439633458852768, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003801938146352768, "grad_norm": 8.879914283752441, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.8535256385803223, "num_tokens": 825549654.0, "step": 21635 }, { "epoch": 2.7523215875842766, "ewc_loss": 0.07412172853946686, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003774477227125317, "grad_norm": 8.818650245666504, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8692417144775391, "num_tokens": 825587269.0, "step": 21636 }, { "epoch": 2.7524487978628676, "ewc_loss": 0.07436764240264893, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037990687997080386, "grad_norm": 8.827528953552246, "learning_rate": 1e-06, "loss": 0.3863, "mean_token_accuracy": 0.8898733258247375, "num_tokens": 825624864.0, "step": 21637 }, { "epoch": 2.7525760081414576, "ewc_loss": 0.07428381592035294, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000379068631445989, "grad_norm": 8.763147354125977, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8579469919204712, "num_tokens": 825667025.0, "step": 21638 }, { "epoch": 2.7527032184200486, "ewc_loss": 0.07444024085998535, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003806328750215471, "grad_norm": 8.844910621643066, "learning_rate": 1e-06, "loss": 0.5104, "mean_token_accuracy": 0.8513451814651489, "num_tokens": 825709505.0, "step": 21639 }, { "epoch": 2.7528304286986387, "ewc_loss": 0.07421940565109253, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037842453457415104, "grad_norm": 8.758896827697754, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8703077435493469, "num_tokens": 825747376.0, "step": 21640 }, { "epoch": 2.7529576389772292, "ewc_loss": 0.07470327615737915, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038326327921822667, "grad_norm": 8.878754615783691, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8739356994628906, "num_tokens": 825784817.0, "step": 21641 }, { "epoch": 2.7530848492558198, "ewc_loss": 0.07421627640724182, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037839324795641005, "grad_norm": 8.764764785766602, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8631323575973511, "num_tokens": 825815631.0, "step": 21642 }, { "epoch": 2.7532120595344103, "ewc_loss": 0.07472068071365356, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000383437261916697, "grad_norm": 8.884271621704102, "learning_rate": 1e-06, "loss": 0.4686, "mean_token_accuracy": 0.8663026094436646, "num_tokens": 825854202.0, "step": 21643 }, { "epoch": 2.753339269813001, "ewc_loss": 0.07432413101196289, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037947174860164523, "grad_norm": 8.793458938598633, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8732118606567383, "num_tokens": 825890636.0, "step": 21644 }, { "epoch": 2.7534664800915913, "ewc_loss": 0.07455725967884064, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003818030236288905, "grad_norm": 8.880242347717285, "learning_rate": 1e-06, "loss": 0.5249, "mean_token_accuracy": 0.8426425457000732, "num_tokens": 825931054.0, "step": 21645 }, { "epoch": 2.753593690370182, "ewc_loss": 0.07497310638427734, "ewc_loss_diag": 3.719329833984375e-05, "ewc_loss_parallel": 0.00037863728357478976, "grad_norm": 36.32948684692383, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8625117540359497, "num_tokens": 825972519.0, "step": 21646 }, { "epoch": 2.7537209006487724, "ewc_loss": 0.11255283653736115, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0007617588271386921, "grad_norm": 13.095868110656738, "learning_rate": 1e-06, "loss": 0.488, "mean_token_accuracy": 0.8693549633026123, "num_tokens": 826010010.0, "step": 21647 }, { "epoch": 2.753848110927363, "ewc_loss": 0.06937398761510849, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00032997035305015743, "grad_norm": 7.121953010559082, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8666625618934631, "num_tokens": 826048348.0, "step": 21648 }, { "epoch": 2.7539753212059535, "ewc_loss": 0.0978853851556778, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0006150843109935522, "grad_norm": 12.369417190551758, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.86236971616745, "num_tokens": 826087807.0, "step": 21649 }, { "epoch": 2.754102531484544, "ewc_loss": 0.09656061977148056, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0006018366548232734, "grad_norm": 11.149514198303223, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8625917434692383, "num_tokens": 826124657.0, "step": 21650 }, { "epoch": 2.7542297417631345, "ewc_loss": 0.08247368037700653, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00046096727601252496, "grad_norm": 9.35394287109375, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8680605888366699, "num_tokens": 826164190.0, "step": 21651 }, { "epoch": 2.754356952041725, "ewc_loss": 0.08474549651145935, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00048368540592491627, "grad_norm": 10.511641502380371, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8682315349578857, "num_tokens": 826202876.0, "step": 21652 }, { "epoch": 2.7544841623203156, "ewc_loss": 0.08403365314006805, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004765670164488256, "grad_norm": 9.480497360229492, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8644269704818726, "num_tokens": 826242955.0, "step": 21653 }, { "epoch": 2.754611372598906, "ewc_loss": 0.08097793906927109, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00044600985711440444, "grad_norm": 9.805121421813965, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8758602142333984, "num_tokens": 826279925.0, "step": 21654 }, { "epoch": 2.7547385828774966, "ewc_loss": 0.08067484200000763, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004429789260029793, "grad_norm": 9.573220252990723, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.875481128692627, "num_tokens": 826314076.0, "step": 21655 }, { "epoch": 2.754865793156087, "ewc_loss": 0.07899616658687592, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00042619218584150076, "grad_norm": 9.270089149475098, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8632200360298157, "num_tokens": 826354138.0, "step": 21656 }, { "epoch": 2.7549930034346777, "ewc_loss": 0.07874606549739838, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004236911772750318, "grad_norm": 9.494133949279785, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8610824346542358, "num_tokens": 826395393.0, "step": 21657 }, { "epoch": 2.755120213713268, "ewc_loss": 0.07744903862476349, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00041072085150517523, "grad_norm": 9.149349212646484, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8556258082389832, "num_tokens": 826437539.0, "step": 21658 }, { "epoch": 2.7552474239918583, "ewc_loss": 0.07756710797548294, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000411901535699144, "grad_norm": 9.351945877075195, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8593343496322632, "num_tokens": 826475857.0, "step": 21659 }, { "epoch": 2.7553746342704493, "ewc_loss": 0.0763305127620697, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003995356091763824, "grad_norm": 9.031540870666504, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8755432367324829, "num_tokens": 826513956.0, "step": 21660 }, { "epoch": 2.7555018445490393, "ewc_loss": 0.07660973072052002, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000402327801566571, "grad_norm": 9.131068229675293, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.864944577217102, "num_tokens": 826552747.0, "step": 21661 }, { "epoch": 2.7556290548276303, "ewc_loss": 0.07583919167518616, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039462241693399847, "grad_norm": 8.966058731079102, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.8728786110877991, "num_tokens": 826590830.0, "step": 21662 }, { "epoch": 2.7557562651062204, "ewc_loss": 0.07595585286617279, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039578895666636527, "grad_norm": 9.057311058044434, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8697121143341064, "num_tokens": 826633414.0, "step": 21663 }, { "epoch": 2.755883475384811, "ewc_loss": 0.07535720616579056, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003898025315720588, "grad_norm": 9.011880874633789, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8759590983390808, "num_tokens": 826668344.0, "step": 21664 }, { "epoch": 2.7560106856634015, "ewc_loss": 0.07552716135978699, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003915021079592407, "grad_norm": 8.973447799682617, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8507055044174194, "num_tokens": 826710587.0, "step": 21665 }, { "epoch": 2.756137895941992, "ewc_loss": 0.0751437097787857, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003876676200889051, "grad_norm": 8.977014541625977, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8566904067993164, "num_tokens": 826748854.0, "step": 21666 }, { "epoch": 2.7562651062205825, "ewc_loss": 0.07520546019077301, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003882850578520447, "grad_norm": 9.00019645690918, "learning_rate": 1e-06, "loss": 0.5152, "mean_token_accuracy": 0.8532341718673706, "num_tokens": 826789591.0, "step": 21667 }, { "epoch": 2.756392316499173, "ewc_loss": 0.07488813996315002, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000385111867217347, "grad_norm": 8.904556274414062, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8623591661453247, "num_tokens": 826831890.0, "step": 21668 }, { "epoch": 2.7565195267777636, "ewc_loss": 0.07493525743484497, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003855830291286111, "grad_norm": 8.991632461547852, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.8479300737380981, "num_tokens": 826870265.0, "step": 21669 }, { "epoch": 2.756646737056354, "ewc_loss": 0.0746389776468277, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003826202009804547, "grad_norm": 8.864351272583008, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8654966354370117, "num_tokens": 826910993.0, "step": 21670 }, { "epoch": 2.7567739473349446, "ewc_loss": 0.07490665465593338, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038529702578671277, "grad_norm": 8.96063232421875, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8637864589691162, "num_tokens": 826947492.0, "step": 21671 }, { "epoch": 2.756901157613535, "ewc_loss": 0.07452967762947083, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038152720662765205, "grad_norm": 8.909189224243164, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8684111833572388, "num_tokens": 826987215.0, "step": 21672 }, { "epoch": 2.7570283678921257, "ewc_loss": 0.07473710179328918, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003836014657281339, "grad_norm": 8.961066246032715, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8628656268119812, "num_tokens": 827024092.0, "step": 21673 }, { "epoch": 2.757155578170716, "ewc_loss": 0.07443167269229889, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003805472224485129, "grad_norm": 8.84083366394043, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8495593070983887, "num_tokens": 827060541.0, "step": 21674 }, { "epoch": 2.7572827884493067, "ewc_loss": 0.07464700937271118, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003827005566563457, "grad_norm": 8.904450416564941, "learning_rate": 1e-06, "loss": 0.4673, "mean_token_accuracy": 0.8643969893455505, "num_tokens": 827101205.0, "step": 21675 }, { "epoch": 2.7574099987278973, "ewc_loss": 0.0745294839143753, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003815253439825028, "grad_norm": 8.978771209716797, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8598629236221313, "num_tokens": 827132441.0, "step": 21676 }, { "epoch": 2.757537209006488, "ewc_loss": 0.07431486248970032, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000379379082005471, "grad_norm": 8.829276084899902, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8662161827087402, "num_tokens": 827169530.0, "step": 21677 }, { "epoch": 2.7576644192850783, "ewc_loss": 0.07476826757192612, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003839131386484951, "grad_norm": 8.954619407653809, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8705170154571533, "num_tokens": 827207352.0, "step": 21678 }, { "epoch": 2.757791629563669, "ewc_loss": 0.07423926144838333, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003786230809055269, "grad_norm": 8.876773834228516, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8592488765716553, "num_tokens": 827242790.0, "step": 21679 }, { "epoch": 2.7579188398422594, "ewc_loss": 0.07461830973625183, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003824135346803814, "grad_norm": 8.883079528808594, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8679513931274414, "num_tokens": 827281676.0, "step": 21680 }, { "epoch": 2.75804605012085, "ewc_loss": 0.07436853647232056, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037991584395058453, "grad_norm": 8.861814498901367, "learning_rate": 1e-06, "loss": 0.4174, "mean_token_accuracy": 0.878681480884552, "num_tokens": 827313676.0, "step": 21681 }, { "epoch": 2.7581732603994404, "ewc_loss": 0.07448919117450714, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003811224305536598, "grad_norm": 8.86256217956543, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8645455241203308, "num_tokens": 827355017.0, "step": 21682 }, { "epoch": 2.758300470678031, "ewc_loss": 0.0743894875049591, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000380125391529873, "grad_norm": 8.796402931213379, "learning_rate": 1e-06, "loss": 0.4126, "mean_token_accuracy": 0.8828096389770508, "num_tokens": 827391773.0, "step": 21683 }, { "epoch": 2.758427680956621, "ewc_loss": 0.07460885494947433, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003823190345428884, "grad_norm": 8.870495796203613, "learning_rate": 1e-06, "loss": 0.4221, "mean_token_accuracy": 0.8816030025482178, "num_tokens": 827433034.0, "step": 21684 }, { "epoch": 2.758554891235212, "ewc_loss": 0.07432644069194794, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003794949152506888, "grad_norm": 8.821619033813477, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8774782419204712, "num_tokens": 827475729.0, "step": 21685 }, { "epoch": 2.758682101513802, "ewc_loss": 0.07460916042327881, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038232203223742545, "grad_norm": 8.853002548217773, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8706610798835754, "num_tokens": 827514127.0, "step": 21686 }, { "epoch": 2.758809311792393, "ewc_loss": 0.07441079616546631, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038033840246498585, "grad_norm": 8.814254760742188, "learning_rate": 1e-06, "loss": 0.4306, "mean_token_accuracy": 0.8773436546325684, "num_tokens": 827546709.0, "step": 21687 }, { "epoch": 2.758936522070983, "ewc_loss": 0.07463100552558899, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003825405437964946, "grad_norm": 8.871292114257812, "learning_rate": 1e-06, "loss": 0.3959, "mean_token_accuracy": 0.8851255178451538, "num_tokens": 827581450.0, "step": 21688 }, { "epoch": 2.7590637323495737, "ewc_loss": 0.07436718046665192, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003799022815655917, "grad_norm": 8.779458999633789, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8628710508346558, "num_tokens": 827623161.0, "step": 21689 }, { "epoch": 2.7591909426281642, "ewc_loss": 0.07469877600669861, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038321822648867965, "grad_norm": 8.897461891174316, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8626453876495361, "num_tokens": 827659460.0, "step": 21690 }, { "epoch": 2.7593181529067548, "ewc_loss": 0.07443149387836456, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003805453598033637, "grad_norm": 8.77890396118164, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8610827326774597, "num_tokens": 827701110.0, "step": 21691 }, { "epoch": 2.7594453631853453, "ewc_loss": 0.07469025999307632, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038313306868076324, "grad_norm": 8.88630199432373, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8632056713104248, "num_tokens": 827738992.0, "step": 21692 }, { "epoch": 2.759572573463936, "ewc_loss": 0.0743754431605339, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037998490734025836, "grad_norm": 8.803784370422363, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.8513889312744141, "num_tokens": 827775853.0, "step": 21693 }, { "epoch": 2.7596997837425263, "ewc_loss": 0.0747266411781311, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003834968665614724, "grad_norm": 8.855384826660156, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8678321242332458, "num_tokens": 827819438.0, "step": 21694 }, { "epoch": 2.759826994021117, "ewc_loss": 0.07439444959163666, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003801749844569713, "grad_norm": 8.808178901672363, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.859717607498169, "num_tokens": 827858833.0, "step": 21695 }, { "epoch": 2.7599542042997074, "ewc_loss": 0.07462436705827713, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003824741579592228, "grad_norm": 8.865299224853516, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8584715127944946, "num_tokens": 827895069.0, "step": 21696 }, { "epoch": 2.760081414578298, "ewc_loss": 0.0745781734585762, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003820122219622135, "grad_norm": 8.834009170532227, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8810144066810608, "num_tokens": 827932970.0, "step": 21697 }, { "epoch": 2.7602086248568884, "ewc_loss": 0.07461196184158325, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003823500592261553, "grad_norm": 8.817584037780762, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.877379298210144, "num_tokens": 827981859.0, "step": 21698 }, { "epoch": 2.760335835135479, "ewc_loss": 0.07470181584358215, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000383248581783846, "grad_norm": 8.9109525680542, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8598295450210571, "num_tokens": 828022018.0, "step": 21699 }, { "epoch": 2.7604630454140695, "ewc_loss": 0.0744432657957077, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003806631139013916, "grad_norm": 8.802318572998047, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8602553009986877, "num_tokens": 828061546.0, "step": 21700 }, { "epoch": 2.76059025569266, "ewc_loss": 0.07485701143741608, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038480054354295135, "grad_norm": 8.902040481567383, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8518741130828857, "num_tokens": 828104464.0, "step": 21701 }, { "epoch": 2.7607174659712506, "ewc_loss": 0.07442963123321533, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038052673335187137, "grad_norm": 8.860886573791504, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8550121188163757, "num_tokens": 828142527.0, "step": 21702 }, { "epoch": 2.760844676249841, "ewc_loss": 0.07483771443367004, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003846075851470232, "grad_norm": 8.897624015808105, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8669843673706055, "num_tokens": 828189670.0, "step": 21703 }, { "epoch": 2.7609718865284316, "ewc_loss": 0.07463260740041733, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003825565509032458, "grad_norm": 8.819707870483398, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8732494115829468, "num_tokens": 828231511.0, "step": 21704 }, { "epoch": 2.761099096807022, "ewc_loss": 0.07496562600135803, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038588675670325756, "grad_norm": 8.913217544555664, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8665331602096558, "num_tokens": 828274387.0, "step": 21705 }, { "epoch": 2.7612263070856127, "ewc_loss": 0.07451364398002625, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003813669318333268, "grad_norm": 8.80628776550293, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8586996793746948, "num_tokens": 828312263.0, "step": 21706 }, { "epoch": 2.7613535173642028, "ewc_loss": 0.07505739480257034, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038680442958138883, "grad_norm": 8.944226264953613, "learning_rate": 1e-06, "loss": 0.4502, "mean_token_accuracy": 0.8677845597267151, "num_tokens": 828345550.0, "step": 21707 }, { "epoch": 2.7614807276427937, "ewc_loss": 0.07447706162929535, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038100112578831613, "grad_norm": 8.839942932128906, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8540561199188232, "num_tokens": 828378633.0, "step": 21708 }, { "epoch": 2.761607937921384, "ewc_loss": 0.0749729722738266, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003859602438751608, "grad_norm": 8.921022415161133, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8585735559463501, "num_tokens": 828414292.0, "step": 21709 }, { "epoch": 2.761735148199975, "ewc_loss": 0.0744963139295578, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003811936476267874, "grad_norm": 8.779242515563965, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8654670119285583, "num_tokens": 828452682.0, "step": 21710 }, { "epoch": 2.761862358478565, "ewc_loss": 0.07505422830581665, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038677279371768236, "grad_norm": 8.90170955657959, "learning_rate": 1e-06, "loss": 0.4163, "mean_token_accuracy": 0.8829729557037354, "num_tokens": 828494162.0, "step": 21711 }, { "epoch": 2.761989568757156, "ewc_loss": 0.07456355541944504, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038186603342182934, "grad_norm": 8.775721549987793, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8704499006271362, "num_tokens": 828525997.0, "step": 21712 }, { "epoch": 2.762116779035746, "ewc_loss": 0.07508422434329987, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038707273779436946, "grad_norm": 8.938573837280273, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8640576601028442, "num_tokens": 828562488.0, "step": 21713 }, { "epoch": 2.7622439893143365, "ewc_loss": 0.07447274029254913, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003809578192885965, "grad_norm": 8.789365768432617, "learning_rate": 1e-06, "loss": 0.4076, "mean_token_accuracy": 0.8833853602409363, "num_tokens": 828591818.0, "step": 21714 }, { "epoch": 2.762371199592927, "ewc_loss": 0.07530602812767029, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038929079892113805, "grad_norm": 8.982565879821777, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8543799519538879, "num_tokens": 828628872.0, "step": 21715 }, { "epoch": 2.7624984098715175, "ewc_loss": 0.07439859211444855, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003802163410000503, "grad_norm": 8.75811767578125, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8747992515563965, "num_tokens": 828670939.0, "step": 21716 }, { "epoch": 2.762625620150108, "ewc_loss": 0.07514159381389618, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038764646160416305, "grad_norm": 8.9788179397583, "learning_rate": 1e-06, "loss": 0.5197, "mean_token_accuracy": 0.8475691676139832, "num_tokens": 828708446.0, "step": 21717 }, { "epoch": 2.7627528304286986, "ewc_loss": 0.07440489530563354, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003802794381044805, "grad_norm": 8.733013153076172, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8710314631462097, "num_tokens": 828749873.0, "step": 21718 }, { "epoch": 2.762880040707289, "ewc_loss": 0.07522580027580261, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038848849362693727, "grad_norm": 8.956009864807129, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8587630987167358, "num_tokens": 828790066.0, "step": 21719 }, { "epoch": 2.7630072509858796, "ewc_loss": 0.0744146853685379, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038037728518247604, "grad_norm": 8.820335388183594, "learning_rate": 1e-06, "loss": 0.4867, "mean_token_accuracy": 0.8634225726127625, "num_tokens": 828821911.0, "step": 21720 }, { "epoch": 2.76313446126447, "ewc_loss": 0.07516500353813171, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038788054371252656, "grad_norm": 8.934769630432129, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8592125773429871, "num_tokens": 828856572.0, "step": 21721 }, { "epoch": 2.7632616715430607, "ewc_loss": 0.07439376413822174, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003801680577453226, "grad_norm": 8.78123950958252, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8781229257583618, "num_tokens": 828898865.0, "step": 21722 }, { "epoch": 2.763388881821651, "ewc_loss": 0.07524073123931885, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003886377380695194, "grad_norm": 8.964669227600098, "learning_rate": 1e-06, "loss": 0.4336, "mean_token_accuracy": 0.8734233379364014, "num_tokens": 828933768.0, "step": 21723 }, { "epoch": 2.7635160921002417, "ewc_loss": 0.07458890229463577, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038211949868127704, "grad_norm": 8.850042343139648, "learning_rate": 1e-06, "loss": 0.4183, "mean_token_accuracy": 0.8814817667007446, "num_tokens": 828968869.0, "step": 21724 }, { "epoch": 2.7636433023788323, "ewc_loss": 0.07489269971847534, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038515744381584227, "grad_norm": 8.963241577148438, "learning_rate": 1e-06, "loss": 0.5266, "mean_token_accuracy": 0.8499654531478882, "num_tokens": 829005621.0, "step": 21725 }, { "epoch": 2.763770512657423, "ewc_loss": 0.07452987134456635, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038152915658429265, "grad_norm": 8.833386421203613, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8542772531509399, "num_tokens": 829045046.0, "step": 21726 }, { "epoch": 2.7638977229360133, "ewc_loss": 0.07485520839691162, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038478252827189863, "grad_norm": 8.876716613769531, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.8796003460884094, "num_tokens": 829084588.0, "step": 21727 }, { "epoch": 2.764024933214604, "ewc_loss": 0.07446081936359406, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038083860999904573, "grad_norm": 8.907663345336914, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.849259614944458, "num_tokens": 829121010.0, "step": 21728 }, { "epoch": 2.7641521434931944, "ewc_loss": 0.07452917098999023, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038152217166498303, "grad_norm": 8.843278884887695, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8745725154876709, "num_tokens": 829159618.0, "step": 21729 }, { "epoch": 2.764279353771785, "ewc_loss": 0.07454569637775421, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003816874523181468, "grad_norm": 8.823740005493164, "learning_rate": 1e-06, "loss": 0.5108, "mean_token_accuracy": 0.8529267311096191, "num_tokens": 829204826.0, "step": 21730 }, { "epoch": 2.7644065640503754, "ewc_loss": 0.07453710585832596, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038160153781063855, "grad_norm": 8.878926277160645, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.875472903251648, "num_tokens": 829240741.0, "step": 21731 }, { "epoch": 2.7645337743289655, "ewc_loss": 0.07471058517694473, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038333632983267307, "grad_norm": 8.915010452270508, "learning_rate": 1e-06, "loss": 0.4118, "mean_token_accuracy": 0.8855148553848267, "num_tokens": 829278065.0, "step": 21732 }, { "epoch": 2.7646609846075565, "ewc_loss": 0.07444555312395096, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003806859895121306, "grad_norm": 8.837750434875488, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8629371523857117, "num_tokens": 829319438.0, "step": 21733 }, { "epoch": 2.7647881948861466, "ewc_loss": 0.07462283968925476, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038245884934440255, "grad_norm": 8.846701622009277, "learning_rate": 1e-06, "loss": 0.4332, "mean_token_accuracy": 0.8764971494674683, "num_tokens": 829357299.0, "step": 21734 }, { "epoch": 2.7649154051647375, "ewc_loss": 0.07461271435022354, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003823575971182436, "grad_norm": 8.856802940368652, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8677943348884583, "num_tokens": 829398600.0, "step": 21735 }, { "epoch": 2.7650426154433276, "ewc_loss": 0.07458372414112091, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038206775207072496, "grad_norm": 8.882271766662598, "learning_rate": 1e-06, "loss": 0.4976, "mean_token_accuracy": 0.8545399904251099, "num_tokens": 829439214.0, "step": 21736 }, { "epoch": 2.7651698257219186, "ewc_loss": 0.07449403405189514, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003811708011198789, "grad_norm": 8.83723258972168, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8549996018409729, "num_tokens": 829477634.0, "step": 21737 }, { "epoch": 2.7652970360005087, "ewc_loss": 0.07464133203029633, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003826438041869551, "grad_norm": 8.839261054992676, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8727151155471802, "num_tokens": 829514527.0, "step": 21738 }, { "epoch": 2.765424246279099, "ewc_loss": 0.07473383843898773, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003835688694380224, "grad_norm": 8.857752799987793, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8671349287033081, "num_tokens": 829558531.0, "step": 21739 }, { "epoch": 2.7655514565576897, "ewc_loss": 0.07456761598587036, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038190666236914694, "grad_norm": 8.820210456848145, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8698608875274658, "num_tokens": 829600769.0, "step": 21740 }, { "epoch": 2.7656786668362803, "ewc_loss": 0.07478959858417511, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000384126411518082, "grad_norm": 8.873055458068848, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8707493543624878, "num_tokens": 829635938.0, "step": 21741 }, { "epoch": 2.765805877114871, "ewc_loss": 0.07456205785274506, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038185101584531367, "grad_norm": 8.9099702835083, "learning_rate": 1e-06, "loss": 0.4679, "mean_token_accuracy": 0.8657673597335815, "num_tokens": 829672956.0, "step": 21742 }, { "epoch": 2.7659330873934613, "ewc_loss": 0.07465338706970215, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038276429404504597, "grad_norm": 8.852919578552246, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8654682636260986, "num_tokens": 829712658.0, "step": 21743 }, { "epoch": 2.766060297672052, "ewc_loss": 0.07472715526819229, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038350201793946326, "grad_norm": 8.888957977294922, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8681238889694214, "num_tokens": 829755429.0, "step": 21744 }, { "epoch": 2.7661875079506424, "ewc_loss": 0.07445604354143143, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038079090882092714, "grad_norm": 8.792150497436523, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8641700744628906, "num_tokens": 829796373.0, "step": 21745 }, { "epoch": 2.766314718229233, "ewc_loss": 0.07497523725032806, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003859828575514257, "grad_norm": 12.631047248840332, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.861653208732605, "num_tokens": 829834049.0, "step": 21746 }, { "epoch": 2.7664419285078234, "ewc_loss": 0.07759544253349304, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004121849196963012, "grad_norm": 9.02411937713623, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8662859201431274, "num_tokens": 829870853.0, "step": 21747 }, { "epoch": 2.766569138786414, "ewc_loss": 0.0780697762966156, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004169282619841397, "grad_norm": 9.490519523620605, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8596493005752563, "num_tokens": 829907519.0, "step": 21748 }, { "epoch": 2.7666963490650045, "ewc_loss": 0.07453729212284088, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003816033713519573, "grad_norm": 8.839688301086426, "learning_rate": 1e-06, "loss": 0.5032, "mean_token_accuracy": 0.8528544902801514, "num_tokens": 829947162.0, "step": 21749 }, { "epoch": 2.766823559343595, "ewc_loss": 0.0781797468662262, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00041802795021794736, "grad_norm": 9.410943984985352, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8640570640563965, "num_tokens": 829991981.0, "step": 21750 }, { "epoch": 2.7669507696221856, "ewc_loss": 0.07493145763874054, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038554499042220414, "grad_norm": 8.889911651611328, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.872546374797821, "num_tokens": 830032409.0, "step": 21751 }, { "epoch": 2.767077979900776, "ewc_loss": 0.0769524872303009, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00040575533057563007, "grad_norm": 9.24207878112793, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8687273859977722, "num_tokens": 830072738.0, "step": 21752 }, { "epoch": 2.7672051901793666, "ewc_loss": 0.07505136728286743, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003867441264446825, "grad_norm": 8.915831565856934, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8623616695404053, "num_tokens": 830110880.0, "step": 21753 }, { "epoch": 2.767332400457957, "ewc_loss": 0.07628928124904633, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039912323700264096, "grad_norm": 9.16143798828125, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8602604269981384, "num_tokens": 830150148.0, "step": 21754 }, { "epoch": 2.7674596107365477, "ewc_loss": 0.07496907562017441, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038592121563851833, "grad_norm": 8.96486759185791, "learning_rate": 1e-06, "loss": 0.5137, "mean_token_accuracy": 0.8536354303359985, "num_tokens": 830190717.0, "step": 21755 }, { "epoch": 2.767586821015138, "ewc_loss": 0.07564368098974228, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000392667279811576, "grad_norm": 9.094059944152832, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.86789870262146, "num_tokens": 830226972.0, "step": 21756 }, { "epoch": 2.7677140312937283, "ewc_loss": 0.07490617036819458, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003852921654470265, "grad_norm": 8.946928024291992, "learning_rate": 1e-06, "loss": 0.4315, "mean_token_accuracy": 0.8753946423530579, "num_tokens": 830262612.0, "step": 21757 }, { "epoch": 2.7678412415723193, "ewc_loss": 0.07529853284358978, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038921579835005105, "grad_norm": 8.991985321044922, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8716329336166382, "num_tokens": 830306856.0, "step": 21758 }, { "epoch": 2.7679684518509093, "ewc_loss": 0.07490426301956177, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003852730442304164, "grad_norm": 8.983621597290039, "learning_rate": 1e-06, "loss": 0.4851, "mean_token_accuracy": 0.8645859956741333, "num_tokens": 830340726.0, "step": 21759 }, { "epoch": 2.7680956621295003, "ewc_loss": 0.07500233501195908, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003862538142129779, "grad_norm": 8.97671890258789, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8654793500900269, "num_tokens": 830380697.0, "step": 21760 }, { "epoch": 2.7682228724080904, "ewc_loss": 0.0747186690568924, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038341720937751234, "grad_norm": 8.884293556213379, "learning_rate": 1e-06, "loss": 0.5182, "mean_token_accuracy": 0.8435943126678467, "num_tokens": 830414737.0, "step": 21761 }, { "epoch": 2.768350082686681, "ewc_loss": 0.07507635653018951, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038699404103681445, "grad_norm": 8.982370376586914, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8669894337654114, "num_tokens": 830456518.0, "step": 21762 }, { "epoch": 2.7684772929652715, "ewc_loss": 0.07473717629909515, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038360219332389534, "grad_norm": 8.90710735321045, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8628398776054382, "num_tokens": 830494192.0, "step": 21763 }, { "epoch": 2.768604503243862, "ewc_loss": 0.07492206990718842, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003854511887766421, "grad_norm": 8.921050071716309, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8589347004890442, "num_tokens": 830533659.0, "step": 21764 }, { "epoch": 2.7687317135224525, "ewc_loss": 0.0747983455657959, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038421389763243496, "grad_norm": 8.95676326751709, "learning_rate": 1e-06, "loss": 0.4683, "mean_token_accuracy": 0.8601152300834656, "num_tokens": 830569605.0, "step": 21765 }, { "epoch": 2.768858923801043, "ewc_loss": 0.07475539296865463, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003837843833025545, "grad_norm": 8.84294319152832, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8445796966552734, "num_tokens": 830610814.0, "step": 21766 }, { "epoch": 2.7689861340796336, "ewc_loss": 0.07516078650951385, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003878383431583643, "grad_norm": 9.053735733032227, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8653765320777893, "num_tokens": 830646811.0, "step": 21767 }, { "epoch": 2.769113344358224, "ewc_loss": 0.074497751891613, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003812079958152026, "grad_norm": 8.846475601196289, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8697003126144409, "num_tokens": 830680984.0, "step": 21768 }, { "epoch": 2.7692405546368146, "ewc_loss": 0.0751962810754776, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003881933225784451, "grad_norm": 9.025344848632812, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8653419017791748, "num_tokens": 830713384.0, "step": 21769 }, { "epoch": 2.769367764915405, "ewc_loss": 0.07449209690093994, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038115138886496425, "grad_norm": 8.801671981811523, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8538775444030762, "num_tokens": 830746922.0, "step": 21770 }, { "epoch": 2.7694949751939957, "ewc_loss": 0.075235515832901, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038858564221300185, "grad_norm": 9.03365707397461, "learning_rate": 1e-06, "loss": 0.4399, "mean_token_accuracy": 0.8728646636009216, "num_tokens": 830789459.0, "step": 21771 }, { "epoch": 2.769622185472586, "ewc_loss": 0.07442502677440643, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038048074929974973, "grad_norm": 8.825078964233398, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8634556531906128, "num_tokens": 830826736.0, "step": 21772 }, { "epoch": 2.7697493957511767, "ewc_loss": 0.07519431412220001, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003881736483890563, "grad_norm": 9.011954307556152, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8685964345932007, "num_tokens": 830868118.0, "step": 21773 }, { "epoch": 2.7698766060297673, "ewc_loss": 0.07439696788787842, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003802001301664859, "grad_norm": 8.741948127746582, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8625943660736084, "num_tokens": 830911658.0, "step": 21774 }, { "epoch": 2.770003816308358, "ewc_loss": 0.07537184655666351, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003899489820469171, "grad_norm": 9.03530502319336, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8663222789764404, "num_tokens": 830948456.0, "step": 21775 }, { "epoch": 2.7701310265869483, "ewc_loss": 0.07430773228406906, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037930780672468245, "grad_norm": 8.752214431762695, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8747536540031433, "num_tokens": 830988790.0, "step": 21776 }, { "epoch": 2.770258236865539, "ewc_loss": 0.07557131350040436, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003919435548596084, "grad_norm": 9.10893726348877, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8585637807846069, "num_tokens": 831024551.0, "step": 21777 }, { "epoch": 2.7703854471441294, "ewc_loss": 0.07420440018177032, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003782745043281466, "grad_norm": 8.798124313354492, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8632689714431763, "num_tokens": 831060082.0, "step": 21778 }, { "epoch": 2.77051265742272, "ewc_loss": 0.07554548233747482, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039168528746813536, "grad_norm": 9.034171104431152, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8666492700576782, "num_tokens": 831100067.0, "step": 21779 }, { "epoch": 2.77063986770131, "ewc_loss": 0.07441496104001999, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003803800791501999, "grad_norm": 8.82664966583252, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8762251138687134, "num_tokens": 831131923.0, "step": 21780 }, { "epoch": 2.770767077979901, "ewc_loss": 0.07533206045627594, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003895510744769126, "grad_norm": 9.041994094848633, "learning_rate": 1e-06, "loss": 0.5135, "mean_token_accuracy": 0.8501938581466675, "num_tokens": 831167985.0, "step": 21781 }, { "epoch": 2.770894288258491, "ewc_loss": 0.07431583106517792, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000379388831788674, "grad_norm": 8.787753105163574, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8599233627319336, "num_tokens": 831211737.0, "step": 21782 }, { "epoch": 2.771021498537082, "ewc_loss": 0.07542552798986435, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003904857439920306, "grad_norm": 9.055465698242188, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.853895902633667, "num_tokens": 831248764.0, "step": 21783 }, { "epoch": 2.771148708815672, "ewc_loss": 0.07430397719144821, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003792702336795628, "grad_norm": 8.789775848388672, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8721156120300293, "num_tokens": 831284140.0, "step": 21784 }, { "epoch": 2.771275919094263, "ewc_loss": 0.07527007162570953, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003889311628881842, "grad_norm": 9.045367240905762, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8699162006378174, "num_tokens": 831318567.0, "step": 21785 }, { "epoch": 2.771403129372853, "ewc_loss": 0.07451826333999634, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003814130905084312, "grad_norm": 8.8547945022583, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8593652248382568, "num_tokens": 831357128.0, "step": 21786 }, { "epoch": 2.7715303396514437, "ewc_loss": 0.07512739300727844, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003875043767038733, "grad_norm": 9.00745677947998, "learning_rate": 1e-06, "loss": 0.5388, "mean_token_accuracy": 0.8438668251037598, "num_tokens": 831391551.0, "step": 21787 }, { "epoch": 2.771657549930034, "ewc_loss": 0.07439245283603668, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003801550483331084, "grad_norm": 8.820710182189941, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8625777959823608, "num_tokens": 831430332.0, "step": 21788 }, { "epoch": 2.7717847602086247, "ewc_loss": 0.07521393895149231, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003883698082063347, "grad_norm": 8.996817588806152, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.870171308517456, "num_tokens": 831471639.0, "step": 21789 }, { "epoch": 2.7719119704872153, "ewc_loss": 0.07435618340969086, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037979232729412615, "grad_norm": 8.819801330566406, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8699766993522644, "num_tokens": 831506470.0, "step": 21790 }, { "epoch": 2.772039180765806, "ewc_loss": 0.07506397366523743, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003868702333420515, "grad_norm": 8.969449996948242, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8592997789382935, "num_tokens": 831547212.0, "step": 21791 }, { "epoch": 2.7721663910443963, "ewc_loss": 0.07441239804029465, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003803544386755675, "grad_norm": 8.846348762512207, "learning_rate": 1e-06, "loss": 0.5062, "mean_token_accuracy": 0.8513752222061157, "num_tokens": 831593181.0, "step": 21792 }, { "epoch": 2.772293601322987, "ewc_loss": 0.07496905326843262, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003859210410155356, "grad_norm": 8.961435317993164, "learning_rate": 1e-06, "loss": 0.4041, "mean_token_accuracy": 0.8812816143035889, "num_tokens": 831624889.0, "step": 21793 }, { "epoch": 2.7724208116015774, "ewc_loss": 0.07457484304904938, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003819788689725101, "grad_norm": 8.846324920654297, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8717254996299744, "num_tokens": 831662389.0, "step": 21794 }, { "epoch": 2.772548021880168, "ewc_loss": 0.07480892539024353, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003843197482638061, "grad_norm": 8.921941757202148, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8678412437438965, "num_tokens": 831705282.0, "step": 21795 }, { "epoch": 2.7726752321587584, "ewc_loss": 0.07454460859298706, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038167659658938646, "grad_norm": 12.592667579650879, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8734031915664673, "num_tokens": 831741604.0, "step": 21796 }, { "epoch": 2.772802442437349, "ewc_loss": 0.07712332904338837, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00040746378363110125, "grad_norm": 8.9874906539917, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.855253279209137, "num_tokens": 831784165.0, "step": 21797 }, { "epoch": 2.7729296527159395, "ewc_loss": 0.07785211503505707, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004147516447119415, "grad_norm": 9.563413619995117, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8699800372123718, "num_tokens": 831821101.0, "step": 21798 }, { "epoch": 2.77305686299453, "ewc_loss": 0.07419297099113464, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000378160213585943, "grad_norm": 8.785115242004395, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8685110211372375, "num_tokens": 831855766.0, "step": 21799 }, { "epoch": 2.7731840732731206, "ewc_loss": 0.0782603919506073, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00041883435915224254, "grad_norm": 9.530367851257324, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8705317974090576, "num_tokens": 831895692.0, "step": 21800 }, { "epoch": 2.773311283551711, "ewc_loss": 0.0746111124753952, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038234159001149237, "grad_norm": 8.915271759033203, "learning_rate": 1e-06, "loss": 0.3989, "mean_token_accuracy": 0.8841111063957214, "num_tokens": 831931079.0, "step": 21801 }, { "epoch": 2.7734384938303016, "ewc_loss": 0.0770220160484314, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000406450591981411, "grad_norm": 9.357177734375, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.838358998298645, "num_tokens": 831969742.0, "step": 21802 }, { "epoch": 2.773565704108892, "ewc_loss": 0.07475084066390991, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003837388358078897, "grad_norm": 8.945072174072266, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.855424165725708, "num_tokens": 832011059.0, "step": 21803 }, { "epoch": 2.7736929143874827, "ewc_loss": 0.07628721743822098, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039910266059450805, "grad_norm": 9.310193061828613, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8759407997131348, "num_tokens": 832042768.0, "step": 21804 }, { "epoch": 2.7738201246660728, "ewc_loss": 0.07475127279758453, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038374317227862775, "grad_norm": 8.919883728027344, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.873397946357727, "num_tokens": 832077699.0, "step": 21805 }, { "epoch": 2.7739473349446637, "ewc_loss": 0.07597564160823822, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039598686271347106, "grad_norm": 9.224011421203613, "learning_rate": 1e-06, "loss": 0.4383, "mean_token_accuracy": 0.8740551471710205, "num_tokens": 832113528.0, "step": 21806 }, { "epoch": 2.774074545223254, "ewc_loss": 0.07472112029790878, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038344168569892645, "grad_norm": 8.940411567687988, "learning_rate": 1e-06, "loss": 0.4435, "mean_token_accuracy": 0.8692394495010376, "num_tokens": 832149108.0, "step": 21807 }, { "epoch": 2.774201755501845, "ewc_loss": 0.07547961175441742, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039102655136957765, "grad_norm": 9.119796752929688, "learning_rate": 1e-06, "loss": 0.4171, "mean_token_accuracy": 0.8827917575836182, "num_tokens": 832189279.0, "step": 21808 }, { "epoch": 2.774328965780435, "ewc_loss": 0.07475206255912781, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000383751088520512, "grad_norm": 8.956002235412598, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8679807186126709, "num_tokens": 832224423.0, "step": 21809 }, { "epoch": 2.774456176059026, "ewc_loss": 0.07503963261842728, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003866268089041114, "grad_norm": 9.02376651763916, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8602171540260315, "num_tokens": 832260389.0, "step": 21810 }, { "epoch": 2.774583386337616, "ewc_loss": 0.07477743923664093, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003840048739220947, "grad_norm": 8.9469633102417, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8645034432411194, "num_tokens": 832297591.0, "step": 21811 }, { "epoch": 2.7747105966162064, "ewc_loss": 0.07485288381576538, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003847593325190246, "grad_norm": 8.975456237792969, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8688787221908569, "num_tokens": 832330195.0, "step": 21812 }, { "epoch": 2.774837806894797, "ewc_loss": 0.07471808046102524, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038341127219609916, "grad_norm": 8.889734268188477, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8717634081840515, "num_tokens": 832361613.0, "step": 21813 }, { "epoch": 2.7749650171733875, "ewc_loss": 0.07496477663516998, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038587828748859465, "grad_norm": 8.947425842285156, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.87129145860672, "num_tokens": 832402800.0, "step": 21814 }, { "epoch": 2.775092227451978, "ewc_loss": 0.07464006543159485, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038263111491687596, "grad_norm": 8.897870063781738, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8674923777580261, "num_tokens": 832445161.0, "step": 21815 }, { "epoch": 2.7752194377305686, "ewc_loss": 0.0746629536151886, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038285995833575726, "grad_norm": 8.888041496276855, "learning_rate": 1e-06, "loss": 0.5035, "mean_token_accuracy": 0.8512238264083862, "num_tokens": 832487905.0, "step": 21816 }, { "epoch": 2.775346648009159, "ewc_loss": 0.0747433751821518, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003836642426904291, "grad_norm": 8.902364730834961, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8659931421279907, "num_tokens": 832521107.0, "step": 21817 }, { "epoch": 2.7754738582877496, "ewc_loss": 0.07460284233093262, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038225893513299525, "grad_norm": 8.883081436157227, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8628152012825012, "num_tokens": 832557213.0, "step": 21818 }, { "epoch": 2.77560106856634, "ewc_loss": 0.07489656656980515, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038519612280651927, "grad_norm": 8.929561614990234, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8527899980545044, "num_tokens": 832600445.0, "step": 21819 }, { "epoch": 2.7757282788449307, "ewc_loss": 0.07464064657688141, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038263690657913685, "grad_norm": 8.89196491241455, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8656592965126038, "num_tokens": 832635994.0, "step": 21820 }, { "epoch": 2.775855489123521, "ewc_loss": 0.07473546266555786, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038358510937541723, "grad_norm": 8.929320335388184, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8765841126441956, "num_tokens": 832677151.0, "step": 21821 }, { "epoch": 2.7759826994021117, "ewc_loss": 0.0747327208518982, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000383557693567127, "grad_norm": 8.88495922088623, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8651732206344604, "num_tokens": 832713995.0, "step": 21822 }, { "epoch": 2.7761099096807023, "ewc_loss": 0.07468453049659729, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038307576323859394, "grad_norm": 8.919868469238281, "learning_rate": 1e-06, "loss": 0.4847, "mean_token_accuracy": 0.8565635681152344, "num_tokens": 832754201.0, "step": 21823 }, { "epoch": 2.776237119959293, "ewc_loss": 0.07451094686985016, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003813398943748325, "grad_norm": 8.911112785339355, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8616474270820618, "num_tokens": 832796920.0, "step": 21824 }, { "epoch": 2.7763643302378833, "ewc_loss": 0.07475276291370392, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003837581316474825, "grad_norm": 8.885921478271484, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8613759279251099, "num_tokens": 832835952.0, "step": 21825 }, { "epoch": 2.776491540516474, "ewc_loss": 0.07467316091060638, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003829621127806604, "grad_norm": 8.897544860839844, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8689246773719788, "num_tokens": 832878306.0, "step": 21826 }, { "epoch": 2.7766187507950644, "ewc_loss": 0.07465650141239166, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038279552245512605, "grad_norm": 8.89382553100586, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8571562767028809, "num_tokens": 832914315.0, "step": 21827 }, { "epoch": 2.776745961073655, "ewc_loss": 0.07477696239948273, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003840000426862389, "grad_norm": 8.905167579650879, "learning_rate": 1e-06, "loss": 0.5099, "mean_token_accuracy": 0.8631293773651123, "num_tokens": 832956661.0, "step": 21828 }, { "epoch": 2.7768731713522454, "ewc_loss": 0.07474331557750702, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003836636315099895, "grad_norm": 8.937795639038086, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.8724778294563293, "num_tokens": 832999446.0, "step": 21829 }, { "epoch": 2.7770003816308355, "ewc_loss": 0.07460074126720428, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038223789306357503, "grad_norm": 8.953567504882812, "learning_rate": 1e-06, "loss": 0.5411, "mean_token_accuracy": 0.8391751646995544, "num_tokens": 833039001.0, "step": 21830 }, { "epoch": 2.7771275919094265, "ewc_loss": 0.0748007670044899, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003842381411232054, "grad_norm": 8.957063674926758, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8643913269042969, "num_tokens": 833074334.0, "step": 21831 }, { "epoch": 2.7772548021880166, "ewc_loss": 0.0746205598115921, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003824360901489854, "grad_norm": 8.950167655944824, "learning_rate": 1e-06, "loss": 0.5049, "mean_token_accuracy": 0.8531079292297363, "num_tokens": 833113089.0, "step": 21832 }, { "epoch": 2.7773820124666075, "ewc_loss": 0.07466018199920654, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003828322805929929, "grad_norm": 8.90492057800293, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8627125024795532, "num_tokens": 833149755.0, "step": 21833 }, { "epoch": 2.7775092227451976, "ewc_loss": 0.07463181763887405, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003825486346613616, "grad_norm": 8.943010330200195, "learning_rate": 1e-06, "loss": 0.5085, "mean_token_accuracy": 0.8559641242027283, "num_tokens": 833187718.0, "step": 21834 }, { "epoch": 2.7776364330237886, "ewc_loss": 0.07464183866977692, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003826488391496241, "grad_norm": 8.910292625427246, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8711789846420288, "num_tokens": 833221416.0, "step": 21835 }, { "epoch": 2.7777636433023787, "ewc_loss": 0.07469852268695831, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038321566535159945, "grad_norm": 8.90802001953125, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8804935216903687, "num_tokens": 833261101.0, "step": 21836 }, { "epoch": 2.777890853580969, "ewc_loss": 0.07453593611717224, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038158983807079494, "grad_norm": 8.853662490844727, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8739233016967773, "num_tokens": 833296804.0, "step": 21837 }, { "epoch": 2.7780180638595597, "ewc_loss": 0.07473107427358627, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003835412207990885, "grad_norm": 8.909430503845215, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8667936325073242, "num_tokens": 833332005.0, "step": 21838 }, { "epoch": 2.7781452741381503, "ewc_loss": 0.07460857927799225, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003823162114713341, "grad_norm": 8.877571105957031, "learning_rate": 1e-06, "loss": 0.4771, "mean_token_accuracy": 0.8610295057296753, "num_tokens": 833371991.0, "step": 21839 }, { "epoch": 2.778272484416741, "ewc_loss": 0.07477110624313354, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003839415730908513, "grad_norm": 8.924469947814941, "learning_rate": 1e-06, "loss": 0.4286, "mean_token_accuracy": 0.8779085874557495, "num_tokens": 833407514.0, "step": 21840 }, { "epoch": 2.7783996946953313, "ewc_loss": 0.07464048266410828, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003826352476608008, "grad_norm": 8.887839317321777, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8636473417282104, "num_tokens": 833444519.0, "step": 21841 }, { "epoch": 2.778526904973922, "ewc_loss": 0.0747339054942131, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003835695388261229, "grad_norm": 8.87942123413086, "learning_rate": 1e-06, "loss": 0.4792, "mean_token_accuracy": 0.8612385988235474, "num_tokens": 833482252.0, "step": 21842 }, { "epoch": 2.7786541152525124, "ewc_loss": 0.07472015172243118, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038343199412338436, "grad_norm": 8.84504222869873, "learning_rate": 1e-06, "loss": 0.4723, "mean_token_accuracy": 0.8659937381744385, "num_tokens": 833515513.0, "step": 21843 }, { "epoch": 2.778781325531103, "ewc_loss": 0.07477792352437973, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003840097051579505, "grad_norm": 8.95959758758545, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8727893233299255, "num_tokens": 833551188.0, "step": 21844 }, { "epoch": 2.7789085358096934, "ewc_loss": 0.07465144991874695, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038274499820545316, "grad_norm": 8.824088096618652, "learning_rate": 1e-06, "loss": 0.4298, "mean_token_accuracy": 0.8742067813873291, "num_tokens": 833589159.0, "step": 21845 }, { "epoch": 2.779035746088284, "ewc_loss": 0.07493182271718979, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038554868660867214, "grad_norm": 8.978981018066406, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8553947806358337, "num_tokens": 833623581.0, "step": 21846 }, { "epoch": 2.7791629563668745, "ewc_loss": 0.07440247386693954, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003802552237175405, "grad_norm": 8.822519302368164, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8533980250358582, "num_tokens": 833660466.0, "step": 21847 }, { "epoch": 2.779290166645465, "ewc_loss": 0.07494133710861206, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003856438270304352, "grad_norm": 8.876184463500977, "learning_rate": 1e-06, "loss": 0.507, "mean_token_accuracy": 0.855230450630188, "num_tokens": 833697465.0, "step": 21848 }, { "epoch": 2.7794173769240555, "ewc_loss": 0.07457821071147919, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000382012571208179, "grad_norm": 8.848624229431152, "learning_rate": 1e-06, "loss": 0.4626, "mean_token_accuracy": 0.8627430200576782, "num_tokens": 833735178.0, "step": 21849 }, { "epoch": 2.779544587202646, "ewc_loss": 0.07482172548770905, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003844476887024939, "grad_norm": 8.995949745178223, "learning_rate": 1e-06, "loss": 0.4143, "mean_token_accuracy": 0.8809984922409058, "num_tokens": 833770193.0, "step": 21850 }, { "epoch": 2.7796717974812366, "ewc_loss": 0.07456376403570175, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038186809979379177, "grad_norm": 8.857132911682129, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8663071393966675, "num_tokens": 833807836.0, "step": 21851 }, { "epoch": 2.779799007759827, "ewc_loss": 0.0747205913066864, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003834364179056138, "grad_norm": 9.018599510192871, "learning_rate": 1e-06, "loss": 0.4697, "mean_token_accuracy": 0.8617123961448669, "num_tokens": 833847605.0, "step": 21852 }, { "epoch": 2.7799262180384177, "ewc_loss": 0.07432827353477478, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003795132215600461, "grad_norm": 8.768058776855469, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.868443489074707, "num_tokens": 833884239.0, "step": 21853 }, { "epoch": 2.780053428317008, "ewc_loss": 0.07508057355880737, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038703615427948534, "grad_norm": 8.981329917907715, "learning_rate": 1e-06, "loss": 0.4758, "mean_token_accuracy": 0.8650592565536499, "num_tokens": 833921305.0, "step": 21854 }, { "epoch": 2.7801806385955983, "ewc_loss": 0.07428872585296631, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037911770050413907, "grad_norm": 8.80980396270752, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8565958738327026, "num_tokens": 833959297.0, "step": 21855 }, { "epoch": 2.7803078488741892, "ewc_loss": 0.07496251910924911, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003858556447084993, "grad_norm": 9.00580883026123, "learning_rate": 1e-06, "loss": 0.5455, "mean_token_accuracy": 0.8410078287124634, "num_tokens": 833992135.0, "step": 21856 }, { "epoch": 2.7804350591527793, "ewc_loss": 0.07427837699651718, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003790142363868654, "grad_norm": 8.773056983947754, "learning_rate": 1e-06, "loss": 0.4979, "mean_token_accuracy": 0.8552644848823547, "num_tokens": 834035933.0, "step": 21857 }, { "epoch": 2.7805622694313703, "ewc_loss": 0.0750681459903717, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003869118809234351, "grad_norm": 8.954445838928223, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8643202781677246, "num_tokens": 834070187.0, "step": 21858 }, { "epoch": 2.7806894797099604, "ewc_loss": 0.07434085756540298, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037963903741911054, "grad_norm": 8.795437812805176, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8675690293312073, "num_tokens": 834107370.0, "step": 21859 }, { "epoch": 2.780816689988551, "ewc_loss": 0.0749816745519638, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038604726432822645, "grad_norm": 8.94455623626709, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8567972183227539, "num_tokens": 834145357.0, "step": 21860 }, { "epoch": 2.7809439002671414, "ewc_loss": 0.07423434406518936, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000378573895432055, "grad_norm": 8.7298002243042, "learning_rate": 1e-06, "loss": 0.465, "mean_token_accuracy": 0.8629429340362549, "num_tokens": 834187250.0, "step": 21861 }, { "epoch": 2.781071110545732, "ewc_loss": 0.07500220090150833, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003862524754367769, "grad_norm": 8.919475555419922, "learning_rate": 1e-06, "loss": 0.4318, "mean_token_accuracy": 0.876408040523529, "num_tokens": 834227598.0, "step": 21862 }, { "epoch": 2.7811983208243225, "ewc_loss": 0.07448878884315491, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003811183851212263, "grad_norm": 8.852933883666992, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8802323341369629, "num_tokens": 834267032.0, "step": 21863 }, { "epoch": 2.781325531102913, "ewc_loss": 0.07478061318397522, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038403665530495346, "grad_norm": 8.881824493408203, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.854426383972168, "num_tokens": 834308479.0, "step": 21864 }, { "epoch": 2.7814527413815036, "ewc_loss": 0.07460072636604309, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038223774754442275, "grad_norm": 8.805289268493652, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8748244047164917, "num_tokens": 834346185.0, "step": 21865 }, { "epoch": 2.781579951660094, "ewc_loss": 0.07472231984138489, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003834536182694137, "grad_norm": 8.84337043762207, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8747233152389526, "num_tokens": 834384404.0, "step": 21866 }, { "epoch": 2.7817071619386846, "ewc_loss": 0.07469820976257324, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003832126094494015, "grad_norm": 8.797755241394043, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8644013404846191, "num_tokens": 834424833.0, "step": 21867 }, { "epoch": 2.781834372217275, "ewc_loss": 0.07479815185070038, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003842120058834553, "grad_norm": 8.868559837341309, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8664670586585999, "num_tokens": 834464215.0, "step": 21868 }, { "epoch": 2.7819615824958657, "ewc_loss": 0.07461744546890259, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003824049490503967, "grad_norm": 8.829866409301758, "learning_rate": 1e-06, "loss": 0.5427, "mean_token_accuracy": 0.8431247472763062, "num_tokens": 834502452.0, "step": 21869 }, { "epoch": 2.782088792774456, "ewc_loss": 0.07478352636098862, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038406573003157973, "grad_norm": 8.804346084594727, "learning_rate": 1e-06, "loss": 0.4592, "mean_token_accuracy": 0.8658099174499512, "num_tokens": 834539690.0, "step": 21870 }, { "epoch": 2.7822160030530467, "ewc_loss": 0.0748041421175003, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003842719306703657, "grad_norm": 8.874625205993652, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8543187379837036, "num_tokens": 834583193.0, "step": 21871 }, { "epoch": 2.7823432133316373, "ewc_loss": 0.07470455765724182, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003832760557997972, "grad_norm": 8.79478931427002, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.8602532148361206, "num_tokens": 834620028.0, "step": 21872 }, { "epoch": 2.782470423610228, "ewc_loss": 0.07505552470684052, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038678571581840515, "grad_norm": 8.876380920410156, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.8544549942016602, "num_tokens": 834657351.0, "step": 21873 }, { "epoch": 2.7825976338888183, "ewc_loss": 0.0746137797832489, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038236824912019074, "grad_norm": 8.81986141204834, "learning_rate": 1e-06, "loss": 0.401, "mean_token_accuracy": 0.8801649808883667, "num_tokens": 834693687.0, "step": 21874 }, { "epoch": 2.782724844167409, "ewc_loss": 0.07500346004962921, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038626501918770373, "grad_norm": 8.844371795654297, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8640357255935669, "num_tokens": 834736171.0, "step": 21875 }, { "epoch": 2.7828520544459994, "ewc_loss": 0.0747884064912796, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038411456625908613, "grad_norm": 8.820385932922363, "learning_rate": 1e-06, "loss": 0.4222, "mean_token_accuracy": 0.8786542415618896, "num_tokens": 834775699.0, "step": 21876 }, { "epoch": 2.78297926472459, "ewc_loss": 0.07509593665599823, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038474847679026425, "grad_norm": 8.865666389465332, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.858962893486023, "num_tokens": 834816053.0, "step": 21877 }, { "epoch": 2.78310647500318, "ewc_loss": 0.07480806857347488, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038431116263382137, "grad_norm": 8.809167861938477, "learning_rate": 1e-06, "loss": 0.4844, "mean_token_accuracy": 0.8608560562133789, "num_tokens": 834857125.0, "step": 21878 }, { "epoch": 2.783233685281771, "ewc_loss": 0.07493546605110168, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003855851828120649, "grad_norm": 8.858160972595215, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8777419328689575, "num_tokens": 834895676.0, "step": 21879 }, { "epoch": 2.783360895560361, "ewc_loss": 0.07480604946613312, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038429093547165394, "grad_norm": 8.855799674987793, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8696314096450806, "num_tokens": 834927748.0, "step": 21880 }, { "epoch": 2.783488105838952, "ewc_loss": 0.07484555244445801, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003846860199701041, "grad_norm": 8.828606605529785, "learning_rate": 1e-06, "loss": 0.487, "mean_token_accuracy": 0.8619885444641113, "num_tokens": 834966722.0, "step": 21881 }, { "epoch": 2.783615316117542, "ewc_loss": 0.07500289380550385, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003862594021484256, "grad_norm": 8.898015975952148, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.866275429725647, "num_tokens": 835003770.0, "step": 21882 }, { "epoch": 2.783742526396133, "ewc_loss": 0.07477481663227081, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038397868047468364, "grad_norm": 8.831868171691895, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8743865489959717, "num_tokens": 835041504.0, "step": 21883 }, { "epoch": 2.783869736674723, "ewc_loss": 0.07501788437366486, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003864092577714473, "grad_norm": 8.93908977508545, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8673512935638428, "num_tokens": 835075179.0, "step": 21884 }, { "epoch": 2.7839969469533137, "ewc_loss": 0.07454624772071838, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003816929820459336, "grad_norm": 8.79904556274414, "learning_rate": 1e-06, "loss": 0.5287, "mean_token_accuracy": 0.8556334376335144, "num_tokens": 835113080.0, "step": 21885 }, { "epoch": 2.784124157231904, "ewc_loss": 0.07507943361997604, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003870248037856072, "grad_norm": 8.90631103515625, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8656800985336304, "num_tokens": 835150586.0, "step": 21886 }, { "epoch": 2.7842513675104947, "ewc_loss": 0.07462653517723083, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038249578210525215, "grad_norm": 8.835247039794922, "learning_rate": 1e-06, "loss": 0.4774, "mean_token_accuracy": 0.8616513013839722, "num_tokens": 835184049.0, "step": 21887 }, { "epoch": 2.7843785777890853, "ewc_loss": 0.0749647319316864, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003858777927234769, "grad_norm": 8.872350692749023, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8611356616020203, "num_tokens": 835222745.0, "step": 21888 }, { "epoch": 2.784505788067676, "ewc_loss": 0.07477164268493652, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000383946840884164, "grad_norm": 8.798990249633789, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8529440760612488, "num_tokens": 835263017.0, "step": 21889 }, { "epoch": 2.7846329983462663, "ewc_loss": 0.07533597946166992, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003871488734148443, "grad_norm": 8.921611785888672, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8795837759971619, "num_tokens": 835303916.0, "step": 21890 }, { "epoch": 2.784760208624857, "ewc_loss": 0.0747012048959732, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003832424699794501, "grad_norm": 8.812138557434082, "learning_rate": 1e-06, "loss": 0.4243, "mean_token_accuracy": 0.8803819417953491, "num_tokens": 835343148.0, "step": 21891 }, { "epoch": 2.7848874189034474, "ewc_loss": 0.07507939636707306, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038702445453964174, "grad_norm": 8.916561126708984, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.8629310131072998, "num_tokens": 835382337.0, "step": 21892 }, { "epoch": 2.785014629182038, "ewc_loss": 0.07458730041980743, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003821035206783563, "grad_norm": 8.799295425415039, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8598250150680542, "num_tokens": 835423645.0, "step": 21893 }, { "epoch": 2.7851418394606284, "ewc_loss": 0.07491080462932587, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038533846964128315, "grad_norm": 8.859067916870117, "learning_rate": 1e-06, "loss": 0.5478, "mean_token_accuracy": 0.8353205919265747, "num_tokens": 835464590.0, "step": 21894 }, { "epoch": 2.785269049739219, "ewc_loss": 0.07484845817089081, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003847150655928999, "grad_norm": 8.877937316894531, "learning_rate": 1e-06, "loss": 0.3933, "mean_token_accuracy": 0.8902052640914917, "num_tokens": 835496376.0, "step": 21895 }, { "epoch": 2.7853962600178095, "ewc_loss": 0.07483860850334167, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003846165491268039, "grad_norm": 8.84595775604248, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8501495718955994, "num_tokens": 835541231.0, "step": 21896 }, { "epoch": 2.7855234702964, "ewc_loss": 0.0747857466340065, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003840879362542182, "grad_norm": 8.769694328308105, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8684266805648804, "num_tokens": 835584038.0, "step": 21897 }, { "epoch": 2.7856506805749905, "ewc_loss": 0.07497662305831909, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038599674007855356, "grad_norm": 8.881847381591797, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.860041618347168, "num_tokens": 835627189.0, "step": 21898 }, { "epoch": 2.785777890853581, "ewc_loss": 0.0746966153383255, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038319663144648075, "grad_norm": 8.990939140319824, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8649677634239197, "num_tokens": 835659701.0, "step": 21899 }, { "epoch": 2.7859051011321716, "ewc_loss": 0.07464183866977692, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038264886825345457, "grad_norm": 8.93801212310791, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8719186782836914, "num_tokens": 835694045.0, "step": 21900 }, { "epoch": 2.786032311410762, "ewc_loss": 0.07479122281074524, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038414268055930734, "grad_norm": 8.881570816040039, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.862866997718811, "num_tokens": 835732118.0, "step": 21901 }, { "epoch": 2.7861595216893527, "ewc_loss": 0.07461950182914734, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003824254381470382, "grad_norm": 8.81564998626709, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8747490048408508, "num_tokens": 835772700.0, "step": 21902 }, { "epoch": 2.7862867319679427, "ewc_loss": 0.07484740018844604, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003847044426947832, "grad_norm": 8.963701248168945, "learning_rate": 1e-06, "loss": 0.5319, "mean_token_accuracy": 0.8485866785049438, "num_tokens": 835811206.0, "step": 21903 }, { "epoch": 2.7864139422465337, "ewc_loss": 0.07450391352176666, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038126957952044904, "grad_norm": 8.915144920349121, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.870334267616272, "num_tokens": 835844381.0, "step": 21904 }, { "epoch": 2.786541152525124, "ewc_loss": 0.07474382221698761, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003836686664726585, "grad_norm": 8.908486366271973, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8576151132583618, "num_tokens": 835879926.0, "step": 21905 }, { "epoch": 2.7866683628037148, "ewc_loss": 0.0745687335729599, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038191783824004233, "grad_norm": 8.831622123718262, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8532499074935913, "num_tokens": 835924014.0, "step": 21906 }, { "epoch": 2.786795573082305, "ewc_loss": 0.074785515666008, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003840856661554426, "grad_norm": 8.891998291015625, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8584298491477966, "num_tokens": 835956054.0, "step": 21907 }, { "epoch": 2.786922783360896, "ewc_loss": 0.07456164062023163, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003818468831013888, "grad_norm": 8.898051261901855, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8712217211723328, "num_tokens": 835988885.0, "step": 21908 }, { "epoch": 2.787049993639486, "ewc_loss": 0.07470706850290298, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038330117240548134, "grad_norm": 8.883675575256348, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8643834590911865, "num_tokens": 836028885.0, "step": 21909 }, { "epoch": 2.7871772039180764, "ewc_loss": 0.0746263787150383, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003824942687060684, "grad_norm": 8.84145450592041, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8781025409698486, "num_tokens": 836067340.0, "step": 21910 }, { "epoch": 2.787304414196667, "ewc_loss": 0.07472941279411316, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003835245734080672, "grad_norm": 8.897843360900879, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8648303747177124, "num_tokens": 836104432.0, "step": 21911 }, { "epoch": 2.7874316244752575, "ewc_loss": 0.07450299710035324, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038126044091768563, "grad_norm": 8.830523490905762, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8609746694564819, "num_tokens": 836141197.0, "step": 21912 }, { "epoch": 2.787558834753848, "ewc_loss": 0.07489505410194397, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003851810179185122, "grad_norm": 8.837661743164062, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8771636486053467, "num_tokens": 836176949.0, "step": 21913 }, { "epoch": 2.7876860450324386, "ewc_loss": 0.07467831671237946, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038301365566439927, "grad_norm": 8.897640228271484, "learning_rate": 1e-06, "loss": 0.4831, "mean_token_accuracy": 0.8635642528533936, "num_tokens": 836206484.0, "step": 21914 }, { "epoch": 2.787813255311029, "ewc_loss": 0.07456618547439575, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038189234328456223, "grad_norm": 8.82931137084961, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8708392381668091, "num_tokens": 836243056.0, "step": 21915 }, { "epoch": 2.7879404655896196, "ewc_loss": 0.07487204670906067, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003849509230349213, "grad_norm": 8.847331047058105, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8600334525108337, "num_tokens": 836281154.0, "step": 21916 }, { "epoch": 2.78806767586821, "ewc_loss": 0.07459760457277298, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038220651913434267, "grad_norm": 8.787758827209473, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8645812273025513, "num_tokens": 836319895.0, "step": 21917 }, { "epoch": 2.7881948861468007, "ewc_loss": 0.07512407004833221, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003850297653116286, "grad_norm": 8.85904598236084, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8708504438400269, "num_tokens": 836357227.0, "step": 21918 }, { "epoch": 2.788322096425391, "ewc_loss": 0.07468703389167786, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003831007925327867, "grad_norm": 8.859541893005371, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8570336103439331, "num_tokens": 836398442.0, "step": 21919 }, { "epoch": 2.7884493067039817, "ewc_loss": 0.0748668983578682, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003848994383588433, "grad_norm": 8.841347694396973, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8726983070373535, "num_tokens": 836436660.0, "step": 21920 }, { "epoch": 2.7885765169825723, "ewc_loss": 0.07471705973148346, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003834010858554393, "grad_norm": 8.881684303283691, "learning_rate": 1e-06, "loss": 0.4987, "mean_token_accuracy": 0.8593622446060181, "num_tokens": 836479180.0, "step": 21921 }, { "epoch": 2.788703727261163, "ewc_loss": 0.07454803586006165, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038171085179783404, "grad_norm": 8.788941383361816, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8719500303268433, "num_tokens": 836518874.0, "step": 21922 }, { "epoch": 2.7888309375397533, "ewc_loss": 0.07499366998672485, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003861671721097082, "grad_norm": 9.022744178771973, "learning_rate": 1e-06, "loss": 0.4775, "mean_token_accuracy": 0.8601952791213989, "num_tokens": 836556910.0, "step": 21923 }, { "epoch": 2.788958147818344, "ewc_loss": 0.07450773566961288, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003788664180319756, "grad_norm": 8.70753288269043, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8770607709884644, "num_tokens": 836598151.0, "step": 21924 }, { "epoch": 2.7890853580969344, "ewc_loss": 0.07523679733276367, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038859841879457235, "grad_norm": 8.959061622619629, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.8574045896530151, "num_tokens": 836637952.0, "step": 21925 }, { "epoch": 2.789212568375525, "ewc_loss": 0.07425730675458908, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003788035537581891, "grad_norm": 8.681815147399902, "learning_rate": 1e-06, "loss": 0.4213, "mean_token_accuracy": 0.8788244724273682, "num_tokens": 836677138.0, "step": 21926 }, { "epoch": 2.7893397786541154, "ewc_loss": 0.07560563832521439, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003898454597219825, "grad_norm": 9.012144088745117, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8715804815292358, "num_tokens": 836716880.0, "step": 21927 }, { "epoch": 2.7894669889327055, "ewc_loss": 0.07417618483304977, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003779923135880381, "grad_norm": 8.70453929901123, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8794639110565186, "num_tokens": 836752572.0, "step": 21928 }, { "epoch": 2.7895941992112965, "ewc_loss": 0.07528108358383179, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003890413499902934, "grad_norm": 8.896344184875488, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8607907295227051, "num_tokens": 836794321.0, "step": 21929 }, { "epoch": 2.7897214094898866, "ewc_loss": 0.07445931434631348, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038082365063019097, "grad_norm": 8.74888801574707, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8789588809013367, "num_tokens": 836829183.0, "step": 21930 }, { "epoch": 2.7898486197684775, "ewc_loss": 0.07527609169483185, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038899140781722963, "grad_norm": 8.991595268249512, "learning_rate": 1e-06, "loss": 0.5072, "mean_token_accuracy": 0.8531209230422974, "num_tokens": 836864644.0, "step": 21931 }, { "epoch": 2.7899758300470676, "ewc_loss": 0.07452163100242615, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003814467345364392, "grad_norm": 8.76162338256836, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8774461150169373, "num_tokens": 836895384.0, "step": 21932 }, { "epoch": 2.7901030403256586, "ewc_loss": 0.07531915605068207, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000389421998988837, "grad_norm": 8.941749572753906, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8682222962379456, "num_tokens": 836943336.0, "step": 21933 }, { "epoch": 2.7902302506042487, "ewc_loss": 0.07458911836147308, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038212165236473083, "grad_norm": 8.767050743103027, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8803071975708008, "num_tokens": 836980708.0, "step": 21934 }, { "epoch": 2.790357460882839, "ewc_loss": 0.0757455825805664, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003912449174094945, "grad_norm": 8.971796989440918, "learning_rate": 1e-06, "loss": 0.4595, "mean_token_accuracy": 0.8699795007705688, "num_tokens": 837015738.0, "step": 21935 }, { "epoch": 2.7904846711614297, "ewc_loss": 0.07456375658512115, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038186804158613086, "grad_norm": 8.839500427246094, "learning_rate": 1e-06, "loss": 0.4411, "mean_token_accuracy": 0.8726770877838135, "num_tokens": 837053014.0, "step": 21936 }, { "epoch": 2.7906118814400203, "ewc_loss": 0.07508059591054916, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038703641621395946, "grad_norm": 8.922921180725098, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8617901802062988, "num_tokens": 837083579.0, "step": 21937 }, { "epoch": 2.790739091718611, "ewc_loss": 0.07466641068458557, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003828945627901703, "grad_norm": 8.76002311706543, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8753812313079834, "num_tokens": 837126201.0, "step": 21938 }, { "epoch": 2.7908663019972013, "ewc_loss": 0.07506198436021805, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038685029721818864, "grad_norm": 8.929617881774902, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.8745748996734619, "num_tokens": 837162400.0, "step": 21939 }, { "epoch": 2.790993512275792, "ewc_loss": 0.0745241791009903, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003814722876995802, "grad_norm": 8.765006065368652, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8699354529380798, "num_tokens": 837205733.0, "step": 21940 }, { "epoch": 2.7911207225543824, "ewc_loss": 0.07509738206863403, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038720431621186435, "grad_norm": 8.925554275512695, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.855724573135376, "num_tokens": 837243838.0, "step": 21941 }, { "epoch": 2.791247932832973, "ewc_loss": 0.07455776631832123, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038180808769539, "grad_norm": 8.792228698730469, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8602643013000488, "num_tokens": 837280936.0, "step": 21942 }, { "epoch": 2.7913751431115634, "ewc_loss": 0.07504299283027649, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000386660365620628, "grad_norm": 8.883546829223633, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8650769591331482, "num_tokens": 837321790.0, "step": 21943 }, { "epoch": 2.791502353390154, "ewc_loss": 0.07483543455600739, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003845847677439451, "grad_norm": 8.838333129882812, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8702811002731323, "num_tokens": 837362023.0, "step": 21944 }, { "epoch": 2.7916295636687445, "ewc_loss": 0.07495410740375519, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038577159284614027, "grad_norm": 8.844681739807129, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8730655908584595, "num_tokens": 837404876.0, "step": 21945 }, { "epoch": 2.791756773947335, "ewc_loss": 0.07489283382892609, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003851587534882128, "grad_norm": 8.914052963256836, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8588294386863708, "num_tokens": 837442899.0, "step": 21946 }, { "epoch": 2.7918839842259255, "ewc_loss": 0.07462020218372345, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003824324521701783, "grad_norm": 8.884644508361816, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8754611611366272, "num_tokens": 837477611.0, "step": 21947 }, { "epoch": 2.792011194504516, "ewc_loss": 0.07486802339553833, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003849106724373996, "grad_norm": 8.871564865112305, "learning_rate": 1e-06, "loss": 0.5024, "mean_token_accuracy": 0.8520509004592896, "num_tokens": 837515928.0, "step": 21948 }, { "epoch": 2.7921384047831066, "ewc_loss": 0.07478852570056915, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003841156722046435, "grad_norm": 8.863470077514648, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8659822940826416, "num_tokens": 837553042.0, "step": 21949 }, { "epoch": 2.792265615061697, "ewc_loss": 0.07486039400100708, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000384834420401603, "grad_norm": 8.867018699645996, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8728392720222473, "num_tokens": 837589749.0, "step": 21950 }, { "epoch": 2.7923928253402877, "ewc_loss": 0.07482743263244629, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003845048195216805, "grad_norm": 8.8248291015625, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.88348788022995, "num_tokens": 837631104.0, "step": 21951 }, { "epoch": 2.792520035618878, "ewc_loss": 0.07487301528453827, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003849606728181243, "grad_norm": 8.912174224853516, "learning_rate": 1e-06, "loss": 0.4333, "mean_token_accuracy": 0.8755923509597778, "num_tokens": 837668173.0, "step": 21952 }, { "epoch": 2.7926472458974683, "ewc_loss": 0.07473672926425934, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038359779864549637, "grad_norm": 8.852952003479004, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8723094463348389, "num_tokens": 837704282.0, "step": 21953 }, { "epoch": 2.7927744561760592, "ewc_loss": 0.07501381635665894, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003863686870317906, "grad_norm": 8.896586418151855, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8562888503074646, "num_tokens": 837750790.0, "step": 21954 }, { "epoch": 2.7929016664546493, "ewc_loss": 0.07485879957675934, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003848184132948518, "grad_norm": 8.878373146057129, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8627429008483887, "num_tokens": 837792041.0, "step": 21955 }, { "epoch": 2.7930288767332403, "ewc_loss": 0.07492262125015259, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003854566894005984, "grad_norm": 8.9483642578125, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8730200529098511, "num_tokens": 837818645.0, "step": 21956 }, { "epoch": 2.7931560870118304, "ewc_loss": 0.07474049925804138, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038363548810593784, "grad_norm": 8.827364921569824, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8749501705169678, "num_tokens": 837859816.0, "step": 21957 }, { "epoch": 2.793283297290421, "ewc_loss": 0.07518868148326874, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038811727426946163, "grad_norm": 8.938565254211426, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8607514500617981, "num_tokens": 837903970.0, "step": 21958 }, { "epoch": 2.7934105075690114, "ewc_loss": 0.07458323240280151, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038206277531571686, "grad_norm": 8.796551704406738, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8653560876846313, "num_tokens": 837941158.0, "step": 21959 }, { "epoch": 2.793537717847602, "ewc_loss": 0.07520829141139984, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003883133758790791, "grad_norm": 8.939767837524414, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8724380135536194, "num_tokens": 837978157.0, "step": 21960 }, { "epoch": 2.7936649281261925, "ewc_loss": 0.07459178566932678, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003821483114734292, "grad_norm": 8.77624225616455, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8769276142120361, "num_tokens": 838017871.0, "step": 21961 }, { "epoch": 2.793792138404783, "ewc_loss": 0.07526108622550964, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038884131936356425, "grad_norm": 8.999234199523926, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8789200186729431, "num_tokens": 838050521.0, "step": 21962 }, { "epoch": 2.7939193486833735, "ewc_loss": 0.07451187074184418, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038134920760057867, "grad_norm": 8.841117858886719, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8738572597503662, "num_tokens": 838084450.0, "step": 21963 }, { "epoch": 2.794046558961964, "ewc_loss": 0.07510048896074295, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038723534089513123, "grad_norm": 8.917238235473633, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8601512908935547, "num_tokens": 838121505.0, "step": 21964 }, { "epoch": 2.7941737692405546, "ewc_loss": 0.07465321570634842, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038276263512670994, "grad_norm": 8.835156440734863, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.864197850227356, "num_tokens": 838166148.0, "step": 21965 }, { "epoch": 2.794300979519145, "ewc_loss": 0.07511155307292938, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038734599365852773, "grad_norm": 8.975415229797363, "learning_rate": 1e-06, "loss": 0.4516, "mean_token_accuracy": 0.8725951910018921, "num_tokens": 838203578.0, "step": 21966 }, { "epoch": 2.7944281897977357, "ewc_loss": 0.0745803713798523, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038203413714654744, "grad_norm": 8.805325508117676, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8691367506980896, "num_tokens": 838240702.0, "step": 21967 }, { "epoch": 2.794555400076326, "ewc_loss": 0.07505854219198227, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003868158964905888, "grad_norm": 8.955065727233887, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8733159303665161, "num_tokens": 838275639.0, "step": 21968 }, { "epoch": 2.7946826103549167, "ewc_loss": 0.07462888211011887, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003825192980002612, "grad_norm": 8.834281921386719, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8680949211120605, "num_tokens": 838310045.0, "step": 21969 }, { "epoch": 2.7948098206335072, "ewc_loss": 0.0750277116894722, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038650757051073015, "grad_norm": 8.869726181030273, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.8683949708938599, "num_tokens": 838351571.0, "step": 21970 }, { "epoch": 2.7949370309120978, "ewc_loss": 0.07467252761125565, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038295573904179037, "grad_norm": 8.766972541809082, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8789224624633789, "num_tokens": 838393804.0, "step": 21971 }, { "epoch": 2.7950642411906883, "ewc_loss": 0.07508248090744019, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038705524639226496, "grad_norm": 8.91096305847168, "learning_rate": 1e-06, "loss": 0.4669, "mean_token_accuracy": 0.8673997521400452, "num_tokens": 838435516.0, "step": 21972 }, { "epoch": 2.795191451469279, "ewc_loss": 0.07474184036254883, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038364887586794794, "grad_norm": 8.906485557556152, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8727610111236572, "num_tokens": 838465425.0, "step": 21973 }, { "epoch": 2.7953186617478694, "ewc_loss": 0.0747690200805664, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038392070564441383, "grad_norm": 9.001362800598145, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8638684749603271, "num_tokens": 838512832.0, "step": 21974 }, { "epoch": 2.79544587202646, "ewc_loss": 0.0745791643857956, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003820220590569079, "grad_norm": 8.781684875488281, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.874545693397522, "num_tokens": 838552806.0, "step": 21975 }, { "epoch": 2.79557308230505, "ewc_loss": 0.07506103813648224, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038684086757712066, "grad_norm": 8.922577857971191, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8646150231361389, "num_tokens": 838588592.0, "step": 21976 }, { "epoch": 2.795700292583641, "ewc_loss": 0.0743977278470993, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000380207784473896, "grad_norm": 8.742301940917969, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8644320368766785, "num_tokens": 838629402.0, "step": 21977 }, { "epoch": 2.795827502862231, "ewc_loss": 0.07508084177970886, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038703883183188736, "grad_norm": 8.894869804382324, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8578253388404846, "num_tokens": 838676580.0, "step": 21978 }, { "epoch": 2.795954713140822, "ewc_loss": 0.07461261749267578, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003823566366918385, "grad_norm": 8.87856674194336, "learning_rate": 1e-06, "loss": 0.4808, "mean_token_accuracy": 0.8611829876899719, "num_tokens": 838715773.0, "step": 21979 }, { "epoch": 2.796081923419412, "ewc_loss": 0.07485032081604004, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003847336338367313, "grad_norm": 8.89374828338623, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8560724258422852, "num_tokens": 838750270.0, "step": 21980 }, { "epoch": 2.796209133698003, "ewc_loss": 0.07475881278514862, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038381863851100206, "grad_norm": 8.795058250427246, "learning_rate": 1e-06, "loss": 0.5283, "mean_token_accuracy": 0.8463436365127563, "num_tokens": 838794823.0, "step": 21981 }, { "epoch": 2.796336343976593, "ewc_loss": 0.0749138593673706, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003853690577670932, "grad_norm": 8.927409172058105, "learning_rate": 1e-06, "loss": 0.5124, "mean_token_accuracy": 0.8569602966308594, "num_tokens": 838828901.0, "step": 21982 }, { "epoch": 2.7964635542551837, "ewc_loss": 0.07460236549377441, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003822541330009699, "grad_norm": 8.728559494018555, "learning_rate": 1e-06, "loss": 0.4961, "mean_token_accuracy": 0.859237790107727, "num_tokens": 838872648.0, "step": 21983 }, { "epoch": 2.796590764533774, "ewc_loss": 0.0752422958612442, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038865345413796604, "grad_norm": 8.901113510131836, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8790037631988525, "num_tokens": 838905150.0, "step": 21984 }, { "epoch": 2.7967179748123647, "ewc_loss": 0.07456375658512115, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038186804158613086, "grad_norm": 8.801741600036621, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8654822707176208, "num_tokens": 838949027.0, "step": 21985 }, { "epoch": 2.7968451850909553, "ewc_loss": 0.07510881870985031, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038731866516172886, "grad_norm": 8.92934513092041, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8682146668434143, "num_tokens": 838987320.0, "step": 21986 }, { "epoch": 2.796972395369546, "ewc_loss": 0.07461497187614441, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038238015258684754, "grad_norm": 8.765403747558594, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8622062802314758, "num_tokens": 839027396.0, "step": 21987 }, { "epoch": 2.7970996056481363, "ewc_loss": 0.07513757795095444, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038760624011047184, "grad_norm": 8.848469734191895, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8722735047340393, "num_tokens": 839069779.0, "step": 21988 }, { "epoch": 2.797226815926727, "ewc_loss": 0.07478231191635132, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038405353552661836, "grad_norm": 8.909199714660645, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8721662759780884, "num_tokens": 839110919.0, "step": 21989 }, { "epoch": 2.7973540262053174, "ewc_loss": 0.0748198851943016, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003844293241854757, "grad_norm": 8.837782859802246, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8800836801528931, "num_tokens": 839143689.0, "step": 21990 }, { "epoch": 2.797481236483908, "ewc_loss": 0.07501070201396942, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000386337487725541, "grad_norm": 8.854012489318848, "learning_rate": 1e-06, "loss": 0.5282, "mean_token_accuracy": 0.846659779548645, "num_tokens": 839181274.0, "step": 21991 }, { "epoch": 2.7976084467624984, "ewc_loss": 0.07475411891937256, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003837716649286449, "grad_norm": 8.888352394104004, "learning_rate": 1e-06, "loss": 0.5158, "mean_token_accuracy": 0.8491051197052002, "num_tokens": 839217038.0, "step": 21992 }, { "epoch": 2.797735657041089, "ewc_loss": 0.07486806809902191, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038491113809868693, "grad_norm": 8.827777862548828, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8664904832839966, "num_tokens": 839260104.0, "step": 21993 }, { "epoch": 2.7978628673196795, "ewc_loss": 0.074894979596138, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038518026121892035, "grad_norm": 8.909161567687988, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.869674563407898, "num_tokens": 839294328.0, "step": 21994 }, { "epoch": 2.79799007759827, "ewc_loss": 0.07483047246932983, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038453523302450776, "grad_norm": 8.822484970092773, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8655886650085449, "num_tokens": 839326662.0, "step": 21995 }, { "epoch": 2.7981172878768605, "ewc_loss": 0.07506430149078369, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038687349297106266, "grad_norm": 8.95269775390625, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8660322427749634, "num_tokens": 839355997.0, "step": 21996 }, { "epoch": 2.798244498155451, "ewc_loss": 0.07476179301738739, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003838484117295593, "grad_norm": 8.895533561706543, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8718912601470947, "num_tokens": 839396596.0, "step": 21997 }, { "epoch": 2.7983717084340416, "ewc_loss": 0.0748056173324585, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038428668631240726, "grad_norm": 8.874011993408203, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8679393529891968, "num_tokens": 839432251.0, "step": 21998 }, { "epoch": 2.798498918712632, "ewc_loss": 0.07484255731105804, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003846560139209032, "grad_norm": 8.85022258758545, "learning_rate": 1e-06, "loss": 0.4309, "mean_token_accuracy": 0.8756543397903442, "num_tokens": 839465987.0, "step": 21999 }, { "epoch": 2.7986261289912227, "ewc_loss": 0.074909508228302, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003853255184367299, "grad_norm": 8.842991828918457, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8686344027519226, "num_tokens": 839506147.0, "step": 22000 }, { "epoch": 2.7987533392698127, "ewc_loss": 0.07487110793590546, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003849415807053447, "grad_norm": 8.905168533325195, "learning_rate": 1e-06, "loss": 0.4883, "mean_token_accuracy": 0.8575948476791382, "num_tokens": 839544620.0, "step": 22001 }, { "epoch": 2.7988805495484037, "ewc_loss": 0.07467366755008698, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003829670895356685, "grad_norm": 8.833739280700684, "learning_rate": 1e-06, "loss": 0.5031, "mean_token_accuracy": 0.8571069240570068, "num_tokens": 839583217.0, "step": 22002 }, { "epoch": 2.799007759826994, "ewc_loss": 0.07497738301753998, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003860043070744723, "grad_norm": 8.853370666503906, "learning_rate": 1e-06, "loss": 0.4145, "mean_token_accuracy": 0.8861167430877686, "num_tokens": 839621283.0, "step": 22003 }, { "epoch": 2.7991349701055848, "ewc_loss": 0.07471014559268951, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038333196425810456, "grad_norm": 8.84911060333252, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8617499470710754, "num_tokens": 839658072.0, "step": 22004 }, { "epoch": 2.799262180384175, "ewc_loss": 0.07486505806446075, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003848811029456556, "grad_norm": 8.85488224029541, "learning_rate": 1e-06, "loss": 0.5211, "mean_token_accuracy": 0.8484448194503784, "num_tokens": 839700926.0, "step": 22005 }, { "epoch": 2.799389390662766, "ewc_loss": 0.0746891051530838, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003831215144600719, "grad_norm": 8.83772087097168, "learning_rate": 1e-06, "loss": 0.4252, "mean_token_accuracy": 0.8751718997955322, "num_tokens": 839736160.0, "step": 22006 }, { "epoch": 2.799516600941356, "ewc_loss": 0.07490165531635284, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038524699630215764, "grad_norm": 8.872699737548828, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8824456930160522, "num_tokens": 839776834.0, "step": 22007 }, { "epoch": 2.7996438112199464, "ewc_loss": 0.0747150257229805, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038338074227795005, "grad_norm": 8.822244644165039, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8593552112579346, "num_tokens": 839818156.0, "step": 22008 }, { "epoch": 2.799771021498537, "ewc_loss": 0.07492540776729584, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038548457087017596, "grad_norm": 8.893020629882812, "learning_rate": 1e-06, "loss": 0.4729, "mean_token_accuracy": 0.8608243465423584, "num_tokens": 839857133.0, "step": 22009 }, { "epoch": 2.7998982317771275, "ewc_loss": 0.0746726393699646, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003829568740911782, "grad_norm": 8.857075691223145, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.8502668142318726, "num_tokens": 839893129.0, "step": 22010 }, { "epoch": 2.800025442055718, "ewc_loss": 0.07487009465694427, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038493145257234573, "grad_norm": 8.909594535827637, "learning_rate": 1e-06, "loss": 0.4501, "mean_token_accuracy": 0.8689948320388794, "num_tokens": 839924622.0, "step": 22011 }, { "epoch": 2.8001526523343085, "ewc_loss": 0.07454149425029755, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003816454263869673, "grad_norm": 8.833757400512695, "learning_rate": 1e-06, "loss": 0.4067, "mean_token_accuracy": 0.8846282958984375, "num_tokens": 839961276.0, "step": 22012 }, { "epoch": 2.800279862612899, "ewc_loss": 0.07486666738986969, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038489719736389816, "grad_norm": 8.886711120605469, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8557115793228149, "num_tokens": 840002554.0, "step": 22013 }, { "epoch": 2.8004070728914896, "ewc_loss": 0.07455232739448547, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003817537217400968, "grad_norm": 8.908778190612793, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8646720051765442, "num_tokens": 840041642.0, "step": 22014 }, { "epoch": 2.80053428317008, "ewc_loss": 0.07454824447631836, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003817129472736269, "grad_norm": 8.80417537689209, "learning_rate": 1e-06, "loss": 0.434, "mean_token_accuracy": 0.8748369812965393, "num_tokens": 840083532.0, "step": 22015 }, { "epoch": 2.8006614934486707, "ewc_loss": 0.07481684535741806, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003843989397864789, "grad_norm": 8.88433837890625, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8631633520126343, "num_tokens": 840124221.0, "step": 22016 }, { "epoch": 2.800788703727261, "ewc_loss": 0.074549600481987, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003817264223471284, "grad_norm": 8.861808776855469, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8710018396377563, "num_tokens": 840162309.0, "step": 22017 }, { "epoch": 2.8009159140058517, "ewc_loss": 0.07472369074821472, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003834673552773893, "grad_norm": 8.859869003295898, "learning_rate": 1e-06, "loss": 0.4582, "mean_token_accuracy": 0.8674055337905884, "num_tokens": 840194778.0, "step": 22018 }, { "epoch": 2.8010431242844422, "ewc_loss": 0.07472772896289825, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038350775139406323, "grad_norm": 8.929465293884277, "learning_rate": 1e-06, "loss": 0.4056, "mean_token_accuracy": 0.8819658756256104, "num_tokens": 840225081.0, "step": 22019 }, { "epoch": 2.8011703345630328, "ewc_loss": 0.07458323240280151, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003820628044195473, "grad_norm": 8.86880111694336, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8650206327438354, "num_tokens": 840267698.0, "step": 22020 }, { "epoch": 2.8012975448416233, "ewc_loss": 0.074666827917099, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038289869553409517, "grad_norm": 8.824491500854492, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8595998287200928, "num_tokens": 840303255.0, "step": 22021 }, { "epoch": 2.801424755120214, "ewc_loss": 0.07472006976604462, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000383431208319962, "grad_norm": 9.023185729980469, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8646111488342285, "num_tokens": 840344616.0, "step": 22022 }, { "epoch": 2.8015519653988044, "ewc_loss": 0.07430728524923325, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003793033247347921, "grad_norm": 8.876896858215332, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.867540717124939, "num_tokens": 840383293.0, "step": 22023 }, { "epoch": 2.801679175677395, "ewc_loss": 0.07480709999799728, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003843014710582793, "grad_norm": 8.88212776184082, "learning_rate": 1e-06, "loss": 0.5021, "mean_token_accuracy": 0.8552646636962891, "num_tokens": 840424583.0, "step": 22024 }, { "epoch": 2.8018063859559854, "ewc_loss": 0.07450997829437256, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003813302901107818, "grad_norm": 8.894219398498535, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8587198853492737, "num_tokens": 840458752.0, "step": 22025 }, { "epoch": 2.8019335962345755, "ewc_loss": 0.07441980391740799, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003804285079240799, "grad_norm": 8.793252944946289, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8718376159667969, "num_tokens": 840498441.0, "step": 22026 }, { "epoch": 2.8020608065131665, "ewc_loss": 0.07477521896362305, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038398263859562576, "grad_norm": 8.884283065795898, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8758257627487183, "num_tokens": 840537838.0, "step": 22027 }, { "epoch": 2.8021880167917566, "ewc_loss": 0.07447686791419983, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038099917583167553, "grad_norm": 8.874459266662598, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8548133373260498, "num_tokens": 840571770.0, "step": 22028 }, { "epoch": 2.8023152270703475, "ewc_loss": 0.07472936809062958, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038352416595444083, "grad_norm": 8.876994132995605, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8605333566665649, "num_tokens": 840607559.0, "step": 22029 }, { "epoch": 2.8024424373489376, "ewc_loss": 0.07464063167572021, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003826368192676455, "grad_norm": 8.837942123413086, "learning_rate": 1e-06, "loss": 0.5123, "mean_token_accuracy": 0.8515161275863647, "num_tokens": 840650156.0, "step": 22030 }, { "epoch": 2.8025696476275286, "ewc_loss": 0.07466952502727509, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038292576209641993, "grad_norm": 8.830758094787598, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8586355447769165, "num_tokens": 840686590.0, "step": 22031 }, { "epoch": 2.8026968579061187, "ewc_loss": 0.07476211339235306, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038385161315090954, "grad_norm": 8.893073081970215, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8575584888458252, "num_tokens": 840724445.0, "step": 22032 }, { "epoch": 2.802824068184709, "ewc_loss": 0.07458096742630005, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003820401325356215, "grad_norm": 8.776676177978516, "learning_rate": 1e-06, "loss": 0.4151, "mean_token_accuracy": 0.8803744912147522, "num_tokens": 840765047.0, "step": 22033 }, { "epoch": 2.8029512784632997, "ewc_loss": 0.0748547613620758, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003847781044896692, "grad_norm": 8.871630668640137, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8526692390441895, "num_tokens": 840802375.0, "step": 22034 }, { "epoch": 2.8030784887418903, "ewc_loss": 0.0745777040719986, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038200747803784907, "grad_norm": 8.830266952514648, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8647480010986328, "num_tokens": 840839994.0, "step": 22035 }, { "epoch": 2.803205699020481, "ewc_loss": 0.07491330802440643, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003853635862469673, "grad_norm": 8.963127136230469, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8638412356376648, "num_tokens": 840877144.0, "step": 22036 }, { "epoch": 2.8033329092990713, "ewc_loss": 0.07449591159820557, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003811896312981844, "grad_norm": 8.727420806884766, "learning_rate": 1e-06, "loss": 0.497, "mean_token_accuracy": 0.856846034526825, "num_tokens": 840918478.0, "step": 22037 }, { "epoch": 2.803460119577662, "ewc_loss": 0.07523270696401596, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003885575570166111, "grad_norm": 8.882877349853516, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.876379132270813, "num_tokens": 840958484.0, "step": 22038 }, { "epoch": 2.8035873298562524, "ewc_loss": 0.07448546588420868, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003810850903391838, "grad_norm": 8.739798545837402, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8678574562072754, "num_tokens": 841000035.0, "step": 22039 }, { "epoch": 2.803714540134843, "ewc_loss": 0.07517663389444351, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003879968135152012, "grad_norm": 8.946724891662598, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8609782457351685, "num_tokens": 841035836.0, "step": 22040 }, { "epoch": 2.8038417504134334, "ewc_loss": 0.074529729783535, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003815277596004307, "grad_norm": 8.806222915649414, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8662824630737305, "num_tokens": 841069295.0, "step": 22041 }, { "epoch": 2.803968960692024, "ewc_loss": 0.07515409588813782, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003877714043483138, "grad_norm": 8.902070999145508, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8613439798355103, "num_tokens": 841108341.0, "step": 22042 }, { "epoch": 2.8040961709706145, "ewc_loss": 0.07466559112071991, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003828864137176424, "grad_norm": 8.83104419708252, "learning_rate": 1e-06, "loss": 0.4323, "mean_token_accuracy": 0.8751455545425415, "num_tokens": 841142945.0, "step": 22043 }, { "epoch": 2.804223381249205, "ewc_loss": 0.07512873411178589, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038751785177737474, "grad_norm": 8.926200866699219, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8786470293998718, "num_tokens": 841174137.0, "step": 22044 }, { "epoch": 2.8043505915277955, "ewc_loss": 0.07475301623344421, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038376066368073225, "grad_norm": 8.805870056152344, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8761727809906006, "num_tokens": 841215914.0, "step": 22045 }, { "epoch": 2.804477801806386, "ewc_loss": 0.07504886388778687, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003867191553581506, "grad_norm": 8.960214614868164, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8669575452804565, "num_tokens": 841256097.0, "step": 22046 }, { "epoch": 2.8046050120849766, "ewc_loss": 0.07456550002098083, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003818855038844049, "grad_norm": 8.807063102722168, "learning_rate": 1e-06, "loss": 0.4598, "mean_token_accuracy": 0.8715647459030151, "num_tokens": 841288997.0, "step": 22047 }, { "epoch": 2.804732222363567, "ewc_loss": 0.07498586177825928, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003860890865325928, "grad_norm": 8.873797416687012, "learning_rate": 1e-06, "loss": 0.4898, "mean_token_accuracy": 0.8604637980461121, "num_tokens": 841328053.0, "step": 22048 }, { "epoch": 2.8048594326421576, "ewc_loss": 0.07477940618991852, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038402454811148345, "grad_norm": 8.813334465026855, "learning_rate": 1e-06, "loss": 0.4096, "mean_token_accuracy": 0.8842073082923889, "num_tokens": 841366848.0, "step": 22049 }, { "epoch": 2.804986642920748, "ewc_loss": 0.07506279647350311, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000386858475394547, "grad_norm": 8.914848327636719, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.874894917011261, "num_tokens": 841398568.0, "step": 22050 }, { "epoch": 2.8051138531993383, "ewc_loss": 0.07466449588537216, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038287544157356024, "grad_norm": 8.881776809692383, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.863372266292572, "num_tokens": 841435722.0, "step": 22051 }, { "epoch": 2.8052410634779292, "ewc_loss": 0.0749896913766861, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003861273289658129, "grad_norm": 8.870223999023438, "learning_rate": 1e-06, "loss": 0.485, "mean_token_accuracy": 0.8618032932281494, "num_tokens": 841473635.0, "step": 22052 }, { "epoch": 2.8053682737565193, "ewc_loss": 0.07485099136829376, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038474041502922773, "grad_norm": 8.842057228088379, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8682733178138733, "num_tokens": 841512383.0, "step": 22053 }, { "epoch": 2.8054954840351103, "ewc_loss": 0.07483787834644318, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003846092149615288, "grad_norm": 8.893558502197266, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8595703840255737, "num_tokens": 841543174.0, "step": 22054 }, { "epoch": 2.8056226943137004, "ewc_loss": 0.074734628200531, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038357669836841524, "grad_norm": 8.846351623535156, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8708380460739136, "num_tokens": 841582185.0, "step": 22055 }, { "epoch": 2.805749904592291, "ewc_loss": 0.07480349391698837, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003842654114123434, "grad_norm": 8.873579025268555, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8600708246231079, "num_tokens": 841623179.0, "step": 22056 }, { "epoch": 2.8058771148708814, "ewc_loss": 0.0746009349822998, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003822398721240461, "grad_norm": 8.820296287536621, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8747544288635254, "num_tokens": 841661062.0, "step": 22057 }, { "epoch": 2.806004325149472, "ewc_loss": 0.07489043474197388, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003851348301395774, "grad_norm": 8.794854164123535, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8634653091430664, "num_tokens": 841704259.0, "step": 22058 }, { "epoch": 2.8061315354280625, "ewc_loss": 0.07479074597358704, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038413790753111243, "grad_norm": 8.843803405761719, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8637908101081848, "num_tokens": 841748396.0, "step": 22059 }, { "epoch": 2.806258745706653, "ewc_loss": 0.07474909722805023, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003837214317172766, "grad_norm": 8.806962013244629, "learning_rate": 1e-06, "loss": 0.4035, "mean_token_accuracy": 0.8849713802337646, "num_tokens": 841787809.0, "step": 22060 }, { "epoch": 2.8063859559852435, "ewc_loss": 0.07485224306583405, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003847528714686632, "grad_norm": 8.863338470458984, "learning_rate": 1e-06, "loss": 0.4699, "mean_token_accuracy": 0.8662311434745789, "num_tokens": 841824046.0, "step": 22061 }, { "epoch": 2.806513166263834, "ewc_loss": 0.07460863888263702, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038231690996326506, "grad_norm": 8.86945629119873, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8754067420959473, "num_tokens": 841856949.0, "step": 22062 }, { "epoch": 2.8066403765424246, "ewc_loss": 0.07489514350891113, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038518186192959547, "grad_norm": 8.936469078063965, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8521910309791565, "num_tokens": 841898644.0, "step": 22063 }, { "epoch": 2.806767586821015, "ewc_loss": 0.07454995065927505, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003817299730144441, "grad_norm": 8.805130958557129, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8697364330291748, "num_tokens": 841934795.0, "step": 22064 }, { "epoch": 2.8068947970996057, "ewc_loss": 0.07500749826431274, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038630544440820813, "grad_norm": 8.894493103027344, "learning_rate": 1e-06, "loss": 0.5209, "mean_token_accuracy": 0.850883960723877, "num_tokens": 841976878.0, "step": 22065 }, { "epoch": 2.807022007378196, "ewc_loss": 0.07451697438955307, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003814002266153693, "grad_norm": 8.810750007629395, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.866387128829956, "num_tokens": 842010998.0, "step": 22066 }, { "epoch": 2.8071492176567867, "ewc_loss": 0.07501553744077682, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038638582918792963, "grad_norm": 8.849922180175781, "learning_rate": 1e-06, "loss": 0.3833, "mean_token_accuracy": 0.8914313316345215, "num_tokens": 842051410.0, "step": 22067 }, { "epoch": 2.8072764279353772, "ewc_loss": 0.07467027008533478, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003829331835731864, "grad_norm": 8.816238403320312, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.869246244430542, "num_tokens": 842087954.0, "step": 22068 }, { "epoch": 2.8074036382139678, "ewc_loss": 0.07498656213283539, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038609610055573285, "grad_norm": 8.9220552444458, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8625930547714233, "num_tokens": 842128326.0, "step": 22069 }, { "epoch": 2.8075308484925583, "ewc_loss": 0.07455675303936005, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038179801777005196, "grad_norm": 8.77405071258545, "learning_rate": 1e-06, "loss": 0.5358, "mean_token_accuracy": 0.8457021713256836, "num_tokens": 842171143.0, "step": 22070 }, { "epoch": 2.807658058771149, "ewc_loss": 0.0750209242105484, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038643970037810504, "grad_norm": 8.893472671508789, "learning_rate": 1e-06, "loss": 0.5013, "mean_token_accuracy": 0.8532428741455078, "num_tokens": 842212167.0, "step": 22071 }, { "epoch": 2.8077852690497394, "ewc_loss": 0.07459621131420135, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003821925783995539, "grad_norm": 8.853428840637207, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8578951358795166, "num_tokens": 842252576.0, "step": 22072 }, { "epoch": 2.80791247932833, "ewc_loss": 0.07482942938804626, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038452475564554334, "grad_norm": 8.795554161071777, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8627628087997437, "num_tokens": 842287812.0, "step": 22073 }, { "epoch": 2.80803968960692, "ewc_loss": 0.07483780384063721, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003846085164695978, "grad_norm": 8.867141723632812, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.862006664276123, "num_tokens": 842328289.0, "step": 22074 }, { "epoch": 2.808166899885511, "ewc_loss": 0.07482992112636566, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003845296741928905, "grad_norm": 8.944437980651855, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8687654733657837, "num_tokens": 842359182.0, "step": 22075 }, { "epoch": 2.808294110164101, "ewc_loss": 0.07464095950126648, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003826400497928262, "grad_norm": 8.839932441711426, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8577477931976318, "num_tokens": 842399820.0, "step": 22076 }, { "epoch": 2.808421320442692, "ewc_loss": 0.07488535344600677, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003850839857477695, "grad_norm": 8.92820930480957, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.8637436628341675, "num_tokens": 842437852.0, "step": 22077 }, { "epoch": 2.808548530721282, "ewc_loss": 0.07442338019609451, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003804642765317112, "grad_norm": 8.790889739990234, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8525091409683228, "num_tokens": 842476729.0, "step": 22078 }, { "epoch": 2.808675740999873, "ewc_loss": 0.07501859962940216, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038641647552140057, "grad_norm": 8.910984992980957, "learning_rate": 1e-06, "loss": 0.5059, "mean_token_accuracy": 0.8531322479248047, "num_tokens": 842514590.0, "step": 22079 }, { "epoch": 2.808802951278463, "ewc_loss": 0.07436639070510864, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003798944235313684, "grad_norm": 8.763113021850586, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8648864030838013, "num_tokens": 842556502.0, "step": 22080 }, { "epoch": 2.8089301615570537, "ewc_loss": 0.07509150356054306, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003871454973705113, "grad_norm": 8.95241641998291, "learning_rate": 1e-06, "loss": 0.5009, "mean_token_accuracy": 0.8535556197166443, "num_tokens": 842594299.0, "step": 22081 }, { "epoch": 2.809057371835644, "ewc_loss": 0.07433247566223145, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003795552474912256, "grad_norm": 8.700277328491211, "learning_rate": 1e-06, "loss": 0.4716, "mean_token_accuracy": 0.8629546165466309, "num_tokens": 842633189.0, "step": 22082 }, { "epoch": 2.8091845821142347, "ewc_loss": 0.07532404363155365, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003894708934240043, "grad_norm": 8.95237922668457, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8692494034767151, "num_tokens": 842673626.0, "step": 22083 }, { "epoch": 2.8093117923928252, "ewc_loss": 0.07440164685249329, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038024692912586033, "grad_norm": 8.766213417053223, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8587716817855835, "num_tokens": 842712574.0, "step": 22084 }, { "epoch": 2.8094390026714158, "ewc_loss": 0.07500968873500824, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003863273304887116, "grad_norm": 8.938333511352539, "learning_rate": 1e-06, "loss": 0.4743, "mean_token_accuracy": 0.8633071780204773, "num_tokens": 842749534.0, "step": 22085 }, { "epoch": 2.8095662129500063, "ewc_loss": 0.07451742887496948, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038140479591675103, "grad_norm": 8.794660568237305, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8558663129806519, "num_tokens": 842784989.0, "step": 22086 }, { "epoch": 2.809693423228597, "ewc_loss": 0.07510165870189667, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038724704063497484, "grad_norm": 8.866826057434082, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.866388738155365, "num_tokens": 842823858.0, "step": 22087 }, { "epoch": 2.8098206335071874, "ewc_loss": 0.07462017238140106, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003824321611318737, "grad_norm": 8.76685905456543, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8663022518157959, "num_tokens": 842858849.0, "step": 22088 }, { "epoch": 2.809947843785778, "ewc_loss": 0.0751006156206131, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003872366214636713, "grad_norm": 8.858726501464844, "learning_rate": 1e-06, "loss": 0.4023, "mean_token_accuracy": 0.8860726356506348, "num_tokens": 842895809.0, "step": 22089 }, { "epoch": 2.8100750540643684, "ewc_loss": 0.07472626864910126, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003834931703750044, "grad_norm": 8.823524475097656, "learning_rate": 1e-06, "loss": 0.3773, "mean_token_accuracy": 0.8928213715553284, "num_tokens": 842933650.0, "step": 22090 }, { "epoch": 2.810202264342959, "ewc_loss": 0.07492279261350632, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038545840652659535, "grad_norm": 8.900811195373535, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8703005313873291, "num_tokens": 842967200.0, "step": 22091 }, { "epoch": 2.8103294746215495, "ewc_loss": 0.07465589046478271, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038278932333923876, "grad_norm": 8.785820007324219, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8560643196105957, "num_tokens": 843002498.0, "step": 22092 }, { "epoch": 2.81045668490014, "ewc_loss": 0.07499201595783234, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038615058292634785, "grad_norm": 8.941156387329102, "learning_rate": 1e-06, "loss": 0.4466, "mean_token_accuracy": 0.8718758225440979, "num_tokens": 843042627.0, "step": 22093 }, { "epoch": 2.8105838951787305, "ewc_loss": 0.07465319335460663, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038276237319223583, "grad_norm": 8.840461730957031, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8622047901153564, "num_tokens": 843076862.0, "step": 22094 }, { "epoch": 2.810711105457321, "ewc_loss": 0.07486985623836517, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038492900785058737, "grad_norm": 8.876593589782715, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8647611141204834, "num_tokens": 843115000.0, "step": 22095 }, { "epoch": 2.8108383157359116, "ewc_loss": 0.07471083849668503, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003833388618659228, "grad_norm": 8.882500648498535, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8727785348892212, "num_tokens": 843152408.0, "step": 22096 }, { "epoch": 2.810965526014502, "ewc_loss": 0.07491159439086914, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003853464440908283, "grad_norm": 8.84959602355957, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8669959306716919, "num_tokens": 843189269.0, "step": 22097 }, { "epoch": 2.8110927362930926, "ewc_loss": 0.07478897273540497, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003841202415060252, "grad_norm": 8.855207443237305, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8602865934371948, "num_tokens": 843227148.0, "step": 22098 }, { "epoch": 2.8112199465716827, "ewc_loss": 0.07469667494297028, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038319724262692034, "grad_norm": 8.910601615905762, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.855944812297821, "num_tokens": 843259736.0, "step": 22099 }, { "epoch": 2.8113471568502737, "ewc_loss": 0.07470352947711945, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003832658112514764, "grad_norm": 8.832015037536621, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8615946173667908, "num_tokens": 843302598.0, "step": 22100 }, { "epoch": 2.811474367128864, "ewc_loss": 0.07492406666278839, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038547118310816586, "grad_norm": 8.90146255493164, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8814408779144287, "num_tokens": 843337615.0, "step": 22101 }, { "epoch": 2.8116015774074548, "ewc_loss": 0.07478538155555725, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038164289435371757, "grad_norm": 8.825539588928223, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8810814619064331, "num_tokens": 843378845.0, "step": 22102 }, { "epoch": 2.811728787686045, "ewc_loss": 0.07483257353305817, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038455621688626707, "grad_norm": 8.954045295715332, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8551619052886963, "num_tokens": 843414128.0, "step": 22103 }, { "epoch": 2.811855997964636, "ewc_loss": 0.07440612465143204, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038029171992093325, "grad_norm": 8.808991432189941, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8828620910644531, "num_tokens": 843454255.0, "step": 22104 }, { "epoch": 2.811983208243226, "ewc_loss": 0.07497122883796692, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038594281068071723, "grad_norm": 8.906014442443848, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.866354763507843, "num_tokens": 843498332.0, "step": 22105 }, { "epoch": 2.8121104185218164, "ewc_loss": 0.07456585764884949, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003818889963440597, "grad_norm": 8.867419242858887, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.856991171836853, "num_tokens": 843539196.0, "step": 22106 }, { "epoch": 2.812237628800407, "ewc_loss": 0.07479849457740784, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038421546923927963, "grad_norm": 8.91588306427002, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8701068162918091, "num_tokens": 843573767.0, "step": 22107 }, { "epoch": 2.8123648390789975, "ewc_loss": 0.07470844686031342, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038331496762111783, "grad_norm": 8.906312942504883, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.854509711265564, "num_tokens": 843611156.0, "step": 22108 }, { "epoch": 2.812492049357588, "ewc_loss": 0.07478723675012589, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038410283741541207, "grad_norm": 8.927023887634277, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.8705431222915649, "num_tokens": 843644026.0, "step": 22109 }, { "epoch": 2.8126192596361785, "ewc_loss": 0.07455010712146759, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003817315155174583, "grad_norm": 8.829676628112793, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8695801496505737, "num_tokens": 843683477.0, "step": 22110 }, { "epoch": 2.812746469914769, "ewc_loss": 0.07491866499185562, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038541710819117725, "grad_norm": 8.91275691986084, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8522603511810303, "num_tokens": 843722815.0, "step": 22111 }, { "epoch": 2.8128736801933596, "ewc_loss": 0.07475258409976959, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003837562690023333, "grad_norm": 8.898632049560547, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8682225942611694, "num_tokens": 843763843.0, "step": 22112 }, { "epoch": 2.81300089047195, "ewc_loss": 0.07472262531518936, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003834567323792726, "grad_norm": 8.813474655151367, "learning_rate": 1e-06, "loss": 0.4944, "mean_token_accuracy": 0.858075737953186, "num_tokens": 843805327.0, "step": 22113 }, { "epoch": 2.8131281007505406, "ewc_loss": 0.07485002279281616, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003847306943498552, "grad_norm": 8.880420684814453, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8665189146995544, "num_tokens": 843850132.0, "step": 22114 }, { "epoch": 2.813255311029131, "ewc_loss": 0.07452711462974548, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038150165346451104, "grad_norm": 8.798983573913574, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.8672987222671509, "num_tokens": 843887425.0, "step": 22115 }, { "epoch": 2.8133825213077217, "ewc_loss": 0.075001060962677, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038624106673523784, "grad_norm": 8.946459770202637, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.873702347278595, "num_tokens": 843920728.0, "step": 22116 }, { "epoch": 2.8135097315863122, "ewc_loss": 0.07457525283098221, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038198300171643496, "grad_norm": 8.84367847442627, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8631487488746643, "num_tokens": 843959733.0, "step": 22117 }, { "epoch": 2.8136369418649028, "ewc_loss": 0.07513824105262756, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000387612875783816, "grad_norm": 8.946775436401367, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.867730438709259, "num_tokens": 844005616.0, "step": 22118 }, { "epoch": 2.8137641521434933, "ewc_loss": 0.07466880977153778, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003829185734502971, "grad_norm": 8.859413146972656, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8798717260360718, "num_tokens": 844034216.0, "step": 22119 }, { "epoch": 2.813891362422084, "ewc_loss": 0.07505416125059128, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038677206612192094, "grad_norm": 8.913878440856934, "learning_rate": 1e-06, "loss": 0.4684, "mean_token_accuracy": 0.8640217781066895, "num_tokens": 844071173.0, "step": 22120 }, { "epoch": 2.8140185727006743, "ewc_loss": 0.07480631023645401, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003842935839202255, "grad_norm": 8.848692893981934, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.863667368888855, "num_tokens": 844113665.0, "step": 22121 }, { "epoch": 2.814145782979265, "ewc_loss": 0.07502508163452148, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003864812897518277, "grad_norm": 8.930580139160156, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.866122841835022, "num_tokens": 844154063.0, "step": 22122 }, { "epoch": 2.8142729932578554, "ewc_loss": 0.07477635145187378, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038399395998567343, "grad_norm": 8.851286888122559, "learning_rate": 1e-06, "loss": 0.4057, "mean_token_accuracy": 0.8861686587333679, "num_tokens": 844185634.0, "step": 22123 }, { "epoch": 2.8144002035364455, "ewc_loss": 0.07490530610084534, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038528352160938084, "grad_norm": 8.907668113708496, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.8558422327041626, "num_tokens": 844221755.0, "step": 22124 }, { "epoch": 2.8145274138150365, "ewc_loss": 0.07490037381649017, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038523421972058713, "grad_norm": 8.913986206054688, "learning_rate": 1e-06, "loss": 0.448, "mean_token_accuracy": 0.8706272840499878, "num_tokens": 844256230.0, "step": 22125 }, { "epoch": 2.8146546240936265, "ewc_loss": 0.07487620413303375, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003849924833048135, "grad_norm": 8.921438217163086, "learning_rate": 1e-06, "loss": 0.5016, "mean_token_accuracy": 0.8591439723968506, "num_tokens": 844292823.0, "step": 22126 }, { "epoch": 2.8147818343722175, "ewc_loss": 0.07486619800329208, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003848924534395337, "grad_norm": 8.953500747680664, "learning_rate": 1e-06, "loss": 0.533, "mean_token_accuracy": 0.844476044178009, "num_tokens": 844331843.0, "step": 22127 }, { "epoch": 2.8149090446508076, "ewc_loss": 0.07470351457595825, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038326557842083275, "grad_norm": 8.848828315734863, "learning_rate": 1e-06, "loss": 0.4373, "mean_token_accuracy": 0.8759617805480957, "num_tokens": 844375823.0, "step": 22128 }, { "epoch": 2.815036254929398, "ewc_loss": 0.07515157759189606, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003877462004311383, "grad_norm": 8.899338722229004, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8670121431350708, "num_tokens": 844415187.0, "step": 22129 }, { "epoch": 2.8151634652079887, "ewc_loss": 0.07472623884677887, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003834929084405303, "grad_norm": 8.819113731384277, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8693159818649292, "num_tokens": 844454986.0, "step": 22130 }, { "epoch": 2.815290675486579, "ewc_loss": 0.07516466081142426, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038787710946053267, "grad_norm": 8.92626667022705, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.87253737449646, "num_tokens": 844495514.0, "step": 22131 }, { "epoch": 2.8154178857651697, "ewc_loss": 0.07476137578487396, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038384422077797353, "grad_norm": 8.852228164672852, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8651449680328369, "num_tokens": 844532406.0, "step": 22132 }, { "epoch": 2.8155450960437602, "ewc_loss": 0.07520903646945953, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003883208555635065, "grad_norm": 8.964117050170898, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8652157783508301, "num_tokens": 844569561.0, "step": 22133 }, { "epoch": 2.8156723063223508, "ewc_loss": 0.07475984841585159, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038382894126698375, "grad_norm": 8.85521411895752, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8702293038368225, "num_tokens": 844613504.0, "step": 22134 }, { "epoch": 2.8157995166009413, "ewc_loss": 0.0751606822013855, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038783723721280694, "grad_norm": 8.906839370727539, "learning_rate": 1e-06, "loss": 0.477, "mean_token_accuracy": 0.8640542030334473, "num_tokens": 844649255.0, "step": 22135 }, { "epoch": 2.815926726879532, "ewc_loss": 0.07494555413722992, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038568602758459747, "grad_norm": 8.901802062988281, "learning_rate": 1e-06, "loss": 0.4631, "mean_token_accuracy": 0.8676818609237671, "num_tokens": 844687202.0, "step": 22136 }, { "epoch": 2.8160539371581224, "ewc_loss": 0.07500112056732178, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003862416197080165, "grad_norm": 8.935952186584473, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8668889999389648, "num_tokens": 844723096.0, "step": 22137 }, { "epoch": 2.816181147436713, "ewc_loss": 0.0749112069606781, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003853424859698862, "grad_norm": 8.849719047546387, "learning_rate": 1e-06, "loss": 0.4594, "mean_token_accuracy": 0.8666515350341797, "num_tokens": 844759754.0, "step": 22138 }, { "epoch": 2.8163083577153034, "ewc_loss": 0.07501662522554398, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038639671402052045, "grad_norm": 8.923587799072266, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8694426417350769, "num_tokens": 844798372.0, "step": 22139 }, { "epoch": 2.816435567993894, "ewc_loss": 0.07470600306987762, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003832905204035342, "grad_norm": 8.83608341217041, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.860576868057251, "num_tokens": 844832738.0, "step": 22140 }, { "epoch": 2.8165627782724845, "ewc_loss": 0.07505413889884949, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038677186239510775, "grad_norm": 8.912200927734375, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.867891788482666, "num_tokens": 844871684.0, "step": 22141 }, { "epoch": 2.816689988551075, "ewc_loss": 0.0746278241276741, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038250870420597494, "grad_norm": 8.789149284362793, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.8583592176437378, "num_tokens": 844912140.0, "step": 22142 }, { "epoch": 2.8168171988296655, "ewc_loss": 0.07517419010400772, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038797236629761755, "grad_norm": 8.96235466003418, "learning_rate": 1e-06, "loss": 0.4948, "mean_token_accuracy": 0.8549007773399353, "num_tokens": 844948256.0, "step": 22143 }, { "epoch": 2.816944409108256, "ewc_loss": 0.07456620782613754, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038189254701137543, "grad_norm": 8.792859077453613, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8821067810058594, "num_tokens": 844983821.0, "step": 22144 }, { "epoch": 2.8170716193868466, "ewc_loss": 0.07517767697572708, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003880072326865047, "grad_norm": 8.907898902893066, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8689767718315125, "num_tokens": 845020622.0, "step": 22145 }, { "epoch": 2.817198829665437, "ewc_loss": 0.07460133731365204, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038224385934881866, "grad_norm": 8.81929874420166, "learning_rate": 1e-06, "loss": 0.4919, "mean_token_accuracy": 0.857561469078064, "num_tokens": 845056748.0, "step": 22146 }, { "epoch": 2.8173260399440276, "ewc_loss": 0.07513759285211563, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038760638562962413, "grad_norm": 8.886120796203613, "learning_rate": 1e-06, "loss": 0.5242, "mean_token_accuracy": 0.84675532579422, "num_tokens": 845096294.0, "step": 22147 }, { "epoch": 2.817453250222618, "ewc_loss": 0.07482746988534927, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038450516876764596, "grad_norm": 8.872458457946777, "learning_rate": 1e-06, "loss": 0.4107, "mean_token_accuracy": 0.8821261525154114, "num_tokens": 845135132.0, "step": 22148 }, { "epoch": 2.8175804605012083, "ewc_loss": 0.07497996091842651, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000386030093068257, "grad_norm": 8.894969940185547, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8747552037239075, "num_tokens": 845172164.0, "step": 22149 }, { "epoch": 2.8177076707797992, "ewc_loss": 0.07482501864433289, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038448060513474047, "grad_norm": 8.874349594116211, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8673275709152222, "num_tokens": 845210270.0, "step": 22150 }, { "epoch": 2.8178348810583893, "ewc_loss": 0.07495266199111938, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038575707003474236, "grad_norm": 8.865669250488281, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8693560361862183, "num_tokens": 845251863.0, "step": 22151 }, { "epoch": 2.8179620913369803, "ewc_loss": 0.07489878684282303, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038521832902915776, "grad_norm": 8.933528900146484, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8720707893371582, "num_tokens": 845290262.0, "step": 22152 }, { "epoch": 2.8180893016155704, "ewc_loss": 0.07467667758464813, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003829972120001912, "grad_norm": 8.802103996276855, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8745898008346558, "num_tokens": 845326894.0, "step": 22153 }, { "epoch": 2.818216511894161, "ewc_loss": 0.07507546246051788, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003869850770570338, "grad_norm": 8.917342185974121, "learning_rate": 1e-06, "loss": 0.4433, "mean_token_accuracy": 0.8754474520683289, "num_tokens": 845367545.0, "step": 22154 }, { "epoch": 2.8183437221727514, "ewc_loss": 0.0746065229177475, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038229572237469256, "grad_norm": 8.891011238098145, "learning_rate": 1e-06, "loss": 0.5226, "mean_token_accuracy": 0.8512594103813171, "num_tokens": 845400920.0, "step": 22155 }, { "epoch": 2.818470932451342, "ewc_loss": 0.07487091422080994, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038493965985253453, "grad_norm": 8.971207618713379, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8636187314987183, "num_tokens": 845436279.0, "step": 22156 }, { "epoch": 2.8185981427299325, "ewc_loss": 0.07463404536247253, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003825709572993219, "grad_norm": 8.786880493164062, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8700551986694336, "num_tokens": 845476308.0, "step": 22157 }, { "epoch": 2.818725353008523, "ewc_loss": 0.07503645867109299, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003865950566250831, "grad_norm": 8.947516441345215, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.862826943397522, "num_tokens": 845516624.0, "step": 22158 }, { "epoch": 2.8188525632871135, "ewc_loss": 0.0744985044002533, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003812155337072909, "grad_norm": 8.781984329223633, "learning_rate": 1e-06, "loss": 0.4027, "mean_token_accuracy": 0.8845347762107849, "num_tokens": 845552385.0, "step": 22159 }, { "epoch": 2.818979773565704, "ewc_loss": 0.07509611546993256, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003871915687341243, "grad_norm": 8.955036163330078, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8747580647468567, "num_tokens": 845592600.0, "step": 22160 }, { "epoch": 2.8191069838442946, "ewc_loss": 0.0744972974061966, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038120339740999043, "grad_norm": 8.848169326782227, "learning_rate": 1e-06, "loss": 0.4395, "mean_token_accuracy": 0.8753175735473633, "num_tokens": 845630028.0, "step": 22161 }, { "epoch": 2.819234194122885, "ewc_loss": 0.07498329877853394, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003860634460579604, "grad_norm": 8.955056190490723, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8616963624954224, "num_tokens": 845665874.0, "step": 22162 }, { "epoch": 2.8193614044014756, "ewc_loss": 0.07453292608261108, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038155968650244176, "grad_norm": 8.852800369262695, "learning_rate": 1e-06, "loss": 0.504, "mean_token_accuracy": 0.8543728590011597, "num_tokens": 845702738.0, "step": 22163 }, { "epoch": 2.819488614680066, "ewc_loss": 0.07479415833950043, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003841720172204077, "grad_norm": 8.849278450012207, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8700255155563354, "num_tokens": 845744785.0, "step": 22164 }, { "epoch": 2.8196158249586567, "ewc_loss": 0.07489030063152313, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038513343315571547, "grad_norm": 8.893174171447754, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8666225671768188, "num_tokens": 845779196.0, "step": 22165 }, { "epoch": 2.8197430352372472, "ewc_loss": 0.07497915625572205, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038602200220339, "grad_norm": 8.950084686279297, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8552713990211487, "num_tokens": 845819016.0, "step": 22166 }, { "epoch": 2.8198702455158378, "ewc_loss": 0.07470615208148956, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038329194649122655, "grad_norm": 8.855947494506836, "learning_rate": 1e-06, "loss": 0.453, "mean_token_accuracy": 0.8687606453895569, "num_tokens": 845855014.0, "step": 22167 }, { "epoch": 2.8199974557944283, "ewc_loss": 0.07498375326395035, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038606798625551164, "grad_norm": 8.91822338104248, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8753377199172974, "num_tokens": 845895491.0, "step": 22168 }, { "epoch": 2.820124666073019, "ewc_loss": 0.07483023405075073, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038453281740657985, "grad_norm": 8.91218376159668, "learning_rate": 1e-06, "loss": 0.4977, "mean_token_accuracy": 0.8572514057159424, "num_tokens": 845931560.0, "step": 22169 }, { "epoch": 2.8202518763516093, "ewc_loss": 0.07489021122455597, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003851325309369713, "grad_norm": 8.924643516540527, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8675958514213562, "num_tokens": 845966701.0, "step": 22170 }, { "epoch": 2.8203790866302, "ewc_loss": 0.07481241226196289, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038435455644503236, "grad_norm": 8.86377239227295, "learning_rate": 1e-06, "loss": 0.4135, "mean_token_accuracy": 0.8846693634986877, "num_tokens": 846002327.0, "step": 22171 }, { "epoch": 2.82050629690879, "ewc_loss": 0.07502897083759308, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038652020157314837, "grad_norm": 8.896553993225098, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8670133948326111, "num_tokens": 846042157.0, "step": 22172 }, { "epoch": 2.820633507187381, "ewc_loss": 0.07479770481586456, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003842074947897345, "grad_norm": 8.83176326751709, "learning_rate": 1e-06, "loss": 0.5305, "mean_token_accuracy": 0.852138876914978, "num_tokens": 846082851.0, "step": 22173 }, { "epoch": 2.820760717465971, "ewc_loss": 0.07490423321723938, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003852727822959423, "grad_norm": 8.95411205291748, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8649832606315613, "num_tokens": 846116846.0, "step": 22174 }, { "epoch": 2.820887927744562, "ewc_loss": 0.07469011843204498, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003831317008007318, "grad_norm": 8.849323272705078, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8695600032806396, "num_tokens": 846154373.0, "step": 22175 }, { "epoch": 2.821015138023152, "ewc_loss": 0.07500866055488586, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038631705683656037, "grad_norm": 8.865111351013184, "learning_rate": 1e-06, "loss": 0.3991, "mean_token_accuracy": 0.8850979804992676, "num_tokens": 846195329.0, "step": 22176 }, { "epoch": 2.821142348301743, "ewc_loss": 0.07468324154615402, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003830628702417016, "grad_norm": 8.831833839416504, "learning_rate": 1e-06, "loss": 0.4212, "mean_token_accuracy": 0.8767522573471069, "num_tokens": 846233078.0, "step": 22177 }, { "epoch": 2.821269558580333, "ewc_loss": 0.07500194013118744, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038624988519586623, "grad_norm": 8.928115844726562, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8653690218925476, "num_tokens": 846269030.0, "step": 22178 }, { "epoch": 2.8213967688589237, "ewc_loss": 0.07466346025466919, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003828650515060872, "grad_norm": 8.824418067932129, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8621072173118591, "num_tokens": 846307269.0, "step": 22179 }, { "epoch": 2.821523979137514, "ewc_loss": 0.0749875009059906, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003861054719891399, "grad_norm": 8.981049537658691, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8563816547393799, "num_tokens": 846339116.0, "step": 22180 }, { "epoch": 2.8216511894161047, "ewc_loss": 0.07465246319770813, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003827550681307912, "grad_norm": 8.828437805175781, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8598530888557434, "num_tokens": 846384968.0, "step": 22181 }, { "epoch": 2.8217783996946952, "ewc_loss": 0.07510793954133987, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003873098758049309, "grad_norm": 8.96226978302002, "learning_rate": 1e-06, "loss": 0.512, "mean_token_accuracy": 0.8502458333969116, "num_tokens": 846426235.0, "step": 22182 }, { "epoch": 2.8219056099732858, "ewc_loss": 0.07451678067445755, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003813982766587287, "grad_norm": 8.828726768493652, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.854028582572937, "num_tokens": 846465715.0, "step": 22183 }, { "epoch": 2.8220328202518763, "ewc_loss": 0.07507570832967758, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003869875508826226, "grad_norm": 8.911818504333496, "learning_rate": 1e-06, "loss": 0.5086, "mean_token_accuracy": 0.8515876531600952, "num_tokens": 846499972.0, "step": 22184 }, { "epoch": 2.822160030530467, "ewc_loss": 0.07471930980682373, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038342358311638236, "grad_norm": 8.892714500427246, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8655804991722107, "num_tokens": 846535906.0, "step": 22185 }, { "epoch": 2.8222872408090574, "ewc_loss": 0.07492925226688385, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038552298792637885, "grad_norm": 8.95638656616211, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8600900173187256, "num_tokens": 846572723.0, "step": 22186 }, { "epoch": 2.822414451087648, "ewc_loss": 0.07476875185966492, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003839179698843509, "grad_norm": 8.911355018615723, "learning_rate": 1e-06, "loss": 0.4514, "mean_token_accuracy": 0.8693925142288208, "num_tokens": 846613501.0, "step": 22187 }, { "epoch": 2.8225416613662384, "ewc_loss": 0.07483527064323425, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038458316703327, "grad_norm": 8.856487274169922, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.8865628242492676, "num_tokens": 846650001.0, "step": 22188 }, { "epoch": 2.822668871644829, "ewc_loss": 0.07486917078495026, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003849221393465996, "grad_norm": 8.85720443725586, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.853785514831543, "num_tokens": 846690636.0, "step": 22189 }, { "epoch": 2.8227960819234195, "ewc_loss": 0.07492539286613464, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038548436714336276, "grad_norm": 8.977558135986328, "learning_rate": 1e-06, "loss": 0.5355, "mean_token_accuracy": 0.8451182246208191, "num_tokens": 846728513.0, "step": 22190 }, { "epoch": 2.82292329220201, "ewc_loss": 0.07474826276302338, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003837131371255964, "grad_norm": 8.804459571838379, "learning_rate": 1e-06, "loss": 0.5154, "mean_token_accuracy": 0.8515737056732178, "num_tokens": 846771989.0, "step": 22191 }, { "epoch": 2.8230505024806005, "ewc_loss": 0.07515054941177368, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003877359558828175, "grad_norm": 8.932587623596191, "learning_rate": 1e-06, "loss": 0.4091, "mean_token_accuracy": 0.8835559487342834, "num_tokens": 846808393.0, "step": 22192 }, { "epoch": 2.823177712759191, "ewc_loss": 0.07471068203449249, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038333729025907815, "grad_norm": 8.842229843139648, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8571912050247192, "num_tokens": 846846928.0, "step": 22193 }, { "epoch": 2.8233049230377816, "ewc_loss": 0.07529240101575851, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003891544765792787, "grad_norm": 8.9725341796875, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8561028242111206, "num_tokens": 846884637.0, "step": 22194 }, { "epoch": 2.823432133316372, "ewc_loss": 0.07474228739738464, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003836533287540078, "grad_norm": 8.790946960449219, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8700392246246338, "num_tokens": 846925749.0, "step": 22195 }, { "epoch": 2.8235593435949626, "ewc_loss": 0.07532031834125519, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003894336405210197, "grad_norm": 8.972711563110352, "learning_rate": 1e-06, "loss": 0.5221, "mean_token_accuracy": 0.8503187298774719, "num_tokens": 846967191.0, "step": 22196 }, { "epoch": 2.8236865538735527, "ewc_loss": 0.07461494207382202, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000382379861548543, "grad_norm": 8.743911743164062, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8718845844268799, "num_tokens": 847003280.0, "step": 22197 }, { "epoch": 2.8238137641521437, "ewc_loss": 0.07549402117729187, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039117070264182985, "grad_norm": 9.005815505981445, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8533558249473572, "num_tokens": 847042179.0, "step": 22198 }, { "epoch": 2.8239409744307338, "ewc_loss": 0.07492861151695251, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003830751811619848, "grad_norm": 8.765329360961914, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8627892136573792, "num_tokens": 847083008.0, "step": 22199 }, { "epoch": 2.8240681847093247, "ewc_loss": 0.07552400976419449, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003914705594070256, "grad_norm": 8.941618919372559, "learning_rate": 1e-06, "loss": 0.5331, "mean_token_accuracy": 0.8448097705841064, "num_tokens": 847124009.0, "step": 22200 }, { "epoch": 2.824195394987915, "ewc_loss": 0.07488532364368439, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003850837529171258, "grad_norm": 8.8325834274292, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.8598183393478394, "num_tokens": 847163526.0, "step": 22201 }, { "epoch": 2.824322605266506, "ewc_loss": 0.07574990391731262, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003912881074938923, "grad_norm": 8.957831382751465, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8656220436096191, "num_tokens": 847201374.0, "step": 22202 }, { "epoch": 2.824449815545096, "ewc_loss": 0.07510121166706085, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038480115472339094, "grad_norm": 8.812210083007812, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8507367968559265, "num_tokens": 847239469.0, "step": 22203 }, { "epoch": 2.8245770258236864, "ewc_loss": 0.07562974095344543, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039008643943816423, "grad_norm": 8.9201021194458, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8636292219161987, "num_tokens": 847275964.0, "step": 22204 }, { "epoch": 2.824704236102277, "ewc_loss": 0.07533857226371765, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003871748049277812, "grad_norm": 8.85871410369873, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8663007020950317, "num_tokens": 847316815.0, "step": 22205 }, { "epoch": 2.8248314463808675, "ewc_loss": 0.07541298121213913, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003879188734572381, "grad_norm": 8.880219459533691, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8692742586135864, "num_tokens": 847350949.0, "step": 22206 }, { "epoch": 2.824958656659458, "ewc_loss": 0.07518835365772247, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038811395643278956, "grad_norm": 8.839580535888672, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8696268796920776, "num_tokens": 847388945.0, "step": 22207 }, { "epoch": 2.8250858669380485, "ewc_loss": 0.07549583911895752, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003887474595103413, "grad_norm": 8.908148765563965, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8772753477096558, "num_tokens": 847435114.0, "step": 22208 }, { "epoch": 2.825213077216639, "ewc_loss": 0.07544758915901184, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003882649471051991, "grad_norm": 8.917815208435059, "learning_rate": 1e-06, "loss": 0.4878, "mean_token_accuracy": 0.8566104173660278, "num_tokens": 847470815.0, "step": 22209 }, { "epoch": 2.8253402874952296, "ewc_loss": 0.07512257248163223, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003874562098644674, "grad_norm": 8.855205535888672, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8627676963806152, "num_tokens": 847509379.0, "step": 22210 }, { "epoch": 2.82546749777382, "ewc_loss": 0.07517731189727783, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003880035656038672, "grad_norm": 8.886974334716797, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8695583939552307, "num_tokens": 847547427.0, "step": 22211 }, { "epoch": 2.8255947080524106, "ewc_loss": 0.07503429055213928, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038657334516756237, "grad_norm": 8.87894058227539, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8483524322509766, "num_tokens": 847584740.0, "step": 22212 }, { "epoch": 2.825721918331001, "ewc_loss": 0.07523100078105927, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038854044396430254, "grad_norm": 8.875457763671875, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.85505211353302, "num_tokens": 847626201.0, "step": 22213 }, { "epoch": 2.8258491286095917, "ewc_loss": 0.07498978078365326, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000386128289392218, "grad_norm": 8.821000099182129, "learning_rate": 1e-06, "loss": 0.4214, "mean_token_accuracy": 0.8795216679573059, "num_tokens": 847661311.0, "step": 22214 }, { "epoch": 2.8259763388881822, "ewc_loss": 0.07524745911359787, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003887050552293658, "grad_norm": 8.92107105255127, "learning_rate": 1e-06, "loss": 0.4533, "mean_token_accuracy": 0.8641993999481201, "num_tokens": 847695234.0, "step": 22215 }, { "epoch": 2.8261035491667728, "ewc_loss": 0.0749596431851387, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003858269192278385, "grad_norm": 8.781404495239258, "learning_rate": 1e-06, "loss": 0.4053, "mean_token_accuracy": 0.8836344480514526, "num_tokens": 847738848.0, "step": 22216 }, { "epoch": 2.8262307594453633, "ewc_loss": 0.07527273893356323, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038895782199688256, "grad_norm": 8.866214752197266, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8693054914474487, "num_tokens": 847775952.0, "step": 22217 }, { "epoch": 2.826357969723954, "ewc_loss": 0.07509636133909225, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038719410076737404, "grad_norm": 8.936986923217773, "learning_rate": 1e-06, "loss": 0.4173, "mean_token_accuracy": 0.882177472114563, "num_tokens": 847809381.0, "step": 22218 }, { "epoch": 2.8264851800025443, "ewc_loss": 0.0750114843249321, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038634531665593386, "grad_norm": 8.866724014282227, "learning_rate": 1e-06, "loss": 0.4814, "mean_token_accuracy": 0.8624093532562256, "num_tokens": 847849835.0, "step": 22219 }, { "epoch": 2.826612390281135, "ewc_loss": 0.0752662867307663, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038889329880476, "grad_norm": 8.97562313079834, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8639249801635742, "num_tokens": 847882726.0, "step": 22220 }, { "epoch": 2.8267396005597254, "ewc_loss": 0.0748773068189621, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038500348455272615, "grad_norm": 8.894393920898438, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8662185668945312, "num_tokens": 847920663.0, "step": 22221 }, { "epoch": 2.8268668108383155, "ewc_loss": 0.07516968250274658, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003879272844642401, "grad_norm": 8.90517520904541, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8721337914466858, "num_tokens": 847962204.0, "step": 22222 }, { "epoch": 2.8269940211169065, "ewc_loss": 0.07503773272037506, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003866077749989927, "grad_norm": 8.926356315612793, "learning_rate": 1e-06, "loss": 0.4871, "mean_token_accuracy": 0.8576914668083191, "num_tokens": 848003722.0, "step": 22223 }, { "epoch": 2.8271212313954965, "ewc_loss": 0.07493878901004791, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038561836117878556, "grad_norm": 8.855420112609863, "learning_rate": 1e-06, "loss": 0.449, "mean_token_accuracy": 0.8745967149734497, "num_tokens": 848045335.0, "step": 22224 }, { "epoch": 2.8272484416740875, "ewc_loss": 0.0751657485961914, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000387887965189293, "grad_norm": 8.91300106048584, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.857955813407898, "num_tokens": 848088817.0, "step": 22225 }, { "epoch": 2.8273756519526776, "ewc_loss": 0.07494090497493744, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003856395196635276, "grad_norm": 8.914915084838867, "learning_rate": 1e-06, "loss": 0.4954, "mean_token_accuracy": 0.8561758399009705, "num_tokens": 848120993.0, "step": 22226 }, { "epoch": 2.827502862231268, "ewc_loss": 0.0749785378575325, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038601586129516363, "grad_norm": 8.86102294921875, "learning_rate": 1e-06, "loss": 0.5075, "mean_token_accuracy": 0.8561403751373291, "num_tokens": 848162542.0, "step": 22227 }, { "epoch": 2.8276300725098586, "ewc_loss": 0.07512229681015015, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038745347410440445, "grad_norm": 8.917923927307129, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8779760599136353, "num_tokens": 848198194.0, "step": 22228 }, { "epoch": 2.827757282788449, "ewc_loss": 0.07505623251199722, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038679278804920614, "grad_norm": 8.932050704956055, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.861024022102356, "num_tokens": 848240575.0, "step": 22229 }, { "epoch": 2.8278844930670397, "ewc_loss": 0.0751064345240593, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003872948291245848, "grad_norm": 8.950884819030762, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8656989336013794, "num_tokens": 848289026.0, "step": 22230 }, { "epoch": 2.8280117033456302, "ewc_loss": 0.07500417530536652, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003862721787299961, "grad_norm": 9.020135879516602, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8672003746032715, "num_tokens": 848326438.0, "step": 22231 }, { "epoch": 2.8281389136242208, "ewc_loss": 0.07492662966251373, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038549673627130687, "grad_norm": 8.89089584350586, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8722979426383972, "num_tokens": 848368106.0, "step": 22232 }, { "epoch": 2.8282661239028113, "ewc_loss": 0.07520042359828949, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003882346791215241, "grad_norm": 8.970696449279785, "learning_rate": 1e-06, "loss": 0.4745, "mean_token_accuracy": 0.863096296787262, "num_tokens": 848409775.0, "step": 22233 }, { "epoch": 2.828393334181402, "ewc_loss": 0.07484105229377747, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038464099634438753, "grad_norm": 8.991822242736816, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8657746315002441, "num_tokens": 848442149.0, "step": 22234 }, { "epoch": 2.8285205444599923, "ewc_loss": 0.0749630481004715, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000385860912501812, "grad_norm": 8.9210844039917, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8597754836082458, "num_tokens": 848475468.0, "step": 22235 }, { "epoch": 2.828647754738583, "ewc_loss": 0.07491885870695114, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038541905814781785, "grad_norm": 8.938919067382812, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8665006160736084, "num_tokens": 848509565.0, "step": 22236 }, { "epoch": 2.8287749650171734, "ewc_loss": 0.07490639388561249, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003852944355458021, "grad_norm": 8.855809211730957, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8675388097763062, "num_tokens": 848546357.0, "step": 22237 }, { "epoch": 2.828902175295764, "ewc_loss": 0.07508605718612671, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038709104410372674, "grad_norm": 8.884452819824219, "learning_rate": 1e-06, "loss": 0.4963, "mean_token_accuracy": 0.854994535446167, "num_tokens": 848587661.0, "step": 22238 }, { "epoch": 2.8290293855743545, "ewc_loss": 0.07480292022228241, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003842596779577434, "grad_norm": 8.845019340515137, "learning_rate": 1e-06, "loss": 0.4262, "mean_token_accuracy": 0.8695490956306458, "num_tokens": 848628250.0, "step": 22239 }, { "epoch": 2.829156595852945, "ewc_loss": 0.0750855952501297, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003870864456985146, "grad_norm": 8.965837478637695, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8660049438476562, "num_tokens": 848670452.0, "step": 22240 }, { "epoch": 2.8292838061315355, "ewc_loss": 0.07472764700651169, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038350693648681045, "grad_norm": 8.928390502929688, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8670377135276794, "num_tokens": 848703592.0, "step": 22241 }, { "epoch": 2.829411016410126, "ewc_loss": 0.07508211582899094, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038705160841345787, "grad_norm": 8.951985359191895, "learning_rate": 1e-06, "loss": 0.5216, "mean_token_accuracy": 0.8500388860702515, "num_tokens": 848739291.0, "step": 22242 }, { "epoch": 2.8295382266887166, "ewc_loss": 0.07492562383413315, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038548672455362976, "grad_norm": 8.897177696228027, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8589357137680054, "num_tokens": 848776039.0, "step": 22243 }, { "epoch": 2.829665436967307, "ewc_loss": 0.07507658749818802, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038699634023942053, "grad_norm": 8.873496055603027, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8695398569107056, "num_tokens": 848814799.0, "step": 22244 }, { "epoch": 2.8297926472458976, "ewc_loss": 0.07504692673683167, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038669968489557505, "grad_norm": 8.857462882995605, "learning_rate": 1e-06, "loss": 0.4576, "mean_token_accuracy": 0.8672740459442139, "num_tokens": 848860123.0, "step": 22245 }, { "epoch": 2.829919857524488, "ewc_loss": 0.0752846747636795, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000389077264117077, "grad_norm": 8.9666748046875, "learning_rate": 1e-06, "loss": 0.4045, "mean_token_accuracy": 0.8829083442687988, "num_tokens": 848900447.0, "step": 22246 }, { "epoch": 2.8300470678030782, "ewc_loss": 0.07490098476409912, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038524033152498305, "grad_norm": 8.849955558776855, "learning_rate": 1e-06, "loss": 0.4907, "mean_token_accuracy": 0.859088659286499, "num_tokens": 848945385.0, "step": 22247 }, { "epoch": 2.830174278081669, "ewc_loss": 0.07541093230247498, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039033981738612056, "grad_norm": 9.062036514282227, "learning_rate": 1e-06, "loss": 0.4893, "mean_token_accuracy": 0.8549914360046387, "num_tokens": 848978783.0, "step": 22248 }, { "epoch": 2.8303014883602593, "ewc_loss": 0.0748516395688057, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038474687607958913, "grad_norm": 8.842385292053223, "learning_rate": 1e-06, "loss": 0.5058, "mean_token_accuracy": 0.8533985614776611, "num_tokens": 849017324.0, "step": 22249 }, { "epoch": 2.8304286986388503, "ewc_loss": 0.07564280927181244, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039265857776626945, "grad_norm": 9.10917854309082, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8875192403793335, "num_tokens": 849045928.0, "step": 22250 }, { "epoch": 2.8305559089174404, "ewc_loss": 0.07471148669719696, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038334535202011466, "grad_norm": 9.207965850830078, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8651801347732544, "num_tokens": 849080855.0, "step": 22251 }, { "epoch": 2.830683119196031, "ewc_loss": 0.07465922087430954, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003828226763289422, "grad_norm": 8.773092269897461, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8694069385528564, "num_tokens": 849114835.0, "step": 22252 }, { "epoch": 2.8308103294746214, "ewc_loss": 0.07550935447216034, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000391323963413015, "grad_norm": 9.038614273071289, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8545814752578735, "num_tokens": 849152632.0, "step": 22253 }, { "epoch": 2.830937539753212, "ewc_loss": 0.07453477382659912, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003815781674347818, "grad_norm": 8.766580581665039, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8735232353210449, "num_tokens": 849189911.0, "step": 22254 }, { "epoch": 2.8310647500318025, "ewc_loss": 0.07568001747131348, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003930306702386588, "grad_norm": 9.053993225097656, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8632192611694336, "num_tokens": 849227847.0, "step": 22255 }, { "epoch": 2.831191960310393, "ewc_loss": 0.07450617849826813, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003812922805082053, "grad_norm": 8.710701942443848, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8713926672935486, "num_tokens": 849265778.0, "step": 22256 }, { "epoch": 2.8313191705889835, "ewc_loss": 0.07585792243480682, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039480964187532663, "grad_norm": 9.018147468566895, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.864920973777771, "num_tokens": 849302488.0, "step": 22257 }, { "epoch": 2.831446380867574, "ewc_loss": 0.07474225759506226, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003836530668195337, "grad_norm": 8.844858169555664, "learning_rate": 1e-06, "loss": 0.4132, "mean_token_accuracy": 0.8816319704055786, "num_tokens": 849337483.0, "step": 22258 }, { "epoch": 2.8315735911461646, "ewc_loss": 0.07591824233531952, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039297150215134025, "grad_norm": 8.98218059539795, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8712037205696106, "num_tokens": 849378491.0, "step": 22259 }, { "epoch": 2.831700801424755, "ewc_loss": 0.07530969381332397, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003868860367219895, "grad_norm": 8.876267433166504, "learning_rate": 1e-06, "loss": 0.447, "mean_token_accuracy": 0.8724779486656189, "num_tokens": 849418897.0, "step": 22260 }, { "epoch": 2.8318280117033456, "ewc_loss": 0.07561418414115906, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038993085036054254, "grad_norm": 8.941060066223145, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8690556287765503, "num_tokens": 849457057.0, "step": 22261 }, { "epoch": 2.831955221981936, "ewc_loss": 0.07520692050457001, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003858582931570709, "grad_norm": 8.803629875183105, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.869126558303833, "num_tokens": 849488825.0, "step": 22262 }, { "epoch": 2.8320824322605267, "ewc_loss": 0.07573641836643219, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003911532403435558, "grad_norm": 8.910511016845703, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8737391233444214, "num_tokens": 849528336.0, "step": 22263 }, { "epoch": 2.8322096425391172, "ewc_loss": 0.07484325766563416, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003846630861517042, "grad_norm": 8.811118125915527, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8799086809158325, "num_tokens": 849565373.0, "step": 22264 }, { "epoch": 2.8323368528177078, "ewc_loss": 0.07547375559806824, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003909680526703596, "grad_norm": 8.925040245056152, "learning_rate": 1e-06, "loss": 0.4185, "mean_token_accuracy": 0.8779373168945312, "num_tokens": 849600909.0, "step": 22265 }, { "epoch": 2.8324640630962983, "ewc_loss": 0.0749703049659729, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003859334974549711, "grad_norm": 8.827871322631836, "learning_rate": 1e-06, "loss": 0.4945, "mean_token_accuracy": 0.856640100479126, "num_tokens": 849639499.0, "step": 22266 }, { "epoch": 2.832591273374889, "ewc_loss": 0.07526031136512756, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003888336068484932, "grad_norm": 8.894514083862305, "learning_rate": 1e-06, "loss": 0.4225, "mean_token_accuracy": 0.8766081929206848, "num_tokens": 849677232.0, "step": 22267 }, { "epoch": 2.8327184836534793, "ewc_loss": 0.07502295821905136, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003864600439555943, "grad_norm": 8.793779373168945, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8722196221351624, "num_tokens": 849717187.0, "step": 22268 }, { "epoch": 2.83284569393207, "ewc_loss": 0.0752384215593338, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003886146587319672, "grad_norm": 8.881776809692383, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.875334620475769, "num_tokens": 849756545.0, "step": 22269 }, { "epoch": 2.83297290421066, "ewc_loss": 0.07534053921699524, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003871944500133395, "grad_norm": 8.868110656738281, "learning_rate": 1e-06, "loss": 0.4343, "mean_token_accuracy": 0.8684810400009155, "num_tokens": 849788670.0, "step": 22270 }, { "epoch": 2.833100114489251, "ewc_loss": 0.07522836327552795, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003885141049977392, "grad_norm": 8.874561309814453, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8662740588188171, "num_tokens": 849825715.0, "step": 22271 }, { "epoch": 2.833227324767841, "ewc_loss": 0.07529128342866898, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003891433007083833, "grad_norm": 8.953041076660156, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.871995747089386, "num_tokens": 849856435.0, "step": 22272 }, { "epoch": 2.833354535046432, "ewc_loss": 0.0753757506608963, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038754657725803554, "grad_norm": 8.882547378540039, "learning_rate": 1e-06, "loss": 0.4191, "mean_token_accuracy": 0.881142258644104, "num_tokens": 849895634.0, "step": 22273 }, { "epoch": 2.833481745325022, "ewc_loss": 0.07519589364528656, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038818936445750296, "grad_norm": 8.86297607421875, "learning_rate": 1e-06, "loss": 0.5291, "mean_token_accuracy": 0.8458163738250732, "num_tokens": 849938646.0, "step": 22274 }, { "epoch": 2.833608955603613, "ewc_loss": 0.07518576830625534, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003880881704390049, "grad_norm": 8.916149139404297, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8640784025192261, "num_tokens": 849978642.0, "step": 22275 }, { "epoch": 2.833736165882203, "ewc_loss": 0.07510754466056824, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038730588858015835, "grad_norm": 8.891060829162598, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.865526556968689, "num_tokens": 850017820.0, "step": 22276 }, { "epoch": 2.8338633761607936, "ewc_loss": 0.0751490443944931, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003877208800986409, "grad_norm": 8.894893646240234, "learning_rate": 1e-06, "loss": 0.5269, "mean_token_accuracy": 0.8480749130249023, "num_tokens": 850056619.0, "step": 22277 }, { "epoch": 2.833990586439384, "ewc_loss": 0.07520030438899994, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003882335149683058, "grad_norm": 8.97425651550293, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8630713224411011, "num_tokens": 850092456.0, "step": 22278 }, { "epoch": 2.8341177967179747, "ewc_loss": 0.07499366253614426, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003861670847982168, "grad_norm": 8.852446556091309, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8623038530349731, "num_tokens": 850131598.0, "step": 22279 }, { "epoch": 2.8342450069965652, "ewc_loss": 0.07533743977546692, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038960485835559666, "grad_norm": 8.942574501037598, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8566083908081055, "num_tokens": 850170884.0, "step": 22280 }, { "epoch": 2.8343722172751558, "ewc_loss": 0.07487937808036804, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003850242937915027, "grad_norm": 8.779289245605469, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8632624745368958, "num_tokens": 850206989.0, "step": 22281 }, { "epoch": 2.8344994275537463, "ewc_loss": 0.07549384981393814, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003911689855158329, "grad_norm": 8.889389038085938, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8623274564743042, "num_tokens": 850248602.0, "step": 22282 }, { "epoch": 2.834626637832337, "ewc_loss": 0.07505965977907181, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038682707236148417, "grad_norm": 8.869431495666504, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8571926355361938, "num_tokens": 850289790.0, "step": 22283 }, { "epoch": 2.8347538481109273, "ewc_loss": 0.07530905306339264, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003893210378009826, "grad_norm": 8.912124633789062, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8745007514953613, "num_tokens": 850326688.0, "step": 22284 }, { "epoch": 2.834881058389518, "ewc_loss": 0.07514916360378265, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038772207335568964, "grad_norm": 8.862777709960938, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8781381845474243, "num_tokens": 850366460.0, "step": 22285 }, { "epoch": 2.8350082686681084, "ewc_loss": 0.07537302374839783, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038996065268293023, "grad_norm": 9.023347854614258, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.8733438849449158, "num_tokens": 850400046.0, "step": 22286 }, { "epoch": 2.835135478946699, "ewc_loss": 0.0749591812491417, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003858222917187959, "grad_norm": 8.819533348083496, "learning_rate": 1e-06, "loss": 0.5067, "mean_token_accuracy": 0.8559863567352295, "num_tokens": 850434374.0, "step": 22287 }, { "epoch": 2.8352626892252895, "ewc_loss": 0.0755959302186966, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003921897441614419, "grad_norm": 8.966012954711914, "learning_rate": 1e-06, "loss": 0.4575, "mean_token_accuracy": 0.8680475354194641, "num_tokens": 850469056.0, "step": 22288 }, { "epoch": 2.83538989950388, "ewc_loss": 0.07491157948970795, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000385346298571676, "grad_norm": 8.796295166015625, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8632336854934692, "num_tokens": 850505805.0, "step": 22289 }, { "epoch": 2.8355171097824705, "ewc_loss": 0.07564502209424973, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039268069667741656, "grad_norm": 9.043513298034668, "learning_rate": 1e-06, "loss": 0.5066, "mean_token_accuracy": 0.8540724515914917, "num_tokens": 850550056.0, "step": 22290 }, { "epoch": 2.835644320061061, "ewc_loss": 0.07478854060173035, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038411590503528714, "grad_norm": 8.82272720336914, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8749491572380066, "num_tokens": 850585075.0, "step": 22291 }, { "epoch": 2.8357715303396516, "ewc_loss": 0.07559668272733688, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003921972820535302, "grad_norm": 9.013969421386719, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8768455982208252, "num_tokens": 850625758.0, "step": 22292 }, { "epoch": 2.835898740618242, "ewc_loss": 0.07469324767589569, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038316298741847277, "grad_norm": 8.705046653747559, "learning_rate": 1e-06, "loss": 0.4115, "mean_token_accuracy": 0.8795223832130432, "num_tokens": 850664608.0, "step": 22293 }, { "epoch": 2.8360259508968326, "ewc_loss": 0.07587191462516785, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003949496312998235, "grad_norm": 9.18414306640625, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.874433159828186, "num_tokens": 850705897.0, "step": 22294 }, { "epoch": 2.8361531611754227, "ewc_loss": 0.07438591122627258, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038008959381841123, "grad_norm": 8.66213607788086, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8605976104736328, "num_tokens": 850746195.0, "step": 22295 }, { "epoch": 2.8362803714540137, "ewc_loss": 0.07616647332906723, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003978952008765191, "grad_norm": 9.224600791931152, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8734718561172485, "num_tokens": 850785183.0, "step": 22296 }, { "epoch": 2.8364075817326038, "ewc_loss": 0.07443827390670776, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003806132299359888, "grad_norm": 8.738618850708008, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8713083267211914, "num_tokens": 850819558.0, "step": 22297 }, { "epoch": 2.8365347920111947, "ewc_loss": 0.07623782008886337, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039860865217633545, "grad_norm": 9.363992691040039, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8574130535125732, "num_tokens": 850857385.0, "step": 22298 }, { "epoch": 2.836662002289785, "ewc_loss": 0.07426097989082336, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000378840253688395, "grad_norm": 8.671462059020996, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.861298680305481, "num_tokens": 850895297.0, "step": 22299 }, { "epoch": 2.836789212568376, "ewc_loss": 0.07663274556398392, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004025579255539924, "grad_norm": 9.301424980163574, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8657562732696533, "num_tokens": 850930982.0, "step": 22300 }, { "epoch": 2.836916422846966, "ewc_loss": 0.07440793514251709, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003803097933996469, "grad_norm": 8.766159057617188, "learning_rate": 1e-06, "loss": 0.4868, "mean_token_accuracy": 0.8621566295623779, "num_tokens": 850969032.0, "step": 22301 }, { "epoch": 2.8370436331255564, "ewc_loss": 0.07638579607009888, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004000884364359081, "grad_norm": 9.195640563964844, "learning_rate": 1e-06, "loss": 0.4564, "mean_token_accuracy": 0.8691318035125732, "num_tokens": 851008325.0, "step": 22302 }, { "epoch": 2.837170843404147, "ewc_loss": 0.07478366047143936, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038406706880778074, "grad_norm": 8.910408020019531, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8681507110595703, "num_tokens": 851044289.0, "step": 22303 }, { "epoch": 2.8372980536827375, "ewc_loss": 0.07568085938692093, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003930390812456608, "grad_norm": 9.096965789794922, "learning_rate": 1e-06, "loss": 0.4419, "mean_token_accuracy": 0.8725631237030029, "num_tokens": 851079887.0, "step": 22304 }, { "epoch": 2.837425263961328, "ewc_loss": 0.07480081170797348, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003842385776806623, "grad_norm": 8.956604957580566, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8689420223236084, "num_tokens": 851124535.0, "step": 22305 }, { "epoch": 2.8375524742399185, "ewc_loss": 0.07530157268047333, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003892462409567088, "grad_norm": 8.961685180664062, "learning_rate": 1e-06, "loss": 0.4259, "mean_token_accuracy": 0.876454770565033, "num_tokens": 851162982.0, "step": 22306 }, { "epoch": 2.837679684518509, "ewc_loss": 0.07505427300930023, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003867731720674783, "grad_norm": 8.984277725219727, "learning_rate": 1e-06, "loss": 0.4263, "mean_token_accuracy": 0.874687910079956, "num_tokens": 851201864.0, "step": 22307 }, { "epoch": 2.8378068947970996, "ewc_loss": 0.07498867064714432, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003861171717289835, "grad_norm": 8.92729377746582, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.869559109210968, "num_tokens": 851241659.0, "step": 22308 }, { "epoch": 2.83793410507569, "ewc_loss": 0.07520824670791626, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038831293932162225, "grad_norm": 8.973869323730469, "learning_rate": 1e-06, "loss": 0.4835, "mean_token_accuracy": 0.8576710224151611, "num_tokens": 851278861.0, "step": 22309 }, { "epoch": 2.8380613153542806, "ewc_loss": 0.07482695579528809, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000384500075597316, "grad_norm": 8.915730476379395, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8674012422561646, "num_tokens": 851314442.0, "step": 22310 }, { "epoch": 2.838188525632871, "ewc_loss": 0.07533667981624603, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003895972913596779, "grad_norm": 8.952256202697754, "learning_rate": 1e-06, "loss": 0.4746, "mean_token_accuracy": 0.8616067171096802, "num_tokens": 851356322.0, "step": 22311 }, { "epoch": 2.8383157359114617, "ewc_loss": 0.07492119073867798, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003854423703160137, "grad_norm": 8.869616508483887, "learning_rate": 1e-06, "loss": 0.5007, "mean_token_accuracy": 0.8543076515197754, "num_tokens": 851395374.0, "step": 22312 }, { "epoch": 2.838442946190052, "ewc_loss": 0.07517196238040924, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003879500727634877, "grad_norm": 8.974336624145508, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8653024435043335, "num_tokens": 851430957.0, "step": 22313 }, { "epoch": 2.8385701564686427, "ewc_loss": 0.07489781081676483, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003852085501421243, "grad_norm": 8.837287902832031, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8614836931228638, "num_tokens": 851474357.0, "step": 22314 }, { "epoch": 2.8386973667472333, "ewc_loss": 0.07533383369445801, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003895687696058303, "grad_norm": 9.000137329101562, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8793504238128662, "num_tokens": 851510346.0, "step": 22315 }, { "epoch": 2.838824577025824, "ewc_loss": 0.07505185902118683, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003867490740958601, "grad_norm": 8.872965812683105, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8686081767082214, "num_tokens": 851546798.0, "step": 22316 }, { "epoch": 2.8389517873044143, "ewc_loss": 0.07532578706741333, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038948829751461744, "grad_norm": 9.009270668029785, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8697391748428345, "num_tokens": 851587819.0, "step": 22317 }, { "epoch": 2.839078997583005, "ewc_loss": 0.0748983770608902, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038521422538906336, "grad_norm": 8.838595390319824, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8708860874176025, "num_tokens": 851628764.0, "step": 22318 }, { "epoch": 2.8392062078615954, "ewc_loss": 0.07556362450122833, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003918666625395417, "grad_norm": 8.922048568725586, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8537155985832214, "num_tokens": 851672206.0, "step": 22319 }, { "epoch": 2.8393334181401855, "ewc_loss": 0.07508689165115356, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038709936779923737, "grad_norm": 8.855724334716797, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8609137535095215, "num_tokens": 851709969.0, "step": 22320 }, { "epoch": 2.8394606284187764, "ewc_loss": 0.07543991506099701, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003906295751221478, "grad_norm": 8.971323013305664, "learning_rate": 1e-06, "loss": 0.493, "mean_token_accuracy": 0.8593348860740662, "num_tokens": 851749015.0, "step": 22321 }, { "epoch": 2.8395878386973665, "ewc_loss": 0.07515605539083481, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038779102033004165, "grad_norm": 8.909262657165527, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.8715180158615112, "num_tokens": 851787210.0, "step": 22322 }, { "epoch": 2.8397150489759575, "ewc_loss": 0.07531332224607468, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038936370401643217, "grad_norm": 8.992650985717773, "learning_rate": 1e-06, "loss": 0.4806, "mean_token_accuracy": 0.8606191277503967, "num_tokens": 851824677.0, "step": 22323 }, { "epoch": 2.8398422592545476, "ewc_loss": 0.0751771554350853, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038800202310085297, "grad_norm": 8.995262145996094, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.8722364902496338, "num_tokens": 851863614.0, "step": 22324 }, { "epoch": 2.839969469533138, "ewc_loss": 0.07510048151016235, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003872353117913008, "grad_norm": 8.930793762207031, "learning_rate": 1e-06, "loss": 0.4777, "mean_token_accuracy": 0.8608391284942627, "num_tokens": 851908153.0, "step": 22325 }, { "epoch": 2.8400966798117286, "ewc_loss": 0.0752996876835823, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038922735257074237, "grad_norm": 8.946791648864746, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.862912654876709, "num_tokens": 851946473.0, "step": 22326 }, { "epoch": 2.840223890090319, "ewc_loss": 0.07497567683458328, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003859872231259942, "grad_norm": 8.864938735961914, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8564274311065674, "num_tokens": 851991600.0, "step": 22327 }, { "epoch": 2.8403511003689097, "ewc_loss": 0.07535738497972488, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038980430690571666, "grad_norm": 8.930370330810547, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8674740791320801, "num_tokens": 852029468.0, "step": 22328 }, { "epoch": 2.8404783106475002, "ewc_loss": 0.07513612508773804, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003875917464029044, "grad_norm": 9.055423736572266, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8718543648719788, "num_tokens": 852061400.0, "step": 22329 }, { "epoch": 2.8406055209260908, "ewc_loss": 0.07490430772304535, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003852735098917037, "grad_norm": 8.914655685424805, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8666605949401855, "num_tokens": 852100596.0, "step": 22330 }, { "epoch": 2.8407327312046813, "ewc_loss": 0.07513178139925003, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003875482943840325, "grad_norm": 8.889566421508789, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.86869215965271, "num_tokens": 852140320.0, "step": 22331 }, { "epoch": 2.840859941483272, "ewc_loss": 0.07485809922218323, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003848114574793726, "grad_norm": 8.898290634155273, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.8643227815628052, "num_tokens": 852175969.0, "step": 22332 }, { "epoch": 2.8409871517618623, "ewc_loss": 0.07510589808225632, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003872894449159503, "grad_norm": 8.989070892333984, "learning_rate": 1e-06, "loss": 0.5087, "mean_token_accuracy": 0.8524118661880493, "num_tokens": 852210816.0, "step": 22333 }, { "epoch": 2.841114362040453, "ewc_loss": 0.07494524121284485, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038568288437090814, "grad_norm": 8.893692016601562, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8651592135429382, "num_tokens": 852246684.0, "step": 22334 }, { "epoch": 2.8412415723190434, "ewc_loss": 0.07518349587917328, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038806541124358773, "grad_norm": 8.940988540649414, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8698415160179138, "num_tokens": 852285283.0, "step": 22335 }, { "epoch": 2.841368782597634, "ewc_loss": 0.07493485510349274, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038557901280000806, "grad_norm": 8.857694625854492, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8730239868164062, "num_tokens": 852323310.0, "step": 22336 }, { "epoch": 2.8414959928762245, "ewc_loss": 0.07533638179302216, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003895942645613104, "grad_norm": 8.952715873718262, "learning_rate": 1e-06, "loss": 0.4563, "mean_token_accuracy": 0.8698102235794067, "num_tokens": 852360850.0, "step": 22337 }, { "epoch": 2.841623203154815, "ewc_loss": 0.0748499184846878, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003847296757157892, "grad_norm": 8.914730072021484, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8598206043243408, "num_tokens": 852399320.0, "step": 22338 }, { "epoch": 2.8417504134334055, "ewc_loss": 0.07535389065742493, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003897693823091686, "grad_norm": 9.030905723571777, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8635368943214417, "num_tokens": 852430980.0, "step": 22339 }, { "epoch": 2.841877623711996, "ewc_loss": 0.07472854852676392, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003835159877780825, "grad_norm": 8.796433448791504, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8777801990509033, "num_tokens": 852465688.0, "step": 22340 }, { "epoch": 2.8420048339905866, "ewc_loss": 0.07555469870567322, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039177745929919183, "grad_norm": 8.992101669311523, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8651900291442871, "num_tokens": 852504372.0, "step": 22341 }, { "epoch": 2.842132044269177, "ewc_loss": 0.0747314840555191, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038354535354301333, "grad_norm": 8.833147048950195, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8703542947769165, "num_tokens": 852542232.0, "step": 22342 }, { "epoch": 2.8422592545477676, "ewc_loss": 0.07545574009418488, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039078789995983243, "grad_norm": 9.081986427307129, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8577250242233276, "num_tokens": 852580005.0, "step": 22343 }, { "epoch": 2.842386464826358, "ewc_loss": 0.07479792833328247, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038420979399234056, "grad_norm": 8.781100273132324, "learning_rate": 1e-06, "loss": 0.4219, "mean_token_accuracy": 0.8782909512519836, "num_tokens": 852619912.0, "step": 22344 }, { "epoch": 2.8425136751049482, "ewc_loss": 0.07567337900400162, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003929642552975565, "grad_norm": 9.084993362426758, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.868982195854187, "num_tokens": 852654349.0, "step": 22345 }, { "epoch": 2.842640885383539, "ewc_loss": 0.0747886672616005, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003841171273961663, "grad_norm": 8.93094539642334, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.8588424324989319, "num_tokens": 852691300.0, "step": 22346 }, { "epoch": 2.8427680956621293, "ewc_loss": 0.0753614753484726, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003898451686836779, "grad_norm": 8.93241024017334, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8653768301010132, "num_tokens": 852731228.0, "step": 22347 }, { "epoch": 2.8428953059407203, "ewc_loss": 0.07524944841861725, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003887249913532287, "grad_norm": 9.053009986877441, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8650578856468201, "num_tokens": 852768333.0, "step": 22348 }, { "epoch": 2.8430225162193103, "ewc_loss": 0.074858158826828, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003848120104521513, "grad_norm": 8.958399772644043, "learning_rate": 1e-06, "loss": 0.5655, "mean_token_accuracy": 0.8344389796257019, "num_tokens": 852810962.0, "step": 22349 }, { "epoch": 2.843149726497901, "ewc_loss": 0.07529711723327637, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038920168299227953, "grad_norm": 8.95843505859375, "learning_rate": 1e-06, "loss": 0.4203, "mean_token_accuracy": 0.8815687894821167, "num_tokens": 852844326.0, "step": 22350 }, { "epoch": 2.8432769367764914, "ewc_loss": 0.07494194805622101, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003856499679386616, "grad_norm": 9.010937690734863, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.8602790236473083, "num_tokens": 852877737.0, "step": 22351 }, { "epoch": 2.843404147055082, "ewc_loss": 0.07486872375011444, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003849176864605397, "grad_norm": 8.8572998046875, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8736248016357422, "num_tokens": 852922085.0, "step": 22352 }, { "epoch": 2.8435313573336725, "ewc_loss": 0.07539257407188416, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000390156201319769, "grad_norm": 8.940998077392578, "learning_rate": 1e-06, "loss": 0.4075, "mean_token_accuracy": 0.8833338022232056, "num_tokens": 852960742.0, "step": 22353 }, { "epoch": 2.843658567612263, "ewc_loss": 0.07498578727245331, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003860883880406618, "grad_norm": 8.926770210266113, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.8706644177436829, "num_tokens": 852993502.0, "step": 22354 }, { "epoch": 2.8437857778908535, "ewc_loss": 0.07517846673727036, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003880151198245585, "grad_norm": 8.896622657775879, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8718700408935547, "num_tokens": 853036844.0, "step": 22355 }, { "epoch": 2.843912988169444, "ewc_loss": 0.07514873147010803, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003877177950926125, "grad_norm": 8.952905654907227, "learning_rate": 1e-06, "loss": 0.4611, "mean_token_accuracy": 0.865237832069397, "num_tokens": 853070851.0, "step": 22356 }, { "epoch": 2.8440401984480346, "ewc_loss": 0.07511065155267715, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038733697147108614, "grad_norm": 8.921865463256836, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8654093146324158, "num_tokens": 853106116.0, "step": 22357 }, { "epoch": 2.844167408726625, "ewc_loss": 0.07511328160762787, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038736333954147995, "grad_norm": 8.846464157104492, "learning_rate": 1e-06, "loss": 0.4682, "mean_token_accuracy": 0.8648369908332825, "num_tokens": 853152776.0, "step": 22358 }, { "epoch": 2.8442946190052156, "ewc_loss": 0.07532428205013275, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003894733090419322, "grad_norm": 8.948253631591797, "learning_rate": 1e-06, "loss": 0.4623, "mean_token_accuracy": 0.8655878305435181, "num_tokens": 853194176.0, "step": 22359 }, { "epoch": 2.844421829283806, "ewc_loss": 0.0750063806772232, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038629426853731275, "grad_norm": 8.862059593200684, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8640019297599792, "num_tokens": 853236834.0, "step": 22360 }, { "epoch": 2.8445490395623967, "ewc_loss": 0.07543300837278366, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003905605408363044, "grad_norm": 8.998592376708984, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8574602007865906, "num_tokens": 853274145.0, "step": 22361 }, { "epoch": 2.844676249840987, "ewc_loss": 0.07480265200138092, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038425697130151093, "grad_norm": 8.831260681152344, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8681773543357849, "num_tokens": 853312569.0, "step": 22362 }, { "epoch": 2.8448034601195777, "ewc_loss": 0.07551711797714233, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003914016124326736, "grad_norm": 8.98625659942627, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8719157576560974, "num_tokens": 853351742.0, "step": 22363 }, { "epoch": 2.8449306703981683, "ewc_loss": 0.07489462196826935, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003851767396554351, "grad_norm": 8.77863883972168, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8685688972473145, "num_tokens": 853389607.0, "step": 22364 }, { "epoch": 2.845057880676759, "ewc_loss": 0.07579939067363739, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039422442205250263, "grad_norm": 9.026693344116211, "learning_rate": 1e-06, "loss": 0.4403, "mean_token_accuracy": 0.8727642297744751, "num_tokens": 853427218.0, "step": 22365 }, { "epoch": 2.8451850909553493, "ewc_loss": 0.07488775998353958, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003851080546155572, "grad_norm": 8.827442169189453, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8705078363418579, "num_tokens": 853464773.0, "step": 22366 }, { "epoch": 2.84531230123394, "ewc_loss": 0.07554370164871216, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003916674468200654, "grad_norm": 8.925054550170898, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8664533495903015, "num_tokens": 853507250.0, "step": 22367 }, { "epoch": 2.84543951151253, "ewc_loss": 0.07517307996749878, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003879612486343831, "grad_norm": 8.931859016418457, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8690857887268066, "num_tokens": 853551947.0, "step": 22368 }, { "epoch": 2.845566721791121, "ewc_loss": 0.0752367451786995, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003885979240294546, "grad_norm": 8.912470817565918, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8585628271102905, "num_tokens": 853590557.0, "step": 22369 }, { "epoch": 2.845693932069711, "ewc_loss": 0.07521842420101166, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000388414686312899, "grad_norm": 8.941835403442383, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8601628541946411, "num_tokens": 853630870.0, "step": 22370 }, { "epoch": 2.845821142348302, "ewc_loss": 0.07526890933513641, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003889195213560015, "grad_norm": 9.001043319702148, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8631301522254944, "num_tokens": 853662360.0, "step": 22371 }, { "epoch": 2.845948352626892, "ewc_loss": 0.07507695257663727, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038699994911439717, "grad_norm": 8.880067825317383, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8609739542007446, "num_tokens": 853703661.0, "step": 22372 }, { "epoch": 2.846075562905483, "ewc_loss": 0.07545390725135803, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003907695645466447, "grad_norm": 8.987049102783203, "learning_rate": 1e-06, "loss": 0.5094, "mean_token_accuracy": 0.8557296991348267, "num_tokens": 853742398.0, "step": 22373 }, { "epoch": 2.846202773184073, "ewc_loss": 0.0751158744096756, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000387389212846756, "grad_norm": 8.867536544799805, "learning_rate": 1e-06, "loss": 0.4449, "mean_token_accuracy": 0.8741117715835571, "num_tokens": 853781479.0, "step": 22374 }, { "epoch": 2.8463299834626636, "ewc_loss": 0.07551896572113037, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003914200933650136, "grad_norm": 8.964876174926758, "learning_rate": 1e-06, "loss": 0.541, "mean_token_accuracy": 0.8414421677589417, "num_tokens": 853822330.0, "step": 22375 }, { "epoch": 2.846457193741254, "ewc_loss": 0.07519775629043579, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038820799090899527, "grad_norm": 8.838248252868652, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8527122139930725, "num_tokens": 853862747.0, "step": 22376 }, { "epoch": 2.8465844040198447, "ewc_loss": 0.07559935748577118, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003922240575775504, "grad_norm": 8.967199325561523, "learning_rate": 1e-06, "loss": 0.3881, "mean_token_accuracy": 0.8901551961898804, "num_tokens": 853897124.0, "step": 22377 }, { "epoch": 2.8467116142984352, "ewc_loss": 0.07509216666221619, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038715210394002497, "grad_norm": 8.906311988830566, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8526953458786011, "num_tokens": 853930516.0, "step": 22378 }, { "epoch": 2.8468388245770258, "ewc_loss": 0.07559017091989517, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003921321767847985, "grad_norm": 8.960661888122559, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8712649345397949, "num_tokens": 853967016.0, "step": 22379 }, { "epoch": 2.8469660348556163, "ewc_loss": 0.07524309307336807, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003886613994836807, "grad_norm": 8.864662170410156, "learning_rate": 1e-06, "loss": 0.506, "mean_token_accuracy": 0.8498803377151489, "num_tokens": 854008804.0, "step": 22380 }, { "epoch": 2.847093245134207, "ewc_loss": 0.07557856291532516, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039201611070893705, "grad_norm": 9.03097915649414, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8534775972366333, "num_tokens": 854048393.0, "step": 22381 }, { "epoch": 2.8472204554127973, "ewc_loss": 0.0751040130853653, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038727058563381433, "grad_norm": 8.835909843444824, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8723621368408203, "num_tokens": 854089480.0, "step": 22382 }, { "epoch": 2.847347665691388, "ewc_loss": 0.07568000257015228, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003930305247195065, "grad_norm": 8.975279808044434, "learning_rate": 1e-06, "loss": 0.4289, "mean_token_accuracy": 0.8765236139297485, "num_tokens": 854125677.0, "step": 22383 }, { "epoch": 2.8474748759699784, "ewc_loss": 0.07506582885980606, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038688877248205245, "grad_norm": 8.855172157287598, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8622965216636658, "num_tokens": 854160485.0, "step": 22384 }, { "epoch": 2.847602086248569, "ewc_loss": 0.07562676072120667, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039249812834896147, "grad_norm": 9.001928329467773, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8579481840133667, "num_tokens": 854200635.0, "step": 22385 }, { "epoch": 2.8477292965271594, "ewc_loss": 0.0749962329864502, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038619284168817103, "grad_norm": 8.806994438171387, "learning_rate": 1e-06, "loss": 0.4511, "mean_token_accuracy": 0.8707665205001831, "num_tokens": 854240046.0, "step": 22386 }, { "epoch": 2.84785650680575, "ewc_loss": 0.07570134103298187, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003932438266929239, "grad_norm": 8.987884521484375, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8776320219039917, "num_tokens": 854275880.0, "step": 22387 }, { "epoch": 2.8479837170843405, "ewc_loss": 0.07510482519865036, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038727870560251176, "grad_norm": 8.87536334991455, "learning_rate": 1e-06, "loss": 0.4583, "mean_token_accuracy": 0.8675531148910522, "num_tokens": 854311204.0, "step": 22388 }, { "epoch": 2.848110927362931, "ewc_loss": 0.0756019651889801, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039225013460963964, "grad_norm": 8.999314308166504, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8632978796958923, "num_tokens": 854346076.0, "step": 22389 }, { "epoch": 2.8482381376415216, "ewc_loss": 0.07517790794372559, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003880095900967717, "grad_norm": 8.869848251342773, "learning_rate": 1e-06, "loss": 0.4366, "mean_token_accuracy": 0.8728294372558594, "num_tokens": 854383075.0, "step": 22390 }, { "epoch": 2.848365347920112, "ewc_loss": 0.07556255161762238, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000391856039641425, "grad_norm": 8.948028564453125, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.8585792183876038, "num_tokens": 854416011.0, "step": 22391 }, { "epoch": 2.8484925581987026, "ewc_loss": 0.07524244487285614, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003886549675371498, "grad_norm": 8.878211975097656, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8687356114387512, "num_tokens": 854449520.0, "step": 22392 }, { "epoch": 2.8486197684772927, "ewc_loss": 0.0754593163728714, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039082366856746376, "grad_norm": 8.92699146270752, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8630601763725281, "num_tokens": 854484355.0, "step": 22393 }, { "epoch": 2.8487469787558837, "ewc_loss": 0.07525570690631866, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038878750638104975, "grad_norm": 8.86447525024414, "learning_rate": 1e-06, "loss": 0.5178, "mean_token_accuracy": 0.851470947265625, "num_tokens": 854526238.0, "step": 22394 }, { "epoch": 2.8488741890344738, "ewc_loss": 0.07541286945343018, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003903591714333743, "grad_norm": 9.05615520477295, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8679821491241455, "num_tokens": 854562619.0, "step": 22395 }, { "epoch": 2.8490013993130647, "ewc_loss": 0.07508040964603424, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003870345535688102, "grad_norm": 9.185009002685547, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.871841549873352, "num_tokens": 854604974.0, "step": 22396 }, { "epoch": 2.849128609591655, "ewc_loss": 0.07496444880962372, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003858749405480921, "grad_norm": 9.221833229064941, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8742618560791016, "num_tokens": 854644066.0, "step": 22397 }, { "epoch": 2.849255819870246, "ewc_loss": 0.07454048097133636, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003816353273577988, "grad_norm": 8.775497436523438, "learning_rate": 1e-06, "loss": 0.4554, "mean_token_accuracy": 0.8689760565757751, "num_tokens": 854682723.0, "step": 22398 }, { "epoch": 2.849383030148836, "ewc_loss": 0.07540947943925858, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003903252654708922, "grad_norm": 9.342124938964844, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8766599297523499, "num_tokens": 854720292.0, "step": 22399 }, { "epoch": 2.8495102404274264, "ewc_loss": 0.07403579354286194, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003765884321182966, "grad_norm": 8.952544212341309, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8621714115142822, "num_tokens": 854756917.0, "step": 22400 }, { "epoch": 2.849637450706017, "ewc_loss": 0.07518691569566727, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038809963734820485, "grad_norm": 8.98619270324707, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8668335676193237, "num_tokens": 854798771.0, "step": 22401 }, { "epoch": 2.8497646609846075, "ewc_loss": 0.07427683472633362, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003789988113567233, "grad_norm": 8.967240333557129, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.8634432554244995, "num_tokens": 854838193.0, "step": 22402 }, { "epoch": 2.849891871263198, "ewc_loss": 0.07457397133111954, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038197016692720354, "grad_norm": 8.857172966003418, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8597508668899536, "num_tokens": 854871650.0, "step": 22403 }, { "epoch": 2.8500190815417885, "ewc_loss": 0.07481034100055695, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038433392182923853, "grad_norm": 8.949533462524414, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8646905422210693, "num_tokens": 854910757.0, "step": 22404 }, { "epoch": 2.850146291820379, "ewc_loss": 0.07431893050670624, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003794197691604495, "grad_norm": 8.846116065979004, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8530171513557434, "num_tokens": 854945018.0, "step": 22405 }, { "epoch": 2.8502735020989696, "ewc_loss": 0.0749909058213234, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003861395234707743, "grad_norm": 8.908390045166016, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.8584235906600952, "num_tokens": 854988369.0, "step": 22406 }, { "epoch": 2.85040071237756, "ewc_loss": 0.0745241716504097, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003814722003880888, "grad_norm": 8.899901390075684, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8682966232299805, "num_tokens": 855023935.0, "step": 22407 }, { "epoch": 2.8505279226561506, "ewc_loss": 0.07488982379436493, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000385128689231351, "grad_norm": 8.941075325012207, "learning_rate": 1e-06, "loss": 0.4721, "mean_token_accuracy": 0.8674120903015137, "num_tokens": 855062002.0, "step": 22408 }, { "epoch": 2.850655132934741, "ewc_loss": 0.07473081350326538, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003835385723505169, "grad_norm": 8.926824569702148, "learning_rate": 1e-06, "loss": 0.4559, "mean_token_accuracy": 0.868653416633606, "num_tokens": 855101257.0, "step": 22409 }, { "epoch": 2.8507823432133317, "ewc_loss": 0.07480311393737793, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000384261627914384, "grad_norm": 8.9544038772583, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8675732016563416, "num_tokens": 855135834.0, "step": 22410 }, { "epoch": 2.850909553491922, "ewc_loss": 0.07476115226745605, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038384197978302836, "grad_norm": 8.891894340515137, "learning_rate": 1e-06, "loss": 0.454, "mean_token_accuracy": 0.8683946132659912, "num_tokens": 855174156.0, "step": 22411 }, { "epoch": 2.8510367637705127, "ewc_loss": 0.07483486831188202, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038457915070466697, "grad_norm": 8.951488494873047, "learning_rate": 1e-06, "loss": 0.4326, "mean_token_accuracy": 0.8761598467826843, "num_tokens": 855207698.0, "step": 22412 }, { "epoch": 2.8511639740491033, "ewc_loss": 0.07471771538257599, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003834076051134616, "grad_norm": 8.902999877929688, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8665528297424316, "num_tokens": 855240636.0, "step": 22413 }, { "epoch": 2.851291184327694, "ewc_loss": 0.07496900856494904, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038592054625041783, "grad_norm": 8.900675773620605, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8684296607971191, "num_tokens": 855279923.0, "step": 22414 }, { "epoch": 2.8514183946062843, "ewc_loss": 0.07471577823162079, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038338822196237743, "grad_norm": 8.93032455444336, "learning_rate": 1e-06, "loss": 0.4968, "mean_token_accuracy": 0.8543974757194519, "num_tokens": 855315460.0, "step": 22415 }, { "epoch": 2.851545604884875, "ewc_loss": 0.07493910193443298, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000385621446184814, "grad_norm": 8.89781379699707, "learning_rate": 1e-06, "loss": 0.4274, "mean_token_accuracy": 0.8740530610084534, "num_tokens": 855351548.0, "step": 22416 }, { "epoch": 2.8516728151634654, "ewc_loss": 0.07493051886558533, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003855356480926275, "grad_norm": 8.886138916015625, "learning_rate": 1e-06, "loss": 0.5199, "mean_token_accuracy": 0.846304178237915, "num_tokens": 855390715.0, "step": 22417 }, { "epoch": 2.8518000254420555, "ewc_loss": 0.0747823640704155, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003840541176032275, "grad_norm": 8.831095695495605, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.852204442024231, "num_tokens": 855431997.0, "step": 22418 }, { "epoch": 2.8519272357206464, "ewc_loss": 0.07519121468067169, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000388142594601959, "grad_norm": 8.930109024047852, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8604890704154968, "num_tokens": 855470242.0, "step": 22419 }, { "epoch": 2.8520544459992365, "ewc_loss": 0.07481769472360611, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003844074090011418, "grad_norm": 8.87635612487793, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.8533365726470947, "num_tokens": 855508901.0, "step": 22420 }, { "epoch": 2.8521816562778275, "ewc_loss": 0.07506874948740005, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038691796362400055, "grad_norm": 8.997276306152344, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8623441457748413, "num_tokens": 855545290.0, "step": 22421 }, { "epoch": 2.8523088665564176, "ewc_loss": 0.07459693402051926, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003821998252533376, "grad_norm": 8.837910652160645, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8616330623626709, "num_tokens": 855579567.0, "step": 22422 }, { "epoch": 2.852436076835008, "ewc_loss": 0.07524371892213821, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003886676568072289, "grad_norm": 8.901267051696777, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8591994643211365, "num_tokens": 855621117.0, "step": 22423 }, { "epoch": 2.8525632871135986, "ewc_loss": 0.07483838498592377, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038461427902802825, "grad_norm": 8.905328750610352, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.8699569702148438, "num_tokens": 855660494.0, "step": 22424 }, { "epoch": 2.852690497392189, "ewc_loss": 0.07490593194961548, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003852898080367595, "grad_norm": 8.953582763671875, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8643760681152344, "num_tokens": 855694043.0, "step": 22425 }, { "epoch": 2.8528177076707797, "ewc_loss": 0.07488971948623657, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003851276997011155, "grad_norm": 8.931025505065918, "learning_rate": 1e-06, "loss": 0.5038, "mean_token_accuracy": 0.851955235004425, "num_tokens": 855734615.0, "step": 22426 }, { "epoch": 2.85294491794937, "ewc_loss": 0.07494158297777176, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000385646300856024, "grad_norm": 8.911977767944336, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.876808762550354, "num_tokens": 855771141.0, "step": 22427 }, { "epoch": 2.8530721282279607, "ewc_loss": 0.07505178451538086, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038674831739626825, "grad_norm": 8.902944564819336, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8640050888061523, "num_tokens": 855806877.0, "step": 22428 }, { "epoch": 2.8531993385065513, "ewc_loss": 0.07490400969982147, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038527059950865805, "grad_norm": 9.018989562988281, "learning_rate": 1e-06, "loss": 0.4189, "mean_token_accuracy": 0.8800154328346252, "num_tokens": 855840076.0, "step": 22429 }, { "epoch": 2.853326548785142, "ewc_loss": 0.07482738047838211, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003845042665489018, "grad_norm": 11.465071678161621, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.876710057258606, "num_tokens": 855877972.0, "step": 22430 }, { "epoch": 2.8534537590637323, "ewc_loss": 0.07508812844753265, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038711176603101194, "grad_norm": 8.737372398376465, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8598816990852356, "num_tokens": 855917424.0, "step": 22431 }, { "epoch": 2.853580969342323, "ewc_loss": 0.07791544497013092, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00041538497316651046, "grad_norm": 9.560643196105957, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8558349013328552, "num_tokens": 855954853.0, "step": 22432 }, { "epoch": 2.8537081796209134, "ewc_loss": 0.07392586022615433, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00037548906402662396, "grad_norm": 10.944440841674805, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8686439990997314, "num_tokens": 855993300.0, "step": 22433 }, { "epoch": 2.853835389899504, "ewc_loss": 0.07517228275537491, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003879533032886684, "grad_norm": 8.681211471557617, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8686169981956482, "num_tokens": 856035307.0, "step": 22434 }, { "epoch": 2.8539626001780944, "ewc_loss": 0.07687972486019135, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004050277348142117, "grad_norm": 9.351980209350586, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8644931316375732, "num_tokens": 856071801.0, "step": 22435 }, { "epoch": 2.854089810456685, "ewc_loss": 0.07384591549634933, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003746896400116384, "grad_norm": 8.602078437805176, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8554835319519043, "num_tokens": 856112035.0, "step": 22436 }, { "epoch": 2.8542170207352755, "ewc_loss": 0.07813301682472229, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0004126778512727469, "grad_norm": 9.309350967407227, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8706993460655212, "num_tokens": 856148275.0, "step": 22437 }, { "epoch": 2.854344231013866, "ewc_loss": 0.0749119222164154, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003804668376687914, "grad_norm": 8.764781951904297, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8682097792625427, "num_tokens": 856180330.0, "step": 22438 }, { "epoch": 2.8544714412924566, "ewc_loss": 0.07721211016178131, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00040346875903196633, "grad_norm": 9.21783447265625, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8523387312889099, "num_tokens": 856219354.0, "step": 22439 }, { "epoch": 2.854598651571047, "ewc_loss": 0.07525353133678436, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003838829870801419, "grad_norm": 8.726367950439453, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8764621019363403, "num_tokens": 856257583.0, "step": 22440 }, { "epoch": 2.8547258618496376, "ewc_loss": 0.07708960771560669, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0004022437206003815, "grad_norm": 9.215559959411621, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8479311466217041, "num_tokens": 856298056.0, "step": 22441 }, { "epoch": 2.854853072128228, "ewc_loss": 0.07526012510061264, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003839489072561264, "grad_norm": 8.730035781860352, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8611642122268677, "num_tokens": 856334109.0, "step": 22442 }, { "epoch": 2.8549802824068182, "ewc_loss": 0.07687211036682129, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00040006876224651933, "grad_norm": 9.0979585647583, "learning_rate": 1e-06, "loss": 0.4537, "mean_token_accuracy": 0.8684039115905762, "num_tokens": 856376655.0, "step": 22443 }, { "epoch": 2.855107492685409, "ewc_loss": 0.07536222040653229, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003849698114208877, "grad_norm": 8.834195137023926, "learning_rate": 1e-06, "loss": 0.408, "mean_token_accuracy": 0.8817321062088013, "num_tokens": 856409926.0, "step": 22444 }, { "epoch": 2.8552347029639993, "ewc_loss": 0.07639441639184952, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00039529180503450334, "grad_norm": 11.550566673278809, "learning_rate": 1e-06, "loss": 0.5042, "mean_token_accuracy": 0.8556700944900513, "num_tokens": 856451471.0, "step": 22445 }, { "epoch": 2.8553619132425903, "ewc_loss": 0.0770152360200882, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0004015000013168901, "grad_norm": 8.924379348754883, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8753098845481873, "num_tokens": 856483237.0, "step": 22446 }, { "epoch": 2.8554891235211803, "ewc_loss": 0.07857346534729004, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00041708230855874717, "grad_norm": 9.400858879089355, "learning_rate": 1e-06, "loss": 0.5052, "mean_token_accuracy": 0.8576444387435913, "num_tokens": 856521278.0, "step": 22447 }, { "epoch": 2.855616333799771, "ewc_loss": 0.07523548603057861, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00038370254333131015, "grad_norm": 8.789518356323242, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.862238883972168, "num_tokens": 856565048.0, "step": 22448 }, { "epoch": 2.8557435440783614, "ewc_loss": 0.07812346518039703, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0004125823616050184, "grad_norm": 9.31984806060791, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.869828462600708, "num_tokens": 856603756.0, "step": 22449 }, { "epoch": 2.855870754356952, "ewc_loss": 0.07572096586227417, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00038855732418596745, "grad_norm": 8.914494514465332, "learning_rate": 1e-06, "loss": 0.4526, "mean_token_accuracy": 0.8711122274398804, "num_tokens": 856640677.0, "step": 22450 }, { "epoch": 2.8559979646355425, "ewc_loss": 0.0767340138554573, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003986877854913473, "grad_norm": 9.057811737060547, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8658526539802551, "num_tokens": 856679982.0, "step": 22451 }, { "epoch": 2.856125174914133, "ewc_loss": 0.07597171515226364, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00039106482290662825, "grad_norm": 8.899603843688965, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8531088829040527, "num_tokens": 856720621.0, "step": 22452 }, { "epoch": 2.8562523851927235, "ewc_loss": 0.07625256478786469, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003938732552342117, "grad_norm": 8.980817794799805, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8747388124465942, "num_tokens": 856757172.0, "step": 22453 }, { "epoch": 2.856379595471314, "ewc_loss": 0.07591289281845093, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00039047657628543675, "grad_norm": 8.910102844238281, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.863510251045227, "num_tokens": 856796876.0, "step": 22454 }, { "epoch": 2.8565068057499046, "ewc_loss": 0.07593797147274017, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00039072733488865197, "grad_norm": 8.889451026916504, "learning_rate": 1e-06, "loss": 0.4662, "mean_token_accuracy": 0.8658237457275391, "num_tokens": 856831148.0, "step": 22455 }, { "epoch": 2.856634016028495, "ewc_loss": 0.07594221085309982, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003907697682734579, "grad_norm": 8.896330833435059, "learning_rate": 1e-06, "loss": 0.4767, "mean_token_accuracy": 0.8615184426307678, "num_tokens": 856867091.0, "step": 22456 }, { "epoch": 2.8567612263070856, "ewc_loss": 0.07588455080986023, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003901931340806186, "grad_norm": 8.958683013916016, "learning_rate": 1e-06, "loss": 0.5057, "mean_token_accuracy": 0.8549220561981201, "num_tokens": 856906118.0, "step": 22457 }, { "epoch": 2.856888436585676, "ewc_loss": 0.07570478320121765, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003883954486809671, "grad_norm": 8.79067611694336, "learning_rate": 1e-06, "loss": 0.5471, "mean_token_accuracy": 0.8398821949958801, "num_tokens": 856942694.0, "step": 22458 }, { "epoch": 2.8570156468642667, "ewc_loss": 0.07617141306400299, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00039306184044107795, "grad_norm": 8.978816986083984, "learning_rate": 1e-06, "loss": 0.4892, "mean_token_accuracy": 0.8570372462272644, "num_tokens": 856977530.0, "step": 22459 }, { "epoch": 2.857142857142857, "ewc_loss": 0.07543131709098816, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00038566088187508285, "grad_norm": 8.74234676361084, "learning_rate": 1e-06, "loss": 0.4478, "mean_token_accuracy": 0.8680951595306396, "num_tokens": 857012417.0, "step": 22460 }, { "epoch": 2.8572700674214477, "ewc_loss": 0.0764712244272232, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.000396059884224087, "grad_norm": 8.90738582611084, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8591267466545105, "num_tokens": 857053556.0, "step": 22461 }, { "epoch": 2.8573972777000383, "ewc_loss": 0.07555698603391647, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00038691749796271324, "grad_norm": 8.739909172058105, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8703749179840088, "num_tokens": 857094514.0, "step": 22462 }, { "epoch": 2.857524487978629, "ewc_loss": 0.07635466754436493, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00039489430491812527, "grad_norm": 8.890613555908203, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8673704862594604, "num_tokens": 857133670.0, "step": 22463 }, { "epoch": 2.8576516982572193, "ewc_loss": 0.07578335702419281, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00038918128120712936, "grad_norm": 8.81843090057373, "learning_rate": 1e-06, "loss": 0.4833, "mean_token_accuracy": 0.8575192093849182, "num_tokens": 857167934.0, "step": 22464 }, { "epoch": 2.85777890853581, "ewc_loss": 0.0761004239320755, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00039235185249708593, "grad_norm": 8.925959587097168, "learning_rate": 1e-06, "loss": 0.5201, "mean_token_accuracy": 0.8487014174461365, "num_tokens": 857208263.0, "step": 22465 }, { "epoch": 2.8579061188144, "ewc_loss": 0.07587013393640518, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003900489828083664, "grad_norm": 8.820345878601074, "learning_rate": 1e-06, "loss": 0.5232, "mean_token_accuracy": 0.8479758501052856, "num_tokens": 857255107.0, "step": 22466 }, { "epoch": 2.858033329092991, "ewc_loss": 0.07609699666500092, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00039231765549629927, "grad_norm": 8.820241928100586, "learning_rate": 1e-06, "loss": 0.4754, "mean_token_accuracy": 0.8636327981948853, "num_tokens": 857298368.0, "step": 22467 }, { "epoch": 2.858160539371581, "ewc_loss": 0.07608235627412796, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.000392171205021441, "grad_norm": 8.906573295593262, "learning_rate": 1e-06, "loss": 0.468, "mean_token_accuracy": 0.8658365607261658, "num_tokens": 857336899.0, "step": 22468 }, { "epoch": 2.858287749650172, "ewc_loss": 0.07573041319847107, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00038865176611579955, "grad_norm": 8.82949447631836, "learning_rate": 1e-06, "loss": 0.4523, "mean_token_accuracy": 0.8697230219841003, "num_tokens": 857378482.0, "step": 22469 }, { "epoch": 2.858414959928762, "ewc_loss": 0.07608569413423538, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003922046162188053, "grad_norm": 8.89753246307373, "learning_rate": 1e-06, "loss": 0.4353, "mean_token_accuracy": 0.8748214840888977, "num_tokens": 857413203.0, "step": 22470 }, { "epoch": 2.858542170207353, "ewc_loss": 0.07572181522846222, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00038856579340063035, "grad_norm": 8.898321151733398, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8633396625518799, "num_tokens": 857446706.0, "step": 22471 }, { "epoch": 2.858669380485943, "ewc_loss": 0.07530452311038971, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038927572313696146, "grad_norm": 8.859049797058105, "learning_rate": 1e-06, "loss": 0.4647, "mean_token_accuracy": 0.870101273059845, "num_tokens": 857483415.0, "step": 22472 }, { "epoch": 2.8587965907645336, "ewc_loss": 0.07522490620613098, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038847955875098705, "grad_norm": 8.809121131896973, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8706516027450562, "num_tokens": 857525323.0, "step": 22473 }, { "epoch": 2.858923801043124, "ewc_loss": 0.07535349577665329, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038976542418822646, "grad_norm": 8.819158554077148, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8801922798156738, "num_tokens": 857564400.0, "step": 22474 }, { "epoch": 2.8590510113217147, "ewc_loss": 0.07515906542539597, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003878211136907339, "grad_norm": 8.811079025268555, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8637757301330566, "num_tokens": 857602756.0, "step": 22475 }, { "epoch": 2.859178221600305, "ewc_loss": 0.07520119845867157, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003882424789480865, "grad_norm": 8.888958930969238, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8641490936279297, "num_tokens": 857636875.0, "step": 22476 }, { "epoch": 2.8593054318788957, "ewc_loss": 0.0752185508608818, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038841599598526955, "grad_norm": 8.804383277893066, "learning_rate": 1e-06, "loss": 0.4149, "mean_token_accuracy": 0.8831511735916138, "num_tokens": 857671950.0, "step": 22477 }, { "epoch": 2.8594326421574863, "ewc_loss": 0.07588465511798859, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003901942109223455, "grad_norm": 8.849769592285156, "learning_rate": 1e-06, "loss": 0.4277, "mean_token_accuracy": 0.8794729709625244, "num_tokens": 857705274.0, "step": 22478 }, { "epoch": 2.859559852436077, "ewc_loss": 0.07572188973426819, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00038856655010022223, "grad_norm": 8.807987213134766, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8596898913383484, "num_tokens": 857743730.0, "step": 22479 }, { "epoch": 2.8596870627146673, "ewc_loss": 0.07547000795602798, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039093056693673134, "grad_norm": 8.869524955749512, "learning_rate": 1e-06, "loss": 0.4717, "mean_token_accuracy": 0.8623072504997253, "num_tokens": 857784055.0, "step": 22480 }, { "epoch": 2.859814272993258, "ewc_loss": 0.07572092860937119, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003885569458361715, "grad_norm": 8.876681327819824, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8715919256210327, "num_tokens": 857822431.0, "step": 22481 }, { "epoch": 2.8599414832718484, "ewc_loss": 0.07578466832637787, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.000389194319723174, "grad_norm": 8.808149337768555, "learning_rate": 1e-06, "loss": 0.4287, "mean_token_accuracy": 0.8775467872619629, "num_tokens": 857859258.0, "step": 22482 }, { "epoch": 2.860068693550439, "ewc_loss": 0.07528942823410034, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003891247615683824, "grad_norm": 8.83059024810791, "learning_rate": 1e-06, "loss": 0.4966, "mean_token_accuracy": 0.8567206859588623, "num_tokens": 857901209.0, "step": 22483 }, { "epoch": 2.8601959038290294, "ewc_loss": 0.07520483434200287, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003882788005284965, "grad_norm": 8.812392234802246, "learning_rate": 1e-06, "loss": 0.4208, "mean_token_accuracy": 0.8781705498695374, "num_tokens": 857935563.0, "step": 22484 }, { "epoch": 2.86032311410762, "ewc_loss": 0.07593695819377899, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003907172358594835, "grad_norm": 8.906148910522461, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8556443452835083, "num_tokens": 857966777.0, "step": 22485 }, { "epoch": 2.8604503243862105, "ewc_loss": 0.07500894367694855, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003863198508042842, "grad_norm": 8.748910903930664, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8730928897857666, "num_tokens": 858005078.0, "step": 22486 }, { "epoch": 2.860577534664801, "ewc_loss": 0.0756007581949234, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003922379983123392, "grad_norm": 8.930089950561523, "learning_rate": 1e-06, "loss": 0.5114, "mean_token_accuracy": 0.8541983962059021, "num_tokens": 858049288.0, "step": 22487 }, { "epoch": 2.8607047449433916, "ewc_loss": 0.07555727660655975, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003869204083457589, "grad_norm": 8.740541458129883, "learning_rate": 1e-06, "loss": 0.4004, "mean_token_accuracy": 0.8873223066329956, "num_tokens": 858089834.0, "step": 22488 }, { "epoch": 2.860831955221982, "ewc_loss": 0.0756155475974083, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003923859621863812, "grad_norm": 8.929133415222168, "learning_rate": 1e-06, "loss": 0.4587, "mean_token_accuracy": 0.8712301254272461, "num_tokens": 858124152.0, "step": 22489 }, { "epoch": 2.8609591655005726, "ewc_loss": 0.07546942681074142, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003860419092234224, "grad_norm": 8.780546188354492, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8704193234443665, "num_tokens": 858163202.0, "step": 22490 }, { "epoch": 2.8610863757791627, "ewc_loss": 0.0757279247045517, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003935097192879766, "grad_norm": 8.960453033447266, "learning_rate": 1e-06, "loss": 0.3998, "mean_token_accuracy": 0.8866106271743774, "num_tokens": 858199173.0, "step": 22491 }, { "epoch": 2.8612135860577537, "ewc_loss": 0.07496687769889832, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038589921314269304, "grad_norm": 8.851688385009766, "learning_rate": 1e-06, "loss": 0.4736, "mean_token_accuracy": 0.8641185760498047, "num_tokens": 858239693.0, "step": 22492 }, { "epoch": 2.8613407963363438, "ewc_loss": 0.07540249824523926, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039025547448545694, "grad_norm": 8.885831832885742, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8732340335845947, "num_tokens": 858280720.0, "step": 22493 }, { "epoch": 2.8614680066149347, "ewc_loss": 0.0751933604478836, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003881640441250056, "grad_norm": 8.854211807250977, "learning_rate": 1e-06, "loss": 0.4242, "mean_token_accuracy": 0.8787885904312134, "num_tokens": 858314145.0, "step": 22494 }, { "epoch": 2.861595216893525, "ewc_loss": 0.07535995543003082, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038738863077014685, "grad_norm": 11.484356880187988, "learning_rate": 1e-06, "loss": 0.4619, "mean_token_accuracy": 0.8671929836273193, "num_tokens": 858355177.0, "step": 22495 }, { "epoch": 2.861722427172116, "ewc_loss": 0.0758337453007698, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000394567905459553, "grad_norm": 8.72592544555664, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8695286512374878, "num_tokens": 858393070.0, "step": 22496 }, { "epoch": 2.861849637450706, "ewc_loss": 0.07843996584415436, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00042063018190674484, "grad_norm": 9.537610054016113, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8652898073196411, "num_tokens": 858431827.0, "step": 22497 }, { "epoch": 2.8619768477292964, "ewc_loss": 0.07451219856739044, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003813524672295898, "grad_norm": 8.755146980285645, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8640344738960266, "num_tokens": 858474990.0, "step": 22498 }, { "epoch": 2.862104058007887, "ewc_loss": 0.07825654745101929, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0004187959129922092, "grad_norm": 9.40211009979248, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8787272572517395, "num_tokens": 858514698.0, "step": 22499 }, { "epoch": 2.8622312682864774, "ewc_loss": 0.07506366074085236, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038686711923219264, "grad_norm": 8.826655387878418, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8666986227035522, "num_tokens": 858557601.0, "step": 22500 }, { "epoch": 2.862358478565068, "ewc_loss": 0.07698144018650055, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00040604491368867457, "grad_norm": 9.163759231567383, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8755158185958862, "num_tokens": 858596830.0, "step": 22501 }, { "epoch": 2.8624856888436585, "ewc_loss": 0.07554277777671814, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003916582791134715, "grad_norm": 8.898649215698242, "learning_rate": 1e-06, "loss": 0.5073, "mean_token_accuracy": 0.8566053509712219, "num_tokens": 858637555.0, "step": 22502 }, { "epoch": 2.862612899122249, "ewc_loss": 0.07630327343940735, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039926316821947694, "grad_norm": 9.08088493347168, "learning_rate": 1e-06, "loss": 0.3994, "mean_token_accuracy": 0.8855248093605042, "num_tokens": 858672813.0, "step": 22503 }, { "epoch": 2.8627401094008396, "ewc_loss": 0.07549908757209778, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003912213142029941, "grad_norm": 8.9689359664917, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.874101996421814, "num_tokens": 858710743.0, "step": 22504 }, { "epoch": 2.86286731967943, "ewc_loss": 0.07567886263132095, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000393019086914137, "grad_norm": 8.95971393585205, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.8786106705665588, "num_tokens": 858739486.0, "step": 22505 }, { "epoch": 2.8629945299580206, "ewc_loss": 0.07553562521934509, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039158668369054794, "grad_norm": 8.942261695861816, "learning_rate": 1e-06, "loss": 0.5156, "mean_token_accuracy": 0.8476062417030334, "num_tokens": 858780678.0, "step": 22506 }, { "epoch": 2.863121740236611, "ewc_loss": 0.07546346634626389, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003908651415258646, "grad_norm": 8.910222053527832, "learning_rate": 1e-06, "loss": 0.4809, "mean_token_accuracy": 0.8625849485397339, "num_tokens": 858826112.0, "step": 22507 }, { "epoch": 2.8632489505152017, "ewc_loss": 0.07535342872142792, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003897647839039564, "grad_norm": 8.894975662231445, "learning_rate": 1e-06, "loss": 0.4255, "mean_token_accuracy": 0.8779245615005493, "num_tokens": 858862158.0, "step": 22508 }, { "epoch": 2.863376160793792, "ewc_loss": 0.07534801959991455, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038971067988313735, "grad_norm": 8.895203590393066, "learning_rate": 1e-06, "loss": 0.4484, "mean_token_accuracy": 0.8708661794662476, "num_tokens": 858905270.0, "step": 22509 }, { "epoch": 2.8635033710723827, "ewc_loss": 0.07531224191188812, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003893529064953327, "grad_norm": 8.974963188171387, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8613221645355225, "num_tokens": 858941802.0, "step": 22510 }, { "epoch": 2.8636305813509733, "ewc_loss": 0.07509230077266693, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038715353002771735, "grad_norm": 8.870233535766602, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8693884611129761, "num_tokens": 858974672.0, "step": 22511 }, { "epoch": 2.863757791629564, "ewc_loss": 0.07544559240341187, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039068644400686026, "grad_norm": 8.930465698242188, "learning_rate": 1e-06, "loss": 0.4678, "mean_token_accuracy": 0.8638324737548828, "num_tokens": 859015305.0, "step": 22512 }, { "epoch": 2.8638850019081543, "ewc_loss": 0.075111985206604, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003873503301292658, "grad_norm": 8.863531112670898, "learning_rate": 1e-06, "loss": 0.4882, "mean_token_accuracy": 0.858753502368927, "num_tokens": 859050646.0, "step": 22513 }, { "epoch": 2.864012212186745, "ewc_loss": 0.0753430649638176, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038966111605986953, "grad_norm": 8.93718147277832, "learning_rate": 1e-06, "loss": 0.4351, "mean_token_accuracy": 0.8762155771255493, "num_tokens": 859090139.0, "step": 22514 }, { "epoch": 2.8641394224653354, "ewc_loss": 0.07506917417049408, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038692221278324723, "grad_norm": 8.834285736083984, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8715298175811768, "num_tokens": 859130661.0, "step": 22515 }, { "epoch": 2.8642666327439255, "ewc_loss": 0.07543094456195831, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003905399062205106, "grad_norm": 8.95155143737793, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8653373718261719, "num_tokens": 859166399.0, "step": 22516 }, { "epoch": 2.8643938430225164, "ewc_loss": 0.07502143085002899, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038644482265226543, "grad_norm": 8.856474876403809, "learning_rate": 1e-06, "loss": 0.4653, "mean_token_accuracy": 0.866891622543335, "num_tokens": 859208349.0, "step": 22517 }, { "epoch": 2.8645210533011065, "ewc_loss": 0.07536716014146805, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003899020666722208, "grad_norm": 8.971123695373535, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8691710233688354, "num_tokens": 859243784.0, "step": 22518 }, { "epoch": 2.8646482635796975, "ewc_loss": 0.07486717402935028, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003849022323265672, "grad_norm": 8.863112449645996, "learning_rate": 1e-06, "loss": 0.5051, "mean_token_accuracy": 0.853443443775177, "num_tokens": 859278345.0, "step": 22519 }, { "epoch": 2.8647754738582876, "ewc_loss": 0.07533475756645203, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000389578053727746, "grad_norm": 8.947548866271973, "learning_rate": 1e-06, "loss": 0.5036, "mean_token_accuracy": 0.8562902808189392, "num_tokens": 859317454.0, "step": 22520 }, { "epoch": 2.864902684136878, "ewc_loss": 0.07492251694202423, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003854556707665324, "grad_norm": 8.822321891784668, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8624069094657898, "num_tokens": 859352076.0, "step": 22521 }, { "epoch": 2.8650298944154686, "ewc_loss": 0.07552844285964966, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003915149427484721, "grad_norm": 9.006136894226074, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.874086320400238, "num_tokens": 859389957.0, "step": 22522 }, { "epoch": 2.865157104694059, "ewc_loss": 0.07483100891113281, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038454055902548134, "grad_norm": 8.855283737182617, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8672376871109009, "num_tokens": 859425167.0, "step": 22523 }, { "epoch": 2.8652843149726497, "ewc_loss": 0.07538585364818573, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003900890296790749, "grad_norm": 8.907508850097656, "learning_rate": 1e-06, "loss": 0.4738, "mean_token_accuracy": 0.8635708093643188, "num_tokens": 859470999.0, "step": 22524 }, { "epoch": 2.86541152525124, "ewc_loss": 0.07495060563087463, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038573655183427036, "grad_norm": 8.868483543395996, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8734385967254639, "num_tokens": 859503763.0, "step": 22525 }, { "epoch": 2.8655387355298307, "ewc_loss": 0.07525347918272018, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038876524195075035, "grad_norm": 8.926813125610352, "learning_rate": 1e-06, "loss": 0.4349, "mean_token_accuracy": 0.8764365911483765, "num_tokens": 859537739.0, "step": 22526 }, { "epoch": 2.8656659458084213, "ewc_loss": 0.07515237480401993, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003877542039845139, "grad_norm": 8.88021183013916, "learning_rate": 1e-06, "loss": 0.4231, "mean_token_accuracy": 0.8777929544448853, "num_tokens": 859573497.0, "step": 22527 }, { "epoch": 2.865793156087012, "ewc_loss": 0.07514595240354538, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003876900009345263, "grad_norm": 8.992422103881836, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8628477454185486, "num_tokens": 859606997.0, "step": 22528 }, { "epoch": 2.8659203663656023, "ewc_loss": 0.07493557035923004, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003855861723423004, "grad_norm": 8.830472946166992, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8543004393577576, "num_tokens": 859642278.0, "step": 22529 }, { "epoch": 2.866047576644193, "ewc_loss": 0.07532663643360138, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003894968540407717, "grad_norm": 8.921643257141113, "learning_rate": 1e-06, "loss": 0.4308, "mean_token_accuracy": 0.8791167736053467, "num_tokens": 859684728.0, "step": 22530 }, { "epoch": 2.8661747869227834, "ewc_loss": 0.07504133135080338, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038664377643726766, "grad_norm": 8.946639060974121, "learning_rate": 1e-06, "loss": 0.4939, "mean_token_accuracy": 0.8593543767929077, "num_tokens": 859720946.0, "step": 22531 }, { "epoch": 2.866301997201374, "ewc_loss": 0.07499910145998001, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038622147985734046, "grad_norm": 8.857964515686035, "learning_rate": 1e-06, "loss": 0.4663, "mean_token_accuracy": 0.8634619116783142, "num_tokens": 859752605.0, "step": 22532 }, { "epoch": 2.8664292074799644, "ewc_loss": 0.07533057779073715, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003895362315233797, "grad_norm": 8.940938949584961, "learning_rate": 1e-06, "loss": 0.4281, "mean_token_accuracy": 0.8784564733505249, "num_tokens": 859784206.0, "step": 22533 }, { "epoch": 2.866556417758555, "ewc_loss": 0.07492533326148987, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003854837850667536, "grad_norm": 8.817317008972168, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8636218905448914, "num_tokens": 859828142.0, "step": 22534 }, { "epoch": 2.8666836280371455, "ewc_loss": 0.07528090476989746, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038903948734514415, "grad_norm": 8.903797149658203, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.862969160079956, "num_tokens": 859870032.0, "step": 22535 }, { "epoch": 2.866810838315736, "ewc_loss": 0.07518388330936432, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038562790723517537, "grad_norm": 8.848259925842285, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8569628000259399, "num_tokens": 859911417.0, "step": 22536 }, { "epoch": 2.8669380485943265, "ewc_loss": 0.07530193775892258, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003892498498316854, "grad_norm": 8.935405731201172, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8777471780776978, "num_tokens": 859953929.0, "step": 22537 }, { "epoch": 2.867065258872917, "ewc_loss": 0.07497264444828033, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038595686783082783, "grad_norm": 8.859293937683105, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.868868350982666, "num_tokens": 859985402.0, "step": 22538 }, { "epoch": 2.8671924691515076, "ewc_loss": 0.07522892951965332, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038851978024467826, "grad_norm": 8.907634735107422, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8633903861045837, "num_tokens": 860017374.0, "step": 22539 }, { "epoch": 2.867319679430098, "ewc_loss": 0.07519245147705078, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038815499283373356, "grad_norm": 8.858964920043945, "learning_rate": 1e-06, "loss": 0.4284, "mean_token_accuracy": 0.8766436576843262, "num_tokens": 860058217.0, "step": 22540 }, { "epoch": 2.867446889708688, "ewc_loss": 0.07532547414302826, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038948518340475857, "grad_norm": 8.936767578125, "learning_rate": 1e-06, "loss": 0.5077, "mean_token_accuracy": 0.8492905497550964, "num_tokens": 860092317.0, "step": 22541 }, { "epoch": 2.867574099987279, "ewc_loss": 0.07502139359712601, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003864443860948086, "grad_norm": 8.902426719665527, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8633992075920105, "num_tokens": 860125067.0, "step": 22542 }, { "epoch": 2.8677013102658693, "ewc_loss": 0.07525883615016937, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038881879299879074, "grad_norm": 8.901741027832031, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8795642852783203, "num_tokens": 860163805.0, "step": 22543 }, { "epoch": 2.8678285205444602, "ewc_loss": 0.0753280520439148, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038706959458068013, "grad_norm": 8.870969772338867, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.854004979133606, "num_tokens": 860200178.0, "step": 22544 }, { "epoch": 2.8679557308230503, "ewc_loss": 0.07519296556711197, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038816011510789394, "grad_norm": 8.934228897094727, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8652346134185791, "num_tokens": 860235283.0, "step": 22545 }, { "epoch": 2.868082941101641, "ewc_loss": 0.07501037418842316, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003863341989926994, "grad_norm": 8.816526412963867, "learning_rate": 1e-06, "loss": 0.4061, "mean_token_accuracy": 0.8843874931335449, "num_tokens": 860276776.0, "step": 22546 }, { "epoch": 2.8682101513802314, "ewc_loss": 0.07530379295349121, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003892684180755168, "grad_norm": 8.880467414855957, "learning_rate": 1e-06, "loss": 0.4446, "mean_token_accuracy": 0.8714488744735718, "num_tokens": 860318492.0, "step": 22547 }, { "epoch": 2.868337361658822, "ewc_loss": 0.07510916143655777, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038732209941372275, "grad_norm": 8.89409351348877, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.8512440919876099, "num_tokens": 860358556.0, "step": 22548 }, { "epoch": 2.8684645719374124, "ewc_loss": 0.07499632239341736, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003861937439069152, "grad_norm": 8.987922668457031, "learning_rate": 1e-06, "loss": 0.482, "mean_token_accuracy": 0.8561199903488159, "num_tokens": 860391114.0, "step": 22549 }, { "epoch": 2.868591782216003, "ewc_loss": 0.0752522423863411, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003863114689011127, "grad_norm": 8.885056495666504, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8624253869056702, "num_tokens": 860430696.0, "step": 22550 }, { "epoch": 2.8687189924945935, "ewc_loss": 0.07517459243535995, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038797641173005104, "grad_norm": 8.906599044799805, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8741216659545898, "num_tokens": 860471055.0, "step": 22551 }, { "epoch": 2.868846202773184, "ewc_loss": 0.07486911863088608, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038492167368531227, "grad_norm": 8.834786415100098, "learning_rate": 1e-06, "loss": 0.4839, "mean_token_accuracy": 0.8606351613998413, "num_tokens": 860512472.0, "step": 22552 }, { "epoch": 2.8689734130517746, "ewc_loss": 0.07519286125898361, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038815909647382796, "grad_norm": 9.009415626525879, "learning_rate": 1e-06, "loss": 0.473, "mean_token_accuracy": 0.8619906902313232, "num_tokens": 860541014.0, "step": 22553 }, { "epoch": 2.869100623330365, "ewc_loss": 0.07480049878358841, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003842354635708034, "grad_norm": 8.815998077392578, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8816421031951904, "num_tokens": 860585025.0, "step": 22554 }, { "epoch": 2.8692278336089556, "ewc_loss": 0.07526752352714539, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000388905726140365, "grad_norm": 9.013368606567383, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.8733888268470764, "num_tokens": 860616361.0, "step": 22555 }, { "epoch": 2.869355043887546, "ewc_loss": 0.07476039975881577, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003838344709947705, "grad_norm": 8.793853759765625, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8576884269714355, "num_tokens": 860655976.0, "step": 22556 }, { "epoch": 2.8694822541661367, "ewc_loss": 0.07550907135009766, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039132116944529116, "grad_norm": 8.958029747009277, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8537592887878418, "num_tokens": 860703949.0, "step": 22557 }, { "epoch": 2.869609464444727, "ewc_loss": 0.07468724995851517, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000383102975320071, "grad_norm": 8.85916519165039, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8664933443069458, "num_tokens": 860739935.0, "step": 22558 }, { "epoch": 2.8697366747233177, "ewc_loss": 0.07558971643447876, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003896862326655537, "grad_norm": 8.96834659576416, "learning_rate": 1e-06, "loss": 0.4483, "mean_token_accuracy": 0.8700608015060425, "num_tokens": 860782483.0, "step": 22559 }, { "epoch": 2.8698638850019083, "ewc_loss": 0.07511796057224274, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003849686763714999, "grad_norm": 8.863027572631836, "learning_rate": 1e-06, "loss": 0.4915, "mean_token_accuracy": 0.8550183176994324, "num_tokens": 860816821.0, "step": 22560 }, { "epoch": 2.869991095280499, "ewc_loss": 0.07551337033510208, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003889227518811822, "grad_norm": 8.973453521728516, "learning_rate": 1e-06, "loss": 0.4969, "mean_token_accuracy": 0.8554695248603821, "num_tokens": 860851313.0, "step": 22561 }, { "epoch": 2.8701183055590893, "ewc_loss": 0.07518637925386429, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003856528492178768, "grad_norm": 8.846633911132812, "learning_rate": 1e-06, "loss": 0.4374, "mean_token_accuracy": 0.8748711347579956, "num_tokens": 860890540.0, "step": 22562 }, { "epoch": 2.87024551583768, "ewc_loss": 0.07546351850032806, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003884242905769497, "grad_norm": 9.008333206176758, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8591957092285156, "num_tokens": 860929686.0, "step": 22563 }, { "epoch": 2.87037272611627, "ewc_loss": 0.07505268603563309, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038431593566201627, "grad_norm": 8.841548919677734, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8687094449996948, "num_tokens": 860968186.0, "step": 22564 }, { "epoch": 2.870499936394861, "ewc_loss": 0.0755271166563034, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038906020927242935, "grad_norm": 9.007441520690918, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8619529604911804, "num_tokens": 861007252.0, "step": 22565 }, { "epoch": 2.870627146673451, "ewc_loss": 0.0747363418340683, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003835939278360456, "grad_norm": 8.802931785583496, "learning_rate": 1e-06, "loss": 0.4528, "mean_token_accuracy": 0.8686450123786926, "num_tokens": 861049898.0, "step": 22566 }, { "epoch": 2.870754356952042, "ewc_loss": 0.07540231198072433, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039025358273647726, "grad_norm": 8.951835632324219, "learning_rate": 1e-06, "loss": 0.4402, "mean_token_accuracy": 0.8736200332641602, "num_tokens": 861087711.0, "step": 22567 }, { "epoch": 2.870881567230632, "ewc_loss": 0.07477419078350067, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003839723358396441, "grad_norm": 8.90957260131836, "learning_rate": 1e-06, "loss": 0.5251, "mean_token_accuracy": 0.845265805721283, "num_tokens": 861128566.0, "step": 22568 }, { "epoch": 2.871008777509223, "ewc_loss": 0.075245201587677, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003886824706569314, "grad_norm": 9.034217834472656, "learning_rate": 1e-06, "loss": 0.4405, "mean_token_accuracy": 0.8713794350624084, "num_tokens": 861156299.0, "step": 22569 }, { "epoch": 2.871135987787813, "ewc_loss": 0.07485710829496384, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038480156217701733, "grad_norm": 8.873649597167969, "learning_rate": 1e-06, "loss": 0.423, "mean_token_accuracy": 0.8775368928909302, "num_tokens": 861189376.0, "step": 22570 }, { "epoch": 2.8712631980664036, "ewc_loss": 0.07529130578041077, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000389143533539027, "grad_norm": 9.034444808959961, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8617385625839233, "num_tokens": 861227156.0, "step": 22571 }, { "epoch": 2.871390408344994, "ewc_loss": 0.07485130429267883, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003847434709314257, "grad_norm": 8.884140014648438, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8664587140083313, "num_tokens": 861268088.0, "step": 22572 }, { "epoch": 2.8715176186235847, "ewc_loss": 0.07511298358440399, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038736025453545153, "grad_norm": 8.923320770263672, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8678638935089111, "num_tokens": 861308892.0, "step": 22573 }, { "epoch": 2.871644828902175, "ewc_loss": 0.07481346279382706, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003843650920316577, "grad_norm": 8.933826446533203, "learning_rate": 1e-06, "loss": 0.4527, "mean_token_accuracy": 0.8697441220283508, "num_tokens": 861350108.0, "step": 22574 }, { "epoch": 2.8717720391807657, "ewc_loss": 0.07493928074836731, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003856232506223023, "grad_norm": 8.926297187805176, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8776261806488037, "num_tokens": 861390553.0, "step": 22575 }, { "epoch": 2.8718992494593563, "ewc_loss": 0.07501165568828583, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038634700467810035, "grad_norm": 8.917459487915039, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8684665560722351, "num_tokens": 861425878.0, "step": 22576 }, { "epoch": 2.872026459737947, "ewc_loss": 0.07513679563999176, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003851570363622159, "grad_norm": 8.87598991394043, "learning_rate": 1e-06, "loss": 0.4164, "mean_token_accuracy": 0.8827720880508423, "num_tokens": 861460935.0, "step": 22577 }, { "epoch": 2.8721536700165373, "ewc_loss": 0.07505291700363159, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038675960968248546, "grad_norm": 8.928302764892578, "learning_rate": 1e-06, "loss": 0.5159, "mean_token_accuracy": 0.8500588536262512, "num_tokens": 861500446.0, "step": 22578 }, { "epoch": 2.872280880295128, "ewc_loss": 0.0750461295247078, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003866917686536908, "grad_norm": 8.899941444396973, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8546860218048096, "num_tokens": 861538000.0, "step": 22579 }, { "epoch": 2.8724080905737184, "ewc_loss": 0.07500500977039337, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038628053152933717, "grad_norm": 8.976388931274414, "learning_rate": 1e-06, "loss": 0.4522, "mean_token_accuracy": 0.8687068223953247, "num_tokens": 861578052.0, "step": 22580 }, { "epoch": 2.872535300852309, "ewc_loss": 0.07505440711975098, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003843331360258162, "grad_norm": 8.869756698608398, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.8634650707244873, "num_tokens": 861619065.0, "step": 22581 }, { "epoch": 2.8726625111308994, "ewc_loss": 0.07517071068286896, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003879376163240522, "grad_norm": 8.8816556930542, "learning_rate": 1e-06, "loss": 0.46, "mean_token_accuracy": 0.8700639009475708, "num_tokens": 861660112.0, "step": 22582 }, { "epoch": 2.87278972140949, "ewc_loss": 0.07506781816482544, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038446724647656083, "grad_norm": 8.873842239379883, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8690866231918335, "num_tokens": 861699312.0, "step": 22583 }, { "epoch": 2.8729169316880805, "ewc_loss": 0.07517816871404648, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.0003880121512338519, "grad_norm": 8.937911033630371, "learning_rate": 1e-06, "loss": 0.4068, "mean_token_accuracy": 0.883610725402832, "num_tokens": 861735167.0, "step": 22584 }, { "epoch": 2.873044141966671, "ewc_loss": 0.07512040436267853, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038499306538142264, "grad_norm": 8.82199764251709, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8605349659919739, "num_tokens": 861778594.0, "step": 22585 }, { "epoch": 2.8731713522452615, "ewc_loss": 0.07541312277317047, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003879203286487609, "grad_norm": 8.935441017150879, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.864174485206604, "num_tokens": 861814754.0, "step": 22586 }, { "epoch": 2.873298562523852, "ewc_loss": 0.07485750317573547, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.000384805491194129, "grad_norm": 8.8756685256958, "learning_rate": 1e-06, "loss": 0.5027, "mean_token_accuracy": 0.8507446050643921, "num_tokens": 861849019.0, "step": 22587 }, { "epoch": 2.8734257728024426, "ewc_loss": 0.07546031475067139, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003883921599481255, "grad_norm": 8.919537544250488, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8638619184494019, "num_tokens": 861889471.0, "step": 22588 }, { "epoch": 2.8735529830810327, "ewc_loss": 0.07528107613325119, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000386599829653278, "grad_norm": 8.863608360290527, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.858449399471283, "num_tokens": 861926811.0, "step": 22589 }, { "epoch": 2.8736801933596237, "ewc_loss": 0.07554741203784943, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003892631793860346, "grad_norm": 8.928939819335938, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8637678623199463, "num_tokens": 861963628.0, "step": 22590 }, { "epoch": 2.8738074036382137, "ewc_loss": 0.07537063956260681, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003874954709317535, "grad_norm": 8.96576976776123, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8528222441673279, "num_tokens": 862002915.0, "step": 22591 }, { "epoch": 2.8739346139168047, "ewc_loss": 0.07521575689315796, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038594662328250706, "grad_norm": 8.817078590393066, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8720664978027344, "num_tokens": 862047276.0, "step": 22592 }, { "epoch": 2.874061824195395, "ewc_loss": 0.07566182315349579, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039040728006511927, "grad_norm": 8.921751022338867, "learning_rate": 1e-06, "loss": 0.4387, "mean_token_accuracy": 0.8733763694763184, "num_tokens": 862091241.0, "step": 22593 }, { "epoch": 2.8741890344739858, "ewc_loss": 0.07518395036458969, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003856285766232759, "grad_norm": 8.873162269592285, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8685569763183594, "num_tokens": 862126025.0, "step": 22594 }, { "epoch": 2.874316244752576, "ewc_loss": 0.07551316916942596, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003889207437168807, "grad_norm": 8.953543663024902, "learning_rate": 1e-06, "loss": 0.49, "mean_token_accuracy": 0.8563871383666992, "num_tokens": 862160822.0, "step": 22595 }, { "epoch": 2.8744434550311664, "ewc_loss": 0.07530845701694489, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038687363849021494, "grad_norm": 8.930265426635742, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8705186247825623, "num_tokens": 862196808.0, "step": 22596 }, { "epoch": 2.874570665309757, "ewc_loss": 0.07537902146577835, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038757926085963845, "grad_norm": 8.944868087768555, "learning_rate": 1e-06, "loss": 0.5179, "mean_token_accuracy": 0.8509104251861572, "num_tokens": 862233722.0, "step": 22597 }, { "epoch": 2.8746978755883474, "ewc_loss": 0.0754406526684761, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003881955926772207, "grad_norm": 8.861289978027344, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8825877904891968, "num_tokens": 862271914.0, "step": 22598 }, { "epoch": 2.874825085866938, "ewc_loss": 0.07540325820446014, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003878216666635126, "grad_norm": 8.882542610168457, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8642740249633789, "num_tokens": 862313696.0, "step": 22599 }, { "epoch": 2.8749522961455285, "ewc_loss": 0.07546555250883102, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038844457594677806, "grad_norm": 8.923089027404785, "learning_rate": 1e-06, "loss": 0.4902, "mean_token_accuracy": 0.8607698678970337, "num_tokens": 862355366.0, "step": 22600 }, { "epoch": 2.875079506424119, "ewc_loss": 0.07544519007205963, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038824096554890275, "grad_norm": 8.925361633300781, "learning_rate": 1e-06, "loss": 0.4914, "mean_token_accuracy": 0.8581187129020691, "num_tokens": 862395630.0, "step": 22601 }, { "epoch": 2.8752067167027096, "ewc_loss": 0.07538001239299774, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003875891852658242, "grad_norm": 8.919096946716309, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8656916618347168, "num_tokens": 862429650.0, "step": 22602 }, { "epoch": 2.8753339269813, "ewc_loss": 0.07540515810251236, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003878406423609704, "grad_norm": 8.946110725402832, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8631553649902344, "num_tokens": 862470921.0, "step": 22603 }, { "epoch": 2.8754611372598906, "ewc_loss": 0.07534489035606384, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038723801844753325, "grad_norm": 8.884049415588379, "learning_rate": 1e-06, "loss": 0.4369, "mean_token_accuracy": 0.8715376257896423, "num_tokens": 862505204.0, "step": 22604 }, { "epoch": 2.875588347538481, "ewc_loss": 0.07547837495803833, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038857277831993997, "grad_norm": 8.918444633483887, "learning_rate": 1e-06, "loss": 0.4823, "mean_token_accuracy": 0.8645352125167847, "num_tokens": 862548686.0, "step": 22605 }, { "epoch": 2.8757155578170717, "ewc_loss": 0.07535490393638611, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038733810652047396, "grad_norm": 8.922967910766602, "learning_rate": 1e-06, "loss": 0.4356, "mean_token_accuracy": 0.879513144493103, "num_tokens": 862585920.0, "step": 22606 }, { "epoch": 2.875842768095662, "ewc_loss": 0.07529540359973907, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038674307870678604, "grad_norm": 11.479805946350098, "learning_rate": 1e-06, "loss": 0.4768, "mean_token_accuracy": 0.8606688976287842, "num_tokens": 862619364.0, "step": 22607 }, { "epoch": 2.8759699783742527, "ewc_loss": 0.07580646872520447, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003918537113349885, "grad_norm": 8.786881446838379, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8615239262580872, "num_tokens": 862661317.0, "step": 22608 }, { "epoch": 2.8760971886528433, "ewc_loss": 0.07853510975837708, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004191401822026819, "grad_norm": 9.467529296875, "learning_rate": 1e-06, "loss": 0.4468, "mean_token_accuracy": 0.8705466389656067, "num_tokens": 862698236.0, "step": 22609 }, { "epoch": 2.876224398931434, "ewc_loss": 0.07451692223548889, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037895829882472754, "grad_norm": 8.715099334716797, "learning_rate": 1e-06, "loss": 0.425, "mean_token_accuracy": 0.8775246739387512, "num_tokens": 862739224.0, "step": 22610 }, { "epoch": 2.8763516092100243, "ewc_loss": 0.07852756977081299, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004190647741779685, "grad_norm": 9.496980667114258, "learning_rate": 1e-06, "loss": 0.4296, "mean_token_accuracy": 0.8811371326446533, "num_tokens": 862776534.0, "step": 22611 }, { "epoch": 2.876478819488615, "ewc_loss": 0.07521627098321915, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003859517746604979, "grad_norm": 8.837685585021973, "learning_rate": 1e-06, "loss": 0.4897, "mean_token_accuracy": 0.8588336706161499, "num_tokens": 862815690.0, "step": 22612 }, { "epoch": 2.8766060297672054, "ewc_loss": 0.07749268412590027, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040871594683267176, "grad_norm": 9.297815322875977, "learning_rate": 1e-06, "loss": 0.4633, "mean_token_accuracy": 0.8659260272979736, "num_tokens": 862851021.0, "step": 22613 }, { "epoch": 2.8767332400457954, "ewc_loss": 0.0756249874830246, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003900389128830284, "grad_norm": 8.901451110839844, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8737603425979614, "num_tokens": 862885122.0, "step": 22614 }, { "epoch": 2.8768604503243864, "ewc_loss": 0.07687884569168091, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004025775706395507, "grad_norm": 9.249844551086426, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8725636005401611, "num_tokens": 862922233.0, "step": 22615 }, { "epoch": 2.8769876606029765, "ewc_loss": 0.07564349472522736, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003902240132447332, "grad_norm": 8.954983711242676, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8787391185760498, "num_tokens": 862961355.0, "step": 22616 }, { "epoch": 2.8771148708815675, "ewc_loss": 0.07630832493305206, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039687228854745626, "grad_norm": 9.121389389038086, "learning_rate": 1e-06, "loss": 0.4434, "mean_token_accuracy": 0.8750239610671997, "num_tokens": 862998523.0, "step": 22617 }, { "epoch": 2.8772420811601576, "ewc_loss": 0.07551071047782898, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003888961218763143, "grad_norm": 8.908709526062012, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8657829761505127, "num_tokens": 863039684.0, "step": 22618 }, { "epoch": 2.877369291438748, "ewc_loss": 0.076194629073143, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003957353183068335, "grad_norm": 9.08800983428955, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.87074875831604, "num_tokens": 863077649.0, "step": 22619 }, { "epoch": 2.8774965017173386, "ewc_loss": 0.07546251267194748, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038841419154778123, "grad_norm": 8.91525936126709, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8629567623138428, "num_tokens": 863116816.0, "step": 22620 }, { "epoch": 2.877623711995929, "ewc_loss": 0.07583675533533096, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039215662400238216, "grad_norm": 9.029241561889648, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8770464062690735, "num_tokens": 863155306.0, "step": 22621 }, { "epoch": 2.8777509222745197, "ewc_loss": 0.0755658894777298, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038944798870943487, "grad_norm": 8.936014175415039, "learning_rate": 1e-06, "loss": 0.4398, "mean_token_accuracy": 0.8732788562774658, "num_tokens": 863194381.0, "step": 22622 }, { "epoch": 2.87787813255311, "ewc_loss": 0.07593517005443573, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039314074092544615, "grad_norm": 9.041305541992188, "learning_rate": 1e-06, "loss": 0.4734, "mean_token_accuracy": 0.8650591373443604, "num_tokens": 863231470.0, "step": 22623 }, { "epoch": 2.8780053428317007, "ewc_loss": 0.07553401589393616, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003891292435582727, "grad_norm": 8.982476234436035, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8584581017494202, "num_tokens": 863265617.0, "step": 22624 }, { "epoch": 2.8781325531102913, "ewc_loss": 0.07568659633398056, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000390655011869967, "grad_norm": 8.97981071472168, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8580156564712524, "num_tokens": 863300111.0, "step": 22625 }, { "epoch": 2.878259763388882, "ewc_loss": 0.07554985582828522, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003892875974997878, "grad_norm": 8.915146827697754, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.873963475227356, "num_tokens": 863338626.0, "step": 22626 }, { "epoch": 2.8783869736674723, "ewc_loss": 0.07575719803571701, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003913610416930169, "grad_norm": 9.044670104980469, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.8581691980361938, "num_tokens": 863374672.0, "step": 22627 }, { "epoch": 2.878514183946063, "ewc_loss": 0.07523781061172485, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003861671139020473, "grad_norm": 8.81608772277832, "learning_rate": 1e-06, "loss": 0.4753, "mean_token_accuracy": 0.8598849773406982, "num_tokens": 863412717.0, "step": 22628 }, { "epoch": 2.8786413942246534, "ewc_loss": 0.07567746937274933, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00039300520438700914, "grad_norm": 8.996576309204102, "learning_rate": 1e-06, "loss": 0.4388, "mean_token_accuracy": 0.8750429153442383, "num_tokens": 863447776.0, "step": 22629 }, { "epoch": 2.878768604503244, "ewc_loss": 0.07518652081489563, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003856542462017387, "grad_norm": 8.807706832885742, "learning_rate": 1e-06, "loss": 0.4102, "mean_token_accuracy": 0.8835378885269165, "num_tokens": 863485697.0, "step": 22630 }, { "epoch": 2.8788958147818344, "ewc_loss": 0.07603422552347183, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039413131889887154, "grad_norm": 8.998046875, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8620980978012085, "num_tokens": 863528706.0, "step": 22631 }, { "epoch": 2.879023025060425, "ewc_loss": 0.07519883662462234, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038577744271606207, "grad_norm": 8.783069610595703, "learning_rate": 1e-06, "loss": 0.5145, "mean_token_accuracy": 0.848739743232727, "num_tokens": 863569181.0, "step": 22632 }, { "epoch": 2.8791502353390155, "ewc_loss": 0.07607805728912354, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039456962258554995, "grad_norm": 8.995199203491211, "learning_rate": 1e-06, "loss": 0.5083, "mean_token_accuracy": 0.850727915763855, "num_tokens": 863602985.0, "step": 22633 }, { "epoch": 2.879277445617606, "ewc_loss": 0.07511764764785767, "ewc_loss_diag": 3.647804260253906e-05, "ewc_loss_parallel": 0.00038740699528716505, "grad_norm": 8.915739059448242, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8709303140640259, "num_tokens": 863634590.0, "step": 22634 }, { "epoch": 2.8794046558961965, "ewc_loss": 0.07587467133998871, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003925357887055725, "grad_norm": 8.877461433410645, "learning_rate": 1e-06, "loss": 0.4826, "mean_token_accuracy": 0.8666864633560181, "num_tokens": 863677671.0, "step": 22635 }, { "epoch": 2.879531866174787, "ewc_loss": 0.07571516931056976, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039094072417356074, "grad_norm": 8.968279838562012, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8690305352210999, "num_tokens": 863716102.0, "step": 22636 }, { "epoch": 2.8796590764533776, "ewc_loss": 0.07546630501747131, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038845211383886635, "grad_norm": 8.990213394165039, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8594649434089661, "num_tokens": 863751560.0, "step": 22637 }, { "epoch": 2.879786286731968, "ewc_loss": 0.07558316737413406, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003896207199431956, "grad_norm": 8.869422912597656, "learning_rate": 1e-06, "loss": 0.4089, "mean_token_accuracy": 0.8806578516960144, "num_tokens": 863792373.0, "step": 22638 }, { "epoch": 2.879913497010558, "ewc_loss": 0.07565915584564209, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003903806209564209, "grad_norm": 8.936907768249512, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8631926774978638, "num_tokens": 863829969.0, "step": 22639 }, { "epoch": 2.880040707289149, "ewc_loss": 0.0755242332816124, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003890313964802772, "grad_norm": 8.846658706665039, "learning_rate": 1e-06, "loss": 0.4327, "mean_token_accuracy": 0.8740347623825073, "num_tokens": 863867080.0, "step": 22640 }, { "epoch": 2.8801679175677393, "ewc_loss": 0.07578660547733307, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039165516500361264, "grad_norm": 8.907294273376465, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.871025562286377, "num_tokens": 863904230.0, "step": 22641 }, { "epoch": 2.8802951278463302, "ewc_loss": 0.07544779777526855, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003882670134771615, "grad_norm": 8.782822608947754, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8706806898117065, "num_tokens": 863947309.0, "step": 22642 }, { "epoch": 2.8804223381249203, "ewc_loss": 0.07591895759105682, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039297863258980215, "grad_norm": 8.930442810058594, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8729937076568604, "num_tokens": 863985557.0, "step": 22643 }, { "epoch": 2.880549548403511, "ewc_loss": 0.07540722191333771, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038786124787293375, "grad_norm": 8.852344512939453, "learning_rate": 1e-06, "loss": 0.4984, "mean_token_accuracy": 0.8556362986564636, "num_tokens": 864021842.0, "step": 22644 }, { "epoch": 2.8806767586821014, "ewc_loss": 0.07597555965185165, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003935446438845247, "grad_norm": 8.966950416564941, "learning_rate": 1e-06, "loss": 0.4849, "mean_token_accuracy": 0.8586261868476868, "num_tokens": 864060765.0, "step": 22645 }, { "epoch": 2.880803968960692, "ewc_loss": 0.07545629888772964, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003883520548697561, "grad_norm": 8.871041297912598, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8683959245681763, "num_tokens": 864100624.0, "step": 22646 }, { "epoch": 2.8809311792392824, "ewc_loss": 0.07580715417861938, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003918605507351458, "grad_norm": 8.952375411987305, "learning_rate": 1e-06, "loss": 0.4273, "mean_token_accuracy": 0.8771160840988159, "num_tokens": 864136599.0, "step": 22647 }, { "epoch": 2.881058389517873, "ewc_loss": 0.07550866156816483, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038887569098733366, "grad_norm": 8.92056655883789, "learning_rate": 1e-06, "loss": 0.45, "mean_token_accuracy": 0.8703500628471375, "num_tokens": 864169876.0, "step": 22648 }, { "epoch": 2.8811855997964635, "ewc_loss": 0.07567013800144196, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003904904006049037, "grad_norm": 8.891093254089355, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.865419864654541, "num_tokens": 864213709.0, "step": 22649 }, { "epoch": 2.881312810075054, "ewc_loss": 0.07579849660396576, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003917740541510284, "grad_norm": 8.961236953735352, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.8723183870315552, "num_tokens": 864245581.0, "step": 22650 }, { "epoch": 2.8814400203536445, "ewc_loss": 0.07546604424715042, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038844949449412525, "grad_norm": 8.922396659851074, "learning_rate": 1e-06, "loss": 0.5165, "mean_token_accuracy": 0.8523723483085632, "num_tokens": 864287081.0, "step": 22651 }, { "epoch": 2.881567230632235, "ewc_loss": 0.07580171525478363, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039180624298751354, "grad_norm": 8.90072250366211, "learning_rate": 1e-06, "loss": 0.5131, "mean_token_accuracy": 0.8508124351501465, "num_tokens": 864321480.0, "step": 22652 }, { "epoch": 2.8816944409108256, "ewc_loss": 0.07563354074954987, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039012441993691027, "grad_norm": 8.897632598876953, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8732949495315552, "num_tokens": 864358580.0, "step": 22653 }, { "epoch": 2.881821651189416, "ewc_loss": 0.07560113072395325, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000389800377888605, "grad_norm": 8.89805793762207, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.8557566404342651, "num_tokens": 864396277.0, "step": 22654 }, { "epoch": 2.8819488614680067, "ewc_loss": 0.07557906210422516, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038957971264608204, "grad_norm": 8.88003158569336, "learning_rate": 1e-06, "loss": 0.4818, "mean_token_accuracy": 0.8625525236129761, "num_tokens": 864433282.0, "step": 22655 }, { "epoch": 2.882076071746597, "ewc_loss": 0.07561002671718597, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003898893774021417, "grad_norm": 8.887137413024902, "learning_rate": 1e-06, "loss": 0.4911, "mean_token_accuracy": 0.8588364124298096, "num_tokens": 864469305.0, "step": 22656 }, { "epoch": 2.8822032820251877, "ewc_loss": 0.07568053156137466, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003905943885911256, "grad_norm": 8.953129768371582, "learning_rate": 1e-06, "loss": 0.4646, "mean_token_accuracy": 0.8671619296073914, "num_tokens": 864507246.0, "step": 22657 }, { "epoch": 2.8823304923037782, "ewc_loss": 0.0753972977399826, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003877620620187372, "grad_norm": 8.844038963317871, "learning_rate": 1e-06, "loss": 0.4798, "mean_token_accuracy": 0.8624780178070068, "num_tokens": 864547315.0, "step": 22658 }, { "epoch": 2.8824577025823688, "ewc_loss": 0.07565988600254059, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000390387955121696, "grad_norm": 8.921640396118164, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.8600473403930664, "num_tokens": 864590813.0, "step": 22659 }, { "epoch": 2.8825849128609593, "ewc_loss": 0.07557456195354462, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038953463081270456, "grad_norm": 8.947175025939941, "learning_rate": 1e-06, "loss": 0.494, "mean_token_accuracy": 0.856482744216919, "num_tokens": 864625078.0, "step": 22660 }, { "epoch": 2.88271212313955, "ewc_loss": 0.07552549242973328, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003890439693350345, "grad_norm": 8.87386417388916, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.874045729637146, "num_tokens": 864663714.0, "step": 22661 }, { "epoch": 2.88283933341814, "ewc_loss": 0.07569237053394318, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039071275386959314, "grad_norm": 8.969696044921875, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8650553226470947, "num_tokens": 864697026.0, "step": 22662 }, { "epoch": 2.882966543696731, "ewc_loss": 0.07520858943462372, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038587491144426167, "grad_norm": 8.845976829528809, "learning_rate": 1e-06, "loss": 0.4397, "mean_token_accuracy": 0.8752274513244629, "num_tokens": 864732337.0, "step": 22663 }, { "epoch": 2.883093753975321, "ewc_loss": 0.07602865993976593, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039407567237503827, "grad_norm": 9.000934600830078, "learning_rate": 1e-06, "loss": 0.4861, "mean_token_accuracy": 0.8598588109016418, "num_tokens": 864776598.0, "step": 22664 }, { "epoch": 2.883220964253912, "ewc_loss": 0.07526962459087372, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038648530608043075, "grad_norm": 8.908082008361816, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8536825776100159, "num_tokens": 864817279.0, "step": 22665 }, { "epoch": 2.883348174532502, "ewc_loss": 0.07579576969146729, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003917468129657209, "grad_norm": 8.992417335510254, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8739728927612305, "num_tokens": 864854914.0, "step": 22666 }, { "epoch": 2.883475384811093, "ewc_loss": 0.07538878172636032, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038767687510699034, "grad_norm": 8.902066230773926, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8693742752075195, "num_tokens": 864892914.0, "step": 22667 }, { "epoch": 2.883602595089683, "ewc_loss": 0.07560605555772781, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003898496215697378, "grad_norm": 8.990741729736328, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8641932010650635, "num_tokens": 864934189.0, "step": 22668 }, { "epoch": 2.8837298053682736, "ewc_loss": 0.07523757219314575, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003861648146994412, "grad_norm": 8.906810760498047, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.875829815864563, "num_tokens": 864972085.0, "step": 22669 }, { "epoch": 2.883857015646864, "ewc_loss": 0.07546846568584442, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003884737379848957, "grad_norm": 8.969714164733887, "learning_rate": 1e-06, "loss": 0.5184, "mean_token_accuracy": 0.8497960567474365, "num_tokens": 865008674.0, "step": 22670 }, { "epoch": 2.8839842259254547, "ewc_loss": 0.07524028420448303, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003861919103655964, "grad_norm": 8.916642189025879, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8560923337936401, "num_tokens": 865046004.0, "step": 22671 }, { "epoch": 2.884111436204045, "ewc_loss": 0.07541122287511826, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003879012947436422, "grad_norm": 8.939010620117188, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8691223859786987, "num_tokens": 865082409.0, "step": 22672 }, { "epoch": 2.8842386464826357, "ewc_loss": 0.07522639632225037, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003860530268866569, "grad_norm": 8.943550109863281, "learning_rate": 1e-06, "loss": 0.4654, "mean_token_accuracy": 0.8656802773475647, "num_tokens": 865124457.0, "step": 22673 }, { "epoch": 2.8843658567612263, "ewc_loss": 0.07539813965559006, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003877704730257392, "grad_norm": 8.939305305480957, "learning_rate": 1e-06, "loss": 0.4329, "mean_token_accuracy": 0.8778602480888367, "num_tokens": 865162290.0, "step": 22674 }, { "epoch": 2.884493067039817, "ewc_loss": 0.07523086667060852, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003860977303702384, "grad_norm": 8.906071662902832, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.8624362945556641, "num_tokens": 865201686.0, "step": 22675 }, { "epoch": 2.8846202773184073, "ewc_loss": 0.0753956064581871, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003877451235894114, "grad_norm": 8.978619575500488, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8554906845092773, "num_tokens": 865236876.0, "step": 22676 }, { "epoch": 2.884747487596998, "ewc_loss": 0.0751638114452362, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003854271781165153, "grad_norm": 8.918909072875977, "learning_rate": 1e-06, "loss": 0.4504, "mean_token_accuracy": 0.8701126575469971, "num_tokens": 865276994.0, "step": 22677 }, { "epoch": 2.8848746978755884, "ewc_loss": 0.07541123032569885, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038790141115896404, "grad_norm": 8.930299758911133, "learning_rate": 1e-06, "loss": 0.455, "mean_token_accuracy": 0.8663964867591858, "num_tokens": 865311990.0, "step": 22678 }, { "epoch": 2.885001908154179, "ewc_loss": 0.07532927393913269, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003870817890856415, "grad_norm": 8.90408992767334, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8717527985572815, "num_tokens": 865352324.0, "step": 22679 }, { "epoch": 2.8851291184327694, "ewc_loss": 0.07546171545982361, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038840618799440563, "grad_norm": 8.959948539733887, "learning_rate": 1e-06, "loss": 0.4645, "mean_token_accuracy": 0.8657978177070618, "num_tokens": 865391428.0, "step": 22680 }, { "epoch": 2.88525632871136, "ewc_loss": 0.07529635727405548, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003867526538670063, "grad_norm": 8.933940887451172, "learning_rate": 1e-06, "loss": 0.4664, "mean_token_accuracy": 0.8625478744506836, "num_tokens": 865428478.0, "step": 22681 }, { "epoch": 2.8853835389899505, "ewc_loss": 0.07541688531637192, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038795790169388056, "grad_norm": 8.944101333618164, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8655472993850708, "num_tokens": 865466531.0, "step": 22682 }, { "epoch": 2.885510749268541, "ewc_loss": 0.0752769261598587, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003865583275910467, "grad_norm": 8.870919227600098, "learning_rate": 1e-06, "loss": 0.4451, "mean_token_accuracy": 0.874005913734436, "num_tokens": 865503177.0, "step": 22683 }, { "epoch": 2.8856379595471315, "ewc_loss": 0.07539160549640656, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003877051349263638, "grad_norm": 8.97633171081543, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8592708110809326, "num_tokens": 865539454.0, "step": 22684 }, { "epoch": 2.885765169825722, "ewc_loss": 0.07496625185012817, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038345158100128174, "grad_norm": 8.811033248901367, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8851499557495117, "num_tokens": 865571937.0, "step": 22685 }, { "epoch": 2.8858923801043126, "ewc_loss": 0.07558207213878632, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038960977690294385, "grad_norm": 9.020426750183105, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8693181276321411, "num_tokens": 865609457.0, "step": 22686 }, { "epoch": 2.8860195903829027, "ewc_loss": 0.07485298812389374, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003823189181275666, "grad_norm": 8.814526557922363, "learning_rate": 1e-06, "loss": 0.5078, "mean_token_accuracy": 0.8532516956329346, "num_tokens": 865647391.0, "step": 22687 }, { "epoch": 2.8861468006614936, "ewc_loss": 0.07576711475849152, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039146022754721344, "grad_norm": 9.077290534973145, "learning_rate": 1e-06, "loss": 0.461, "mean_token_accuracy": 0.8658452033996582, "num_tokens": 865680578.0, "step": 22688 }, { "epoch": 2.8862740109400837, "ewc_loss": 0.07482512295246124, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038204024895094335, "grad_norm": 8.8309907913208, "learning_rate": 1e-06, "loss": 0.478, "mean_token_accuracy": 0.8630546927452087, "num_tokens": 865714610.0, "step": 22689 }, { "epoch": 2.8864012212186747, "ewc_loss": 0.07571348547935486, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003909238730557263, "grad_norm": 9.042081832885742, "learning_rate": 1e-06, "loss": 0.4956, "mean_token_accuracy": 0.8617381453514099, "num_tokens": 865755672.0, "step": 22690 }, { "epoch": 2.886528431497265, "ewc_loss": 0.074927419424057, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038306330679915845, "grad_norm": 8.924528121948242, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8726690411567688, "num_tokens": 865790111.0, "step": 22691 }, { "epoch": 2.8866556417758558, "ewc_loss": 0.0753278136253357, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038706714985892177, "grad_norm": 8.921126365661621, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8647775650024414, "num_tokens": 865825580.0, "step": 22692 }, { "epoch": 2.886782852054446, "ewc_loss": 0.07516910135746002, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003854800888802856, "grad_norm": 8.91698932647705, "learning_rate": 1e-06, "loss": 0.4465, "mean_token_accuracy": 0.8704279661178589, "num_tokens": 865869099.0, "step": 22693 }, { "epoch": 2.8869100623330364, "ewc_loss": 0.07511475682258606, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003849366621579975, "grad_norm": 8.95451545715332, "learning_rate": 1e-06, "loss": 0.5079, "mean_token_accuracy": 0.856659471988678, "num_tokens": 865906957.0, "step": 22694 }, { "epoch": 2.887037272611627, "ewc_loss": 0.07509434223175049, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003847324405796826, "grad_norm": 8.865997314453125, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8759499192237854, "num_tokens": 865945712.0, "step": 22695 }, { "epoch": 2.8871644828902174, "ewc_loss": 0.0752718448638916, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003865074831992388, "grad_norm": 8.933545112609863, "learning_rate": 1e-06, "loss": 0.5128, "mean_token_accuracy": 0.853317141532898, "num_tokens": 865984469.0, "step": 22696 }, { "epoch": 2.887291693168808, "ewc_loss": 0.0751209408044815, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003849984786938876, "grad_norm": 8.861844062805176, "learning_rate": 1e-06, "loss": 0.469, "mean_token_accuracy": 0.8676598072052002, "num_tokens": 866024624.0, "step": 22697 }, { "epoch": 2.8874189034473985, "ewc_loss": 0.07534623146057129, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038725140620954335, "grad_norm": 8.93465518951416, "learning_rate": 1e-06, "loss": 0.4672, "mean_token_accuracy": 0.8632769584655762, "num_tokens": 866062446.0, "step": 22698 }, { "epoch": 2.887546113725989, "ewc_loss": 0.07518762350082397, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038566524744965136, "grad_norm": 8.86019229888916, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8757649660110474, "num_tokens": 866100896.0, "step": 22699 }, { "epoch": 2.8876733240045795, "ewc_loss": 0.07534315437078476, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003872206143569201, "grad_norm": 8.917631149291992, "learning_rate": 1e-06, "loss": 0.501, "mean_token_accuracy": 0.8521653413772583, "num_tokens": 866134668.0, "step": 22700 }, { "epoch": 2.88780053428317, "ewc_loss": 0.075176902115345, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038555808714590967, "grad_norm": 8.896943092346191, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8593716621398926, "num_tokens": 866173834.0, "step": 22701 }, { "epoch": 2.8879277445617606, "ewc_loss": 0.07537265121936798, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003875155816785991, "grad_norm": 8.90583610534668, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.8481297492980957, "num_tokens": 866211691.0, "step": 22702 }, { "epoch": 2.888054954840351, "ewc_loss": 0.07624292373657227, "ewc_loss_diag": 3.7670135498046875e-05, "ewc_loss_parallel": 0.00038645262247882783, "grad_norm": 10.149848937988281, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8648732304573059, "num_tokens": 866240878.0, "step": 22703 }, { "epoch": 2.8881821651189417, "ewc_loss": 0.07436507940292358, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00037743986467830837, "grad_norm": 8.597151756286621, "learning_rate": 1e-06, "loss": 0.5019, "mean_token_accuracy": 0.8525819778442383, "num_tokens": 866289552.0, "step": 22704 }, { "epoch": 2.888309375397532, "ewc_loss": 0.07755663990974426, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004093554161954671, "grad_norm": 9.383851051330566, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8565041422843933, "num_tokens": 866330361.0, "step": 22705 }, { "epoch": 2.8884365856761227, "ewc_loss": 0.07417040318250656, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000375493080355227, "grad_norm": 8.692095756530762, "learning_rate": 1e-06, "loss": 0.4596, "mean_token_accuracy": 0.8689696788787842, "num_tokens": 866371557.0, "step": 22706 }, { "epoch": 2.8885637959547132, "ewc_loss": 0.07755543291568756, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040934342541731894, "grad_norm": 9.336636543273926, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8668691515922546, "num_tokens": 866410627.0, "step": 22707 }, { "epoch": 2.8886910062333038, "ewc_loss": 0.07476094365119934, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038139845128171146, "grad_norm": 8.75868034362793, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8639352321624756, "num_tokens": 866446036.0, "step": 22708 }, { "epoch": 2.8888182165118943, "ewc_loss": 0.07692739367485046, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040306299342773855, "grad_norm": 9.339024543762207, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.856472909450531, "num_tokens": 866489655.0, "step": 22709 }, { "epoch": 2.888945426790485, "ewc_loss": 0.07510484755039215, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003848375054076314, "grad_norm": 8.790013313293457, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8575247526168823, "num_tokens": 866532750.0, "step": 22710 }, { "epoch": 2.8890726370690754, "ewc_loss": 0.07665068656206131, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004002959467470646, "grad_norm": 9.21863842010498, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8726184964179993, "num_tokens": 866570748.0, "step": 22711 }, { "epoch": 2.8891998473476654, "ewc_loss": 0.07513175904750824, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038510659942403436, "grad_norm": 8.89736270904541, "learning_rate": 1e-06, "loss": 0.4858, "mean_token_accuracy": 0.8613640069961548, "num_tokens": 866609422.0, "step": 22712 }, { "epoch": 2.8893270576262564, "ewc_loss": 0.07618310302495956, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003956200962420553, "grad_norm": 9.101777076721191, "learning_rate": 1e-06, "loss": 0.4584, "mean_token_accuracy": 0.866247296333313, "num_tokens": 866650865.0, "step": 22713 }, { "epoch": 2.8894542679048465, "ewc_loss": 0.07532095909118652, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038699861033819616, "grad_norm": 8.927939414978027, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8641479015350342, "num_tokens": 866687605.0, "step": 22714 }, { "epoch": 2.8895814781834375, "ewc_loss": 0.07576486468315125, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003914377011824399, "grad_norm": 9.025967597961426, "learning_rate": 1e-06, "loss": 0.4889, "mean_token_accuracy": 0.8559733629226685, "num_tokens": 866722974.0, "step": 22715 }, { "epoch": 2.8897086884620276, "ewc_loss": 0.07528961449861526, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003866851911880076, "grad_norm": 8.952272415161133, "learning_rate": 1e-06, "loss": 0.4741, "mean_token_accuracy": 0.8644450306892395, "num_tokens": 866760694.0, "step": 22716 }, { "epoch": 2.889835898740618, "ewc_loss": 0.07559609413146973, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038974997005425394, "grad_norm": 9.028426170349121, "learning_rate": 1e-06, "loss": 0.5166, "mean_token_accuracy": 0.8475085496902466, "num_tokens": 866795102.0, "step": 22717 }, { "epoch": 2.8899631090192086, "ewc_loss": 0.07528969645500183, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003866859769914299, "grad_norm": 8.922769546508789, "learning_rate": 1e-06, "loss": 0.4206, "mean_token_accuracy": 0.8792122602462769, "num_tokens": 866831282.0, "step": 22718 }, { "epoch": 2.890090319297799, "ewc_loss": 0.07550811767578125, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003888702194672078, "grad_norm": 8.981551170349121, "learning_rate": 1e-06, "loss": 0.491, "mean_token_accuracy": 0.8583520650863647, "num_tokens": 866872661.0, "step": 22719 }, { "epoch": 2.8902175295763897, "ewc_loss": 0.07527244091033936, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038651347858831286, "grad_norm": 8.869354248046875, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8561621308326721, "num_tokens": 866917576.0, "step": 22720 }, { "epoch": 2.89034473985498, "ewc_loss": 0.07565155625343323, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039030457264743745, "grad_norm": 8.974385261535645, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.8698976039886475, "num_tokens": 866957245.0, "step": 22721 }, { "epoch": 2.8904719501335707, "ewc_loss": 0.07539704442024231, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000387759500881657, "grad_norm": 8.96837329864502, "learning_rate": 1e-06, "loss": 0.5459, "mean_token_accuracy": 0.8365135192871094, "num_tokens": 866998695.0, "step": 22722 }, { "epoch": 2.8905991604121613, "ewc_loss": 0.07539556175470352, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003877446870319545, "grad_norm": 8.941459655761719, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8520406484603882, "num_tokens": 867033502.0, "step": 22723 }, { "epoch": 2.890726370690752, "ewc_loss": 0.0754673033952713, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038846206734888256, "grad_norm": 8.980548858642578, "learning_rate": 1e-06, "loss": 0.5173, "mean_token_accuracy": 0.8476293087005615, "num_tokens": 867076032.0, "step": 22724 }, { "epoch": 2.8908535809693423, "ewc_loss": 0.07525317370891571, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003863208112306893, "grad_norm": 8.982465744018555, "learning_rate": 1e-06, "loss": 0.438, "mean_token_accuracy": 0.8662577867507935, "num_tokens": 867107429.0, "step": 22725 }, { "epoch": 2.890980791247933, "ewc_loss": 0.07555480301380157, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038933713221922517, "grad_norm": 8.963695526123047, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.8688285946846008, "num_tokens": 867139432.0, "step": 22726 }, { "epoch": 2.8911080015265234, "ewc_loss": 0.07529979199171066, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003867869672831148, "grad_norm": 8.897754669189453, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.8616956472396851, "num_tokens": 867180588.0, "step": 22727 }, { "epoch": 2.891235211805114, "ewc_loss": 0.07556772232055664, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038946623681113124, "grad_norm": 9.004104614257812, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.8616815209388733, "num_tokens": 867215170.0, "step": 22728 }, { "epoch": 2.8913624220837044, "ewc_loss": 0.07511995732784271, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003849885833915323, "grad_norm": 8.867026329040527, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8813580274581909, "num_tokens": 867254958.0, "step": 22729 }, { "epoch": 2.891489632362295, "ewc_loss": 0.07570654153823853, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039085448952391744, "grad_norm": 8.983424186706543, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8638833165168762, "num_tokens": 867291135.0, "step": 22730 }, { "epoch": 2.8916168426408855, "ewc_loss": 0.07517049461603165, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038549400051124394, "grad_norm": 8.886048316955566, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8731164932250977, "num_tokens": 867326926.0, "step": 22731 }, { "epoch": 2.891744052919476, "ewc_loss": 0.07588686048984528, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00039021624252200127, "grad_norm": 9.028297424316406, "learning_rate": 1e-06, "loss": 0.4924, "mean_token_accuracy": 0.8563975095748901, "num_tokens": 867361142.0, "step": 22732 }, { "epoch": 2.8918712631980665, "ewc_loss": 0.07519114017486572, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000385700463084504, "grad_norm": 8.8670654296875, "learning_rate": 1e-06, "loss": 0.4139, "mean_token_accuracy": 0.8834011554718018, "num_tokens": 867398344.0, "step": 22733 }, { "epoch": 2.891998473476657, "ewc_loss": 0.07576315850019455, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003914206463377923, "grad_norm": 8.985462188720703, "learning_rate": 1e-06, "loss": 0.4418, "mean_token_accuracy": 0.8741433620452881, "num_tokens": 867437642.0, "step": 22734 }, { "epoch": 2.8921256837552476, "ewc_loss": 0.07521243393421173, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038591338670812547, "grad_norm": 8.885905265808105, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8544119596481323, "num_tokens": 867471646.0, "step": 22735 }, { "epoch": 2.892252894033838, "ewc_loss": 0.07583322376012802, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003921212919522077, "grad_norm": 9.064871788024902, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.866985559463501, "num_tokens": 867509682.0, "step": 22736 }, { "epoch": 2.892380104312428, "ewc_loss": 0.07511335611343384, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038492257590405643, "grad_norm": 8.846863746643066, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8605399131774902, "num_tokens": 867544425.0, "step": 22737 }, { "epoch": 2.892507314591019, "ewc_loss": 0.07589864730834961, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039277554606087506, "grad_norm": 9.113380432128906, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.858095645904541, "num_tokens": 867584978.0, "step": 22738 }, { "epoch": 2.8926345248696093, "ewc_loss": 0.07496587187051773, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038344779750332236, "grad_norm": 8.84952163696289, "learning_rate": 1e-06, "loss": 0.4622, "mean_token_accuracy": 0.8672746419906616, "num_tokens": 867620360.0, "step": 22739 }, { "epoch": 2.8927617351482002, "ewc_loss": 0.07604141533374786, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003942032635677606, "grad_norm": 9.202342987060547, "learning_rate": 1e-06, "loss": 0.4328, "mean_token_accuracy": 0.8736001253128052, "num_tokens": 867655246.0, "step": 22740 }, { "epoch": 2.8928889454267903, "ewc_loss": 0.07478415220975876, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038163058343343437, "grad_norm": 8.814330101013184, "learning_rate": 1e-06, "loss": 0.4879, "mean_token_accuracy": 0.8557608127593994, "num_tokens": 867692792.0, "step": 22741 }, { "epoch": 2.893016155705381, "ewc_loss": 0.0761379823088646, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003951688704546541, "grad_norm": 9.181804656982422, "learning_rate": 1e-06, "loss": 0.4589, "mean_token_accuracy": 0.8677053451538086, "num_tokens": 867732365.0, "step": 22742 }, { "epoch": 2.8931433659839714, "ewc_loss": 0.0750606432557106, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003819541016127914, "grad_norm": 8.839078903198242, "learning_rate": 1e-06, "loss": 0.5127, "mean_token_accuracy": 0.8519918918609619, "num_tokens": 867764800.0, "step": 22743 }, { "epoch": 2.893270576262562, "ewc_loss": 0.07619363069534302, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039572533569298685, "grad_norm": 9.26661491394043, "learning_rate": 1e-06, "loss": 0.4906, "mean_token_accuracy": 0.8583433628082275, "num_tokens": 867807107.0, "step": 22744 }, { "epoch": 2.8933977865411524, "ewc_loss": 0.07480814307928085, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003818704863078892, "grad_norm": 8.768539428710938, "learning_rate": 1e-06, "loss": 0.4996, "mean_token_accuracy": 0.855534017086029, "num_tokens": 867850225.0, "step": 22745 }, { "epoch": 2.893524996819743, "ewc_loss": 0.07657383382320404, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003970859688706696, "grad_norm": 9.30113697052002, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8614054322242737, "num_tokens": 867890457.0, "step": 22746 }, { "epoch": 2.8936522070983335, "ewc_loss": 0.0750340074300766, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00038168777246028185, "grad_norm": 8.76820182800293, "learning_rate": 1e-06, "loss": 0.4416, "mean_token_accuracy": 0.870975136756897, "num_tokens": 867925880.0, "step": 22747 }, { "epoch": 2.893779417376924, "ewc_loss": 0.07665592432022095, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00039790687151253223, "grad_norm": 9.228446006774902, "learning_rate": 1e-06, "loss": 0.4202, "mean_token_accuracy": 0.8796826601028442, "num_tokens": 867963923.0, "step": 22748 }, { "epoch": 2.8939066276555145, "ewc_loss": 0.07471925765275955, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003809816262219101, "grad_norm": 8.810690879821777, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8686858415603638, "num_tokens": 868000489.0, "step": 22749 }, { "epoch": 2.894033837934105, "ewc_loss": 0.07665205746889114, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003978682216256857, "grad_norm": 9.159512519836426, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8683335781097412, "num_tokens": 868034001.0, "step": 22750 }, { "epoch": 2.8941610482126956, "ewc_loss": 0.07492749392986298, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003830640052910894, "grad_norm": 8.837947845458984, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.869463324546814, "num_tokens": 868074956.0, "step": 22751 }, { "epoch": 2.894288258491286, "ewc_loss": 0.07606308162212372, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003944199124816805, "grad_norm": 9.071682929992676, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.873604953289032, "num_tokens": 868110727.0, "step": 22752 }, { "epoch": 2.8944154687698767, "ewc_loss": 0.07527915388345718, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003865805920213461, "grad_norm": 8.919450759887695, "learning_rate": 1e-06, "loss": 0.5281, "mean_token_accuracy": 0.8454525470733643, "num_tokens": 868151480.0, "step": 22753 }, { "epoch": 2.894542679048467, "ewc_loss": 0.075685515999794, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000390644243452698, "grad_norm": 9.049470901489258, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8567762970924377, "num_tokens": 868190620.0, "step": 22754 }, { "epoch": 2.8946698893270577, "ewc_loss": 0.07525965571403503, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003863856545649469, "grad_norm": 8.90869426727295, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8642385005950928, "num_tokens": 868230598.0, "step": 22755 }, { "epoch": 2.8947970996056482, "ewc_loss": 0.075714111328125, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039093021769076586, "grad_norm": 9.081096649169922, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8588488101959229, "num_tokens": 868265933.0, "step": 22756 }, { "epoch": 2.8949243098842388, "ewc_loss": 0.07517446577548981, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038553366903215647, "grad_norm": 8.90696907043457, "learning_rate": 1e-06, "loss": 0.4461, "mean_token_accuracy": 0.8683911561965942, "num_tokens": 868300282.0, "step": 22757 }, { "epoch": 2.8950515201628293, "ewc_loss": 0.07567225396633148, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003905115881934762, "grad_norm": 9.028042793273926, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8804522752761841, "num_tokens": 868331031.0, "step": 22758 }, { "epoch": 2.89517873044142, "ewc_loss": 0.07540867477655411, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00038543439586646855, "grad_norm": 8.958828926086426, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8668249845504761, "num_tokens": 868368268.0, "step": 22759 }, { "epoch": 2.89530594072001, "ewc_loss": 0.07539014518260956, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003876904956996441, "grad_norm": 9.007213592529297, "learning_rate": 1e-06, "loss": 0.5141, "mean_token_accuracy": 0.8490335941314697, "num_tokens": 868407551.0, "step": 22760 }, { "epoch": 2.895433150998601, "ewc_loss": 0.07521791756153107, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003859682474285364, "grad_norm": 8.918177604675293, "learning_rate": 1e-06, "loss": 0.4708, "mean_token_accuracy": 0.8630085587501526, "num_tokens": 868449301.0, "step": 22761 }, { "epoch": 2.895560361277191, "ewc_loss": 0.07543504238128662, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003881394804921001, "grad_norm": 8.980297088623047, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8625320196151733, "num_tokens": 868482853.0, "step": 22762 }, { "epoch": 2.895687571555782, "ewc_loss": 0.07517725229263306, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000385561550501734, "grad_norm": 8.923441886901855, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8599951267242432, "num_tokens": 868523960.0, "step": 22763 }, { "epoch": 2.895814781834372, "ewc_loss": 0.07535423338413239, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038733144174329937, "grad_norm": 8.916075706481934, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8580424189567566, "num_tokens": 868566447.0, "step": 22764 }, { "epoch": 2.895941992112963, "ewc_loss": 0.07530327141284943, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038682177546434104, "grad_norm": 8.971863746643066, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.8747087717056274, "num_tokens": 868608342.0, "step": 22765 }, { "epoch": 2.896069202391553, "ewc_loss": 0.07542955875396729, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003880846779793501, "grad_norm": 8.969405174255371, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8767763376235962, "num_tokens": 868647354.0, "step": 22766 }, { "epoch": 2.8961964126701436, "ewc_loss": 0.07529480755329132, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038673714152537286, "grad_norm": 8.878169059753418, "learning_rate": 1e-06, "loss": 0.5272, "mean_token_accuracy": 0.8477057814598083, "num_tokens": 868690716.0, "step": 22767 }, { "epoch": 2.896323622948734, "ewc_loss": 0.075532928109169, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038911832962185144, "grad_norm": 9.016351699829102, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8720282316207886, "num_tokens": 868728318.0, "step": 22768 }, { "epoch": 2.8964508332273247, "ewc_loss": 0.07521694898605347, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003859584976453334, "grad_norm": 8.857305526733398, "learning_rate": 1e-06, "loss": 0.4158, "mean_token_accuracy": 0.8801277875900269, "num_tokens": 868764323.0, "step": 22769 }, { "epoch": 2.896578043505915, "ewc_loss": 0.07569585740566254, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039074764936231077, "grad_norm": 9.03248119354248, "learning_rate": 1e-06, "loss": 0.5271, "mean_token_accuracy": 0.8484113812446594, "num_tokens": 868801371.0, "step": 22770 }, { "epoch": 2.8967052537845057, "ewc_loss": 0.07513975352048874, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003851866058539599, "grad_norm": 8.833297729492188, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8709864616394043, "num_tokens": 868846679.0, "step": 22771 }, { "epoch": 2.8968324640630962, "ewc_loss": 0.07599522173404694, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039374124025925994, "grad_norm": 9.132747650146484, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8512272238731384, "num_tokens": 868880076.0, "step": 22772 }, { "epoch": 2.8969596743416868, "ewc_loss": 0.07489658147096634, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038275489350780845, "grad_norm": 8.859057426452637, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8657439351081848, "num_tokens": 868917086.0, "step": 22773 }, { "epoch": 2.8970868846202773, "ewc_loss": 0.0759972631931305, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039376167114824057, "grad_norm": 9.03622817993164, "learning_rate": 1e-06, "loss": 0.4425, "mean_token_accuracy": 0.8710566759109497, "num_tokens": 868954952.0, "step": 22774 }, { "epoch": 2.897214094898868, "ewc_loss": 0.07505697011947632, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038435874739661813, "grad_norm": 8.869027137756348, "learning_rate": 1e-06, "loss": 0.4824, "mean_token_accuracy": 0.8612829446792603, "num_tokens": 868990911.0, "step": 22775 }, { "epoch": 2.8973413051774584, "ewc_loss": 0.07588377594947815, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039262682548724115, "grad_norm": 9.10869026184082, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8762816190719604, "num_tokens": 869033398.0, "step": 22776 }, { "epoch": 2.897468515456049, "ewc_loss": 0.07504190504550934, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038420813507400453, "grad_norm": 8.864161491394043, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8532555103302002, "num_tokens": 869070513.0, "step": 22777 }, { "epoch": 2.8975957257346394, "ewc_loss": 0.07580384612083435, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039182748878374696, "grad_norm": 8.980425834655762, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8728137016296387, "num_tokens": 869113494.0, "step": 22778 }, { "epoch": 2.89772293601323, "ewc_loss": 0.07529447972774506, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038673385279253125, "grad_norm": 8.936922073364258, "learning_rate": 1e-06, "loss": 0.4406, "mean_token_accuracy": 0.8756819367408752, "num_tokens": 869147433.0, "step": 22779 }, { "epoch": 2.8978501462918205, "ewc_loss": 0.07560917735099792, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003898808208759874, "grad_norm": 8.991559982299805, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.861728310585022, "num_tokens": 869186763.0, "step": 22780 }, { "epoch": 2.897977356570411, "ewc_loss": 0.07534047216176987, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000387193780625239, "grad_norm": 8.908397674560547, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8790721893310547, "num_tokens": 869222624.0, "step": 22781 }, { "epoch": 2.8981045668490015, "ewc_loss": 0.07560224831104279, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038981152465566993, "grad_norm": 8.941179275512695, "learning_rate": 1e-06, "loss": 0.402, "mean_token_accuracy": 0.8867000341415405, "num_tokens": 869263027.0, "step": 22782 }, { "epoch": 2.898231777127592, "ewc_loss": 0.07542221993207932, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003880112781189382, "grad_norm": 8.929566383361816, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8612745404243469, "num_tokens": 869304232.0, "step": 22783 }, { "epoch": 2.8983589874061826, "ewc_loss": 0.07551512867212296, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003889403596986085, "grad_norm": 8.964958190917969, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8714136481285095, "num_tokens": 869340141.0, "step": 22784 }, { "epoch": 2.8984861976847727, "ewc_loss": 0.07541334629058838, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003879224823322147, "grad_norm": 8.98427677154541, "learning_rate": 1e-06, "loss": 0.4829, "mean_token_accuracy": 0.8573410511016846, "num_tokens": 869382428.0, "step": 22785 }, { "epoch": 2.8986134079633636, "ewc_loss": 0.07530585676431656, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003868476487696171, "grad_norm": 8.953319549560547, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.872795581817627, "num_tokens": 869418890.0, "step": 22786 }, { "epoch": 2.8987406182419537, "ewc_loss": 0.07550057768821716, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038879484054632485, "grad_norm": 9.0437650680542, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8660175800323486, "num_tokens": 869453704.0, "step": 22787 }, { "epoch": 2.8988678285205447, "ewc_loss": 0.07524137198925018, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003862028243020177, "grad_norm": 8.97886848449707, "learning_rate": 1e-06, "loss": 0.4877, "mean_token_accuracy": 0.8610008955001831, "num_tokens": 869491845.0, "step": 22788 }, { "epoch": 2.898995038799135, "ewc_loss": 0.07541186362504959, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003879076975863427, "grad_norm": 9.00719928741455, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8770127296447754, "num_tokens": 869523393.0, "step": 22789 }, { "epoch": 2.8991222490777258, "ewc_loss": 0.07524474710226059, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003862365265376866, "grad_norm": 8.994147300720215, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8633086681365967, "num_tokens": 869561927.0, "step": 22790 }, { "epoch": 2.899249459356316, "ewc_loss": 0.07529187947511673, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003867078630719334, "grad_norm": 8.988471031188965, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8644585609436035, "num_tokens": 869599568.0, "step": 22791 }, { "epoch": 2.8993766696349064, "ewc_loss": 0.07524299621582031, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003862190351355821, "grad_norm": 8.934392929077148, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8692948818206787, "num_tokens": 869636329.0, "step": 22792 }, { "epoch": 2.899503879913497, "ewc_loss": 0.07541607320308685, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038794975262135267, "grad_norm": 9.023820877075195, "learning_rate": 1e-06, "loss": 0.4799, "mean_token_accuracy": 0.8604010343551636, "num_tokens": 869679720.0, "step": 22793 }, { "epoch": 2.8996310901920874, "ewc_loss": 0.07511109858751297, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003849000495392829, "grad_norm": 8.936346054077148, "learning_rate": 1e-06, "loss": 0.4726, "mean_token_accuracy": 0.8681155443191528, "num_tokens": 869719527.0, "step": 22794 }, { "epoch": 2.899758300470678, "ewc_loss": 0.0755997821688652, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038978690281510353, "grad_norm": 9.047591209411621, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8703755140304565, "num_tokens": 869755139.0, "step": 22795 }, { "epoch": 2.8998855107492685, "ewc_loss": 0.07496412098407745, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003834302187897265, "grad_norm": 8.94445514678955, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.8601735830307007, "num_tokens": 869791860.0, "step": 22796 }, { "epoch": 2.900012721027859, "ewc_loss": 0.07553975284099579, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003891865781042725, "grad_norm": 9.037453651428223, "learning_rate": 1e-06, "loss": 0.4012, "mean_token_accuracy": 0.8878966569900513, "num_tokens": 869827294.0, "step": 22797 }, { "epoch": 2.9001399313064495, "ewc_loss": 0.07510804384946823, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038486949051730335, "grad_norm": 8.968440055847168, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8602285385131836, "num_tokens": 869859398.0, "step": 22798 }, { "epoch": 2.90026714158504, "ewc_loss": 0.07533884793519974, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038717754068784416, "grad_norm": 8.983400344848633, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8650364279747009, "num_tokens": 869898215.0, "step": 22799 }, { "epoch": 2.9003943518636306, "ewc_loss": 0.07533545792102814, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038714363472536206, "grad_norm": 8.989628791809082, "learning_rate": 1e-06, "loss": 0.4962, "mean_token_accuracy": 0.8545820713043213, "num_tokens": 869939787.0, "step": 22800 }, { "epoch": 2.900521562142221, "ewc_loss": 0.07527005672454834, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003864896425511688, "grad_norm": 8.998976707458496, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8571067452430725, "num_tokens": 869976790.0, "step": 22801 }, { "epoch": 2.9006487724208116, "ewc_loss": 0.07531477510929108, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038693679380230606, "grad_norm": 8.903963088989258, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8703811168670654, "num_tokens": 870013547.0, "step": 22802 }, { "epoch": 2.900775982699402, "ewc_loss": 0.07544712722301483, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003882603778038174, "grad_norm": 9.00333023071289, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8780156970024109, "num_tokens": 870049540.0, "step": 22803 }, { "epoch": 2.9009031929779927, "ewc_loss": 0.07523496448993683, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038613867945969105, "grad_norm": 8.879449844360352, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.8739740252494812, "num_tokens": 870088247.0, "step": 22804 }, { "epoch": 2.9010304032565832, "ewc_loss": 0.07561953365802765, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003899844014085829, "grad_norm": 8.997957229614258, "learning_rate": 1e-06, "loss": 0.4698, "mean_token_accuracy": 0.8654642105102539, "num_tokens": 870129112.0, "step": 22805 }, { "epoch": 2.9011576135351738, "ewc_loss": 0.07535885274410248, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003873776295222342, "grad_norm": 8.940900802612305, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.8656126260757446, "num_tokens": 870162082.0, "step": 22806 }, { "epoch": 2.9012848238137643, "ewc_loss": 0.0756058618426323, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003898476716130972, "grad_norm": 9.013081550598145, "learning_rate": 1e-06, "loss": 0.4347, "mean_token_accuracy": 0.8768796920776367, "num_tokens": 870199868.0, "step": 22807 }, { "epoch": 2.901412034092355, "ewc_loss": 0.07526575028896332, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038644656888209283, "grad_norm": 8.896628379821777, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8719574213027954, "num_tokens": 870238121.0, "step": 22808 }, { "epoch": 2.9015392443709453, "ewc_loss": 0.07562728971242905, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003900619631167501, "grad_norm": 8.970514297485352, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8562202453613281, "num_tokens": 870278008.0, "step": 22809 }, { "epoch": 2.9016664546495354, "ewc_loss": 0.07539471983909607, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038773627602495253, "grad_norm": 8.93772029876709, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8616725206375122, "num_tokens": 870317671.0, "step": 22810 }, { "epoch": 2.9017936649281264, "ewc_loss": 0.07550691813230515, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003888582286890596, "grad_norm": 9.050395965576172, "learning_rate": 1e-06, "loss": 0.4495, "mean_token_accuracy": 0.8719459772109985, "num_tokens": 870352893.0, "step": 22811 }, { "epoch": 2.9019208752067165, "ewc_loss": 0.0753316879272461, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003871058870572597, "grad_norm": 8.936367988586426, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.8733285069465637, "num_tokens": 870387513.0, "step": 22812 }, { "epoch": 2.9020480854853075, "ewc_loss": 0.07592476159334183, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003905952617060393, "grad_norm": 9.027348518371582, "learning_rate": 1e-06, "loss": 0.4409, "mean_token_accuracy": 0.8731356263160706, "num_tokens": 870428649.0, "step": 22813 }, { "epoch": 2.9021752957638975, "ewc_loss": 0.07514217495918274, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000385210762033239, "grad_norm": 8.89706039428711, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8811059594154358, "num_tokens": 870468329.0, "step": 22814 }, { "epoch": 2.902302506042488, "ewc_loss": 0.07576917111873627, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039148080395534635, "grad_norm": 9.012314796447754, "learning_rate": 1e-06, "loss": 0.4727, "mean_token_accuracy": 0.8660576343536377, "num_tokens": 870509273.0, "step": 22815 }, { "epoch": 2.9024297163210786, "ewc_loss": 0.07517264783382416, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038551559555344284, "grad_norm": 8.950338363647461, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8601221442222595, "num_tokens": 870544206.0, "step": 22816 }, { "epoch": 2.902556926599669, "ewc_loss": 0.07556501775979996, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003894392284564674, "grad_norm": 8.981260299682617, "learning_rate": 1e-06, "loss": 0.4176, "mean_token_accuracy": 0.8817685842514038, "num_tokens": 870578708.0, "step": 22817 }, { "epoch": 2.9026841368782597, "ewc_loss": 0.07538244128227234, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038761348696425557, "grad_norm": 8.932852745056152, "learning_rate": 1e-06, "loss": 0.4392, "mean_token_accuracy": 0.8730508089065552, "num_tokens": 870620599.0, "step": 22818 }, { "epoch": 2.90281134715685, "ewc_loss": 0.07551668584346771, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038895593024790287, "grad_norm": 8.978856086730957, "learning_rate": 1e-06, "loss": 0.416, "mean_token_accuracy": 0.879292905330658, "num_tokens": 870657828.0, "step": 22819 }, { "epoch": 2.9029385574354407, "ewc_loss": 0.07543651759624481, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003881542070303112, "grad_norm": 8.930586814880371, "learning_rate": 1e-06, "loss": 0.4852, "mean_token_accuracy": 0.8615089654922485, "num_tokens": 870694786.0, "step": 22820 }, { "epoch": 2.9030657677140312, "ewc_loss": 0.07552213966846466, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038901044172234833, "grad_norm": 8.957113265991211, "learning_rate": 1e-06, "loss": 0.4167, "mean_token_accuracy": 0.8841917514801025, "num_tokens": 870733850.0, "step": 22821 }, { "epoch": 2.9031929779926218, "ewc_loss": 0.07560756057500839, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003898646682500839, "grad_norm": 9.009238243103027, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8676071166992188, "num_tokens": 870771556.0, "step": 22822 }, { "epoch": 2.9033201882712123, "ewc_loss": 0.0755314975976944, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038910401053726673, "grad_norm": 8.981918334960938, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8762906789779663, "num_tokens": 870809335.0, "step": 22823 }, { "epoch": 2.903447398549803, "ewc_loss": 0.07557070255279541, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038949609734117985, "grad_norm": 9.041790962219238, "learning_rate": 1e-06, "loss": 0.4704, "mean_token_accuracy": 0.862101674079895, "num_tokens": 870844239.0, "step": 22824 }, { "epoch": 2.9035746088283934, "ewc_loss": 0.07529684901237488, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038675754331052303, "grad_norm": 8.930619239807129, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8671144247055054, "num_tokens": 870879540.0, "step": 22825 }, { "epoch": 2.903701819106984, "ewc_loss": 0.07576042413711548, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039139328873716295, "grad_norm": 9.026785850524902, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8659701943397522, "num_tokens": 870917219.0, "step": 22826 }, { "epoch": 2.9038290293855744, "ewc_loss": 0.07536423206329346, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038743135519325733, "grad_norm": 8.937480926513672, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8674413561820984, "num_tokens": 870952332.0, "step": 22827 }, { "epoch": 2.903956239664165, "ewc_loss": 0.07568138837814331, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003906029451172799, "grad_norm": 9.050413131713867, "learning_rate": 1e-06, "loss": 0.509, "mean_token_accuracy": 0.8536399006843567, "num_tokens": 870991609.0, "step": 22828 }, { "epoch": 2.9040834499427555, "ewc_loss": 0.07534968852996826, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003872858942486346, "grad_norm": 8.961130142211914, "learning_rate": 1e-06, "loss": 0.4493, "mean_token_accuracy": 0.8703967332839966, "num_tokens": 871027456.0, "step": 22829 }, { "epoch": 2.904210660221346, "ewc_loss": 0.0755956619977951, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003897456917911768, "grad_norm": 8.949502944946289, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8576937913894653, "num_tokens": 871071833.0, "step": 22830 }, { "epoch": 2.9043378704999365, "ewc_loss": 0.07542702555656433, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038805935764685273, "grad_norm": 8.980094909667969, "learning_rate": 1e-06, "loss": 0.4661, "mean_token_accuracy": 0.8649332523345947, "num_tokens": 871105446.0, "step": 22831 }, { "epoch": 2.904465080778527, "ewc_loss": 0.07541278749704361, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003879169235005975, "grad_norm": 8.922517776489258, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8645082116127014, "num_tokens": 871141408.0, "step": 22832 }, { "epoch": 2.9045922910571176, "ewc_loss": 0.07559442520141602, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038973335176706314, "grad_norm": 9.022459983825684, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8686583638191223, "num_tokens": 871173296.0, "step": 22833 }, { "epoch": 2.904719501335708, "ewc_loss": 0.07526995241641998, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038648859481327236, "grad_norm": 8.924615859985352, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8624235391616821, "num_tokens": 871210711.0, "step": 22834 }, { "epoch": 2.904846711614298, "ewc_loss": 0.07575444877147675, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003913335094694048, "grad_norm": 8.984798431396484, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8623063564300537, "num_tokens": 871251179.0, "step": 22835 }, { "epoch": 2.904973921892889, "ewc_loss": 0.07542628794908524, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038805193617008626, "grad_norm": 8.89861011505127, "learning_rate": 1e-06, "loss": 0.404, "mean_token_accuracy": 0.8857377767562866, "num_tokens": 871295758.0, "step": 22836 }, { "epoch": 2.9051011321714793, "ewc_loss": 0.07574525475502014, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000391241570468992, "grad_norm": 8.98689079284668, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8502169847488403, "num_tokens": 871331145.0, "step": 22837 }, { "epoch": 2.9052283424500702, "ewc_loss": 0.07544521242380142, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003882411983795464, "grad_norm": 9.031947135925293, "learning_rate": 1e-06, "loss": 0.5404, "mean_token_accuracy": 0.8414644598960876, "num_tokens": 871372819.0, "step": 22838 }, { "epoch": 2.9053555527286603, "ewc_loss": 0.07558144629001617, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038960357778705657, "grad_norm": 8.983577728271484, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8605147004127502, "num_tokens": 871414223.0, "step": 22839 }, { "epoch": 2.905482763007251, "ewc_loss": 0.07547302544116974, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003885192854795605, "grad_norm": 8.900103569030762, "learning_rate": 1e-06, "loss": 0.4486, "mean_token_accuracy": 0.8704891204833984, "num_tokens": 871455013.0, "step": 22840 }, { "epoch": 2.9056099732858414, "ewc_loss": 0.07558037340641022, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003895928093697876, "grad_norm": 9.106489181518555, "learning_rate": 1e-06, "loss": 0.4764, "mean_token_accuracy": 0.8649460077285767, "num_tokens": 871495462.0, "step": 22841 }, { "epoch": 2.905737183564432, "ewc_loss": 0.07520751655101776, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003858642594423145, "grad_norm": 8.883560180664062, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8688998818397522, "num_tokens": 871534999.0, "step": 22842 }, { "epoch": 2.9058643938430224, "ewc_loss": 0.07590737193822861, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003928627702407539, "grad_norm": 9.091790199279785, "learning_rate": 1e-06, "loss": 0.4624, "mean_token_accuracy": 0.8664877414703369, "num_tokens": 871569816.0, "step": 22843 }, { "epoch": 2.905991604121613, "ewc_loss": 0.07515895366668701, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003853785456158221, "grad_norm": 8.949444770812988, "learning_rate": 1e-06, "loss": 0.4565, "mean_token_accuracy": 0.8691219091415405, "num_tokens": 871610441.0, "step": 22844 }, { "epoch": 2.9061188144002035, "ewc_loss": 0.07571880519390106, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039097710396163166, "grad_norm": 8.980887413024902, "learning_rate": 1e-06, "loss": 0.505, "mean_token_accuracy": 0.8518826961517334, "num_tokens": 871649005.0, "step": 22845 }, { "epoch": 2.906246024678794, "ewc_loss": 0.07545682042837143, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038835726445540786, "grad_norm": 8.941146850585938, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8613191843032837, "num_tokens": 871689263.0, "step": 22846 }, { "epoch": 2.9063732349573845, "ewc_loss": 0.07568380236625671, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000390627101296559, "grad_norm": 8.991580963134766, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8755178451538086, "num_tokens": 871725440.0, "step": 22847 }, { "epoch": 2.906500445235975, "ewc_loss": 0.07567095756530762, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003904986078850925, "grad_norm": 8.947282791137695, "learning_rate": 1e-06, "loss": 0.4802, "mean_token_accuracy": 0.8631771802902222, "num_tokens": 871767839.0, "step": 22848 }, { "epoch": 2.9066276555145656, "ewc_loss": 0.0756157636642456, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003899467410519719, "grad_norm": 8.984983444213867, "learning_rate": 1e-06, "loss": 0.4453, "mean_token_accuracy": 0.8704568147659302, "num_tokens": 871801516.0, "step": 22849 }, { "epoch": 2.906754865793156, "ewc_loss": 0.07569912075996399, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003907802456524223, "grad_norm": 9.0128173828125, "learning_rate": 1e-06, "loss": 0.4795, "mean_token_accuracy": 0.862354040145874, "num_tokens": 871840359.0, "step": 22850 }, { "epoch": 2.9068820760717466, "ewc_loss": 0.07543572783470154, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038814637809991837, "grad_norm": 8.964029312133789, "learning_rate": 1e-06, "loss": 0.4531, "mean_token_accuracy": 0.870339035987854, "num_tokens": 871879396.0, "step": 22851 }, { "epoch": 2.907009286350337, "ewc_loss": 0.07561184465885162, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003899075381923467, "grad_norm": 9.007527351379395, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.8599116802215576, "num_tokens": 871915043.0, "step": 22852 }, { "epoch": 2.9071364966289277, "ewc_loss": 0.07534999400377274, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038728900835849345, "grad_norm": 8.965059280395508, "learning_rate": 1e-06, "loss": 0.4796, "mean_token_accuracy": 0.8607518076896667, "num_tokens": 871953391.0, "step": 22853 }, { "epoch": 2.9072637069075182, "ewc_loss": 0.07559765875339508, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003897655988112092, "grad_norm": 9.00218391418457, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8689486384391785, "num_tokens": 871989879.0, "step": 22854 }, { "epoch": 2.9073909171861088, "ewc_loss": 0.07543739676475525, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038816299638710916, "grad_norm": 8.950174331665039, "learning_rate": 1e-06, "loss": 0.442, "mean_token_accuracy": 0.8691362738609314, "num_tokens": 872022428.0, "step": 22855 }, { "epoch": 2.9075181274646993, "ewc_loss": 0.07558447122573853, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003896338166669011, "grad_norm": 8.928305625915527, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8715835809707642, "num_tokens": 872060644.0, "step": 22856 }, { "epoch": 2.90764533774329, "ewc_loss": 0.07548713684082031, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003886604681611061, "grad_norm": 9.003634452819824, "learning_rate": 1e-06, "loss": 0.4737, "mean_token_accuracy": 0.8632482290267944, "num_tokens": 872098230.0, "step": 22857 }, { "epoch": 2.90777254802188, "ewc_loss": 0.07531887292861938, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003869778010994196, "grad_norm": 8.921295166015625, "learning_rate": 1e-06, "loss": 0.4921, "mean_token_accuracy": 0.8552464246749878, "num_tokens": 872131217.0, "step": 22858 }, { "epoch": 2.907899758300471, "ewc_loss": 0.07569145411252975, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039070358616299927, "grad_norm": 9.051680564880371, "learning_rate": 1e-06, "loss": 0.4358, "mean_token_accuracy": 0.8741456270217896, "num_tokens": 872163912.0, "step": 22859 }, { "epoch": 2.908026968579061, "ewc_loss": 0.07511940598487854, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038498317007906735, "grad_norm": 8.868408203125, "learning_rate": 1e-06, "loss": 0.5452, "mean_token_accuracy": 0.8422720432281494, "num_tokens": 872200694.0, "step": 22860 }, { "epoch": 2.908154178857652, "ewc_loss": 0.07595361769199371, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003933252301067114, "grad_norm": 9.041640281677246, "learning_rate": 1e-06, "loss": 0.4571, "mean_token_accuracy": 0.8662281036376953, "num_tokens": 872241156.0, "step": 22861 }, { "epoch": 2.908281389136242, "ewc_loss": 0.0751427561044693, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038521664100699127, "grad_norm": 8.885438919067383, "learning_rate": 1e-06, "loss": 0.4549, "mean_token_accuracy": 0.8677722215652466, "num_tokens": 872273048.0, "step": 22862 }, { "epoch": 2.908408599414833, "ewc_loss": 0.07580381631851196, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039182722684927285, "grad_norm": 8.959233283996582, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8705803155899048, "num_tokens": 872312165.0, "step": 22863 }, { "epoch": 2.908535809693423, "ewc_loss": 0.07543890178203583, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038817807217128575, "grad_norm": 8.914886474609375, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8686914443969727, "num_tokens": 872352740.0, "step": 22864 }, { "epoch": 2.9086630199720136, "ewc_loss": 0.07561038434505463, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003898928698617965, "grad_norm": 8.96230411529541, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.8531437516212463, "num_tokens": 872397800.0, "step": 22865 }, { "epoch": 2.908790230250604, "ewc_loss": 0.07548454403877258, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038863453664816916, "grad_norm": 8.905381202697754, "learning_rate": 1e-06, "loss": 0.4322, "mean_token_accuracy": 0.8790042400360107, "num_tokens": 872429307.0, "step": 22866 }, { "epoch": 2.9089174405291947, "ewc_loss": 0.07574779540300369, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003912670072168112, "grad_norm": 9.029424667358398, "learning_rate": 1e-06, "loss": 0.4696, "mean_token_accuracy": 0.8626745939254761, "num_tokens": 872465897.0, "step": 22867 }, { "epoch": 2.909044650807785, "ewc_loss": 0.07530876249074936, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003868766943924129, "grad_norm": 8.891282081604004, "learning_rate": 1e-06, "loss": 0.4903, "mean_token_accuracy": 0.8573448061943054, "num_tokens": 872503252.0, "step": 22868 }, { "epoch": 2.9091718610863757, "ewc_loss": 0.07590007781982422, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039278980693779886, "grad_norm": 8.988423347473145, "learning_rate": 1e-06, "loss": 0.4532, "mean_token_accuracy": 0.8680886030197144, "num_tokens": 872546276.0, "step": 22869 }, { "epoch": 2.9092990713649662, "ewc_loss": 0.07542988657951355, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003880879667121917, "grad_norm": 8.968070030212402, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8639516830444336, "num_tokens": 872585690.0, "step": 22870 }, { "epoch": 2.9094262816435568, "ewc_loss": 0.07553353905677795, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038912449963390827, "grad_norm": 8.9979887008667, "learning_rate": 1e-06, "loss": 0.4998, "mean_token_accuracy": 0.855904221534729, "num_tokens": 872621718.0, "step": 22871 }, { "epoch": 2.9095534919221473, "ewc_loss": 0.07560475170612335, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038983661215752363, "grad_norm": 8.976988792419434, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8694704174995422, "num_tokens": 872659349.0, "step": 22872 }, { "epoch": 2.909680702200738, "ewc_loss": 0.07550011575222015, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038879027124494314, "grad_norm": 8.928360939025879, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8627312183380127, "num_tokens": 872704468.0, "step": 22873 }, { "epoch": 2.9098079124793284, "ewc_loss": 0.07551635056734085, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003889525542035699, "grad_norm": 8.970572471618652, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.866237223148346, "num_tokens": 872740003.0, "step": 22874 }, { "epoch": 2.909935122757919, "ewc_loss": 0.07545572519302368, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003883462632074952, "grad_norm": 8.925925254821777, "learning_rate": 1e-06, "loss": 0.5377, "mean_token_accuracy": 0.8445332050323486, "num_tokens": 872780362.0, "step": 22875 }, { "epoch": 2.9100623330365094, "ewc_loss": 0.07557797431945801, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003895688569173217, "grad_norm": 8.961976051330566, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8733900785446167, "num_tokens": 872823719.0, "step": 22876 }, { "epoch": 2.9101895433151, "ewc_loss": 0.07538491487503052, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038763825432397425, "grad_norm": 8.950724601745605, "learning_rate": 1e-06, "loss": 0.4591, "mean_token_accuracy": 0.869407057762146, "num_tokens": 872856276.0, "step": 22877 }, { "epoch": 2.9103167535936905, "ewc_loss": 0.07552377134561539, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038902676897123456, "grad_norm": 8.939358711242676, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.8587896823883057, "num_tokens": 872893688.0, "step": 22878 }, { "epoch": 2.910443963872281, "ewc_loss": 0.07533781975507736, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038716726703569293, "grad_norm": 8.960198402404785, "learning_rate": 1e-06, "loss": 0.4031, "mean_token_accuracy": 0.884042501449585, "num_tokens": 872931169.0, "step": 22879 }, { "epoch": 2.9105711741508715, "ewc_loss": 0.07530175149440765, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038680655416101217, "grad_norm": 8.900511741638184, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8614640831947327, "num_tokens": 872970836.0, "step": 22880 }, { "epoch": 2.910698384429462, "ewc_loss": 0.07565952837467194, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003903843753505498, "grad_norm": 8.946735382080078, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.8806085586547852, "num_tokens": 873010879.0, "step": 22881 }, { "epoch": 2.9108255947080526, "ewc_loss": 0.07531630992889404, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003869521606247872, "grad_norm": 8.894797325134277, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8680716156959534, "num_tokens": 873058018.0, "step": 22882 }, { "epoch": 2.9109528049866427, "ewc_loss": 0.07583989202976227, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039218796882778406, "grad_norm": 9.109109878540039, "learning_rate": 1e-06, "loss": 0.5026, "mean_token_accuracy": 0.8560470342636108, "num_tokens": 873096259.0, "step": 22883 }, { "epoch": 2.9110800152652336, "ewc_loss": 0.07515308260917664, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038531984318979084, "grad_norm": 8.903688430786133, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8571832180023193, "num_tokens": 873131955.0, "step": 22884 }, { "epoch": 2.9112072255438237, "ewc_loss": 0.07592602074146271, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003930492966901511, "grad_norm": 11.652190208435059, "learning_rate": 1e-06, "loss": 0.4936, "mean_token_accuracy": 0.8613675236701965, "num_tokens": 873171024.0, "step": 22885 }, { "epoch": 2.9113344358224147, "ewc_loss": 0.07632508128881454, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039703986840322614, "grad_norm": 8.849359512329102, "learning_rate": 1e-06, "loss": 0.4512, "mean_token_accuracy": 0.8699647188186646, "num_tokens": 873210568.0, "step": 22886 }, { "epoch": 2.9114616461010048, "ewc_loss": 0.07912327349185944, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004250218335073441, "grad_norm": 9.656455993652344, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8691797256469727, "num_tokens": 873250025.0, "step": 22887 }, { "epoch": 2.9115888563795957, "ewc_loss": 0.07499268651008606, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038371587288565934, "grad_norm": 8.85954761505127, "learning_rate": 1e-06, "loss": 0.4953, "mean_token_accuracy": 0.8582363128662109, "num_tokens": 873285284.0, "step": 22888 }, { "epoch": 2.911716066658186, "ewc_loss": 0.07906828820705414, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00042447197483852506, "grad_norm": 9.68091869354248, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8685712814331055, "num_tokens": 873320102.0, "step": 22889 }, { "epoch": 2.9118432769367764, "ewc_loss": 0.07551265507936478, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003889156214427203, "grad_norm": 8.951189994812012, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8701220154762268, "num_tokens": 873354668.0, "step": 22890 }, { "epoch": 2.911970487215367, "ewc_loss": 0.07790087163448334, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004127978172618896, "grad_norm": 9.53601360321045, "learning_rate": 1e-06, "loss": 0.4785, "mean_token_accuracy": 0.8627972602844238, "num_tokens": 873387415.0, "step": 22891 }, { "epoch": 2.9120976974939574, "ewc_loss": 0.07575161755084991, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003913051914423704, "grad_norm": 9.07051944732666, "learning_rate": 1e-06, "loss": 0.4128, "mean_token_accuracy": 0.88496994972229, "num_tokens": 873428679.0, "step": 22892 }, { "epoch": 2.912224907772548, "ewc_loss": 0.07690399885177612, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040282902773469687, "grad_norm": 9.256187438964844, "learning_rate": 1e-06, "loss": 0.451, "mean_token_accuracy": 0.8664647936820984, "num_tokens": 873466973.0, "step": 22893 }, { "epoch": 2.9123521180511385, "ewc_loss": 0.07573454082012177, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003911344683729112, "grad_norm": 9.05003833770752, "learning_rate": 1e-06, "loss": 0.413, "mean_token_accuracy": 0.8806062340736389, "num_tokens": 873510957.0, "step": 22894 }, { "epoch": 2.912479328329729, "ewc_loss": 0.0762338787317276, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039612787077203393, "grad_norm": 9.211663246154785, "learning_rate": 1e-06, "loss": 0.5306, "mean_token_accuracy": 0.8462610840797424, "num_tokens": 873546456.0, "step": 22895 }, { "epoch": 2.9126065386083195, "ewc_loss": 0.0756167322397232, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003899563744198531, "grad_norm": 9.007270812988281, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8510504961013794, "num_tokens": 873591972.0, "step": 22896 }, { "epoch": 2.91273374888691, "ewc_loss": 0.07650934904813766, "ewc_loss_diag": 3.719329833984375e-05, "ewc_loss_parallel": 0.00039399974048137665, "grad_norm": 9.236498832702637, "learning_rate": 1e-06, "loss": 0.4703, "mean_token_accuracy": 0.8637903928756714, "num_tokens": 873629790.0, "step": 22897 }, { "epoch": 2.9128609591655006, "ewc_loss": 0.07543092966079712, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038809835677966475, "grad_norm": 9.00687313079834, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8662928938865662, "num_tokens": 873667419.0, "step": 22898 }, { "epoch": 2.912988169444091, "ewc_loss": 0.07594802975654602, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039326940895989537, "grad_norm": 9.138240814208984, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8760219812393188, "num_tokens": 873705834.0, "step": 22899 }, { "epoch": 2.9131153797226816, "ewc_loss": 0.07535117864608765, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038730085361748934, "grad_norm": 9.019442558288574, "learning_rate": 1e-06, "loss": 0.4931, "mean_token_accuracy": 0.8582196235656738, "num_tokens": 873748859.0, "step": 22900 }, { "epoch": 2.913242590001272, "ewc_loss": 0.0757833868265152, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003916229179594666, "grad_norm": 9.087810516357422, "learning_rate": 1e-06, "loss": 0.4926, "mean_token_accuracy": 0.8564622402191162, "num_tokens": 873788068.0, "step": 22901 }, { "epoch": 2.9133698002798627, "ewc_loss": 0.07549671083688736, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038875616155564785, "grad_norm": 9.04401969909668, "learning_rate": 1e-06, "loss": 0.4372, "mean_token_accuracy": 0.87350994348526, "num_tokens": 873827820.0, "step": 22902 }, { "epoch": 2.9134970105584532, "ewc_loss": 0.07560611516237259, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003898502327501774, "grad_norm": 9.054844856262207, "learning_rate": 1e-06, "loss": 0.476, "mean_token_accuracy": 0.8632082343101501, "num_tokens": 873866271.0, "step": 22903 }, { "epoch": 2.9136242208370438, "ewc_loss": 0.07549582421779633, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000388747313991189, "grad_norm": 9.066547393798828, "learning_rate": 1e-06, "loss": 0.4828, "mean_token_accuracy": 0.8618085980415344, "num_tokens": 873907409.0, "step": 22904 }, { "epoch": 2.9137514311156343, "ewc_loss": 0.07538831233978271, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038767216028645635, "grad_norm": 8.994447708129883, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8735373020172119, "num_tokens": 873947940.0, "step": 22905 }, { "epoch": 2.913878641394225, "ewc_loss": 0.07553012669086456, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003890903026331216, "grad_norm": 9.04625415802002, "learning_rate": 1e-06, "loss": 0.5047, "mean_token_accuracy": 0.8516585826873779, "num_tokens": 873990062.0, "step": 22906 }, { "epoch": 2.9140058516728153, "ewc_loss": 0.07540962100028992, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038788525853306055, "grad_norm": 8.953028678894043, "learning_rate": 1e-06, "loss": 0.4237, "mean_token_accuracy": 0.8773745894432068, "num_tokens": 874026660.0, "step": 22907 }, { "epoch": 2.9141330619514054, "ewc_loss": 0.07568247616291046, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003906138299498707, "grad_norm": 9.123058319091797, "learning_rate": 1e-06, "loss": 0.4601, "mean_token_accuracy": 0.872567892074585, "num_tokens": 874064935.0, "step": 22908 }, { "epoch": 2.9142602722299964, "ewc_loss": 0.07523147761821747, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038610384217463434, "grad_norm": 9.010621070861816, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8657587766647339, "num_tokens": 874102807.0, "step": 22909 }, { "epoch": 2.9143874825085865, "ewc_loss": 0.07567906379699707, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003905796620529145, "grad_norm": 9.065398216247559, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8688576221466064, "num_tokens": 874139688.0, "step": 22910 }, { "epoch": 2.9145146927871775, "ewc_loss": 0.07543223351240158, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038811139529570937, "grad_norm": 9.066944122314453, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8710793256759644, "num_tokens": 874184505.0, "step": 22911 }, { "epoch": 2.9146419030657675, "ewc_loss": 0.07550568878650665, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003888459468726069, "grad_norm": 9.044899940490723, "learning_rate": 1e-06, "loss": 0.4612, "mean_token_accuracy": 0.8666241765022278, "num_tokens": 874223210.0, "step": 22912 }, { "epoch": 2.914769113344358, "ewc_loss": 0.0755271464586258, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038906055851839483, "grad_norm": 9.089104652404785, "learning_rate": 1e-06, "loss": 0.4995, "mean_token_accuracy": 0.853675365447998, "num_tokens": 874263939.0, "step": 22913 }, { "epoch": 2.9148963236229486, "ewc_loss": 0.07539273798465729, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038771648542024195, "grad_norm": 9.064663887023926, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.874525785446167, "num_tokens": 874303946.0, "step": 22914 }, { "epoch": 2.915023533901539, "ewc_loss": 0.07547951489686966, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038858421612530947, "grad_norm": 8.989416122436523, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.86687171459198, "num_tokens": 874343701.0, "step": 22915 }, { "epoch": 2.9151507441801296, "ewc_loss": 0.07569118589162827, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003907009377144277, "grad_norm": 9.154516220092773, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8677535057067871, "num_tokens": 874380425.0, "step": 22916 }, { "epoch": 2.91527795445872, "ewc_loss": 0.07525750994682312, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003863642050419003, "grad_norm": 9.018977165222168, "learning_rate": 1e-06, "loss": 0.4617, "mean_token_accuracy": 0.8671126961708069, "num_tokens": 874417643.0, "step": 22917 }, { "epoch": 2.9154051647373107, "ewc_loss": 0.07588469982147217, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039263605140149593, "grad_norm": 9.107834815979004, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8585938215255737, "num_tokens": 874463939.0, "step": 22918 }, { "epoch": 2.9155323750159012, "ewc_loss": 0.07532956451177597, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003870846994686872, "grad_norm": 9.067159652709961, "learning_rate": 1e-06, "loss": 0.4803, "mean_token_accuracy": 0.8627355098724365, "num_tokens": 874503156.0, "step": 22919 }, { "epoch": 2.9156595852944918, "ewc_loss": 0.07565722614526749, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003903613251168281, "grad_norm": 9.081921577453613, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8550857901573181, "num_tokens": 874546754.0, "step": 22920 }, { "epoch": 2.9157867955730823, "ewc_loss": 0.07548266649246216, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038861570646986365, "grad_norm": 8.960484504699707, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8695259094238281, "num_tokens": 874590781.0, "step": 22921 }, { "epoch": 2.915914005851673, "ewc_loss": 0.07584457099437714, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003922347677871585, "grad_norm": 9.203556060791016, "learning_rate": 1e-06, "loss": 0.4297, "mean_token_accuracy": 0.8768224716186523, "num_tokens": 874629592.0, "step": 22922 }, { "epoch": 2.9160412161302633, "ewc_loss": 0.07524408400058746, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003862299199681729, "grad_norm": 9.03596305847168, "learning_rate": 1e-06, "loss": 0.4331, "mean_token_accuracy": 0.8722634315490723, "num_tokens": 874667372.0, "step": 22923 }, { "epoch": 2.916168426408854, "ewc_loss": 0.075886070728302, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003926497302018106, "grad_norm": 9.104082107543945, "learning_rate": 1e-06, "loss": 0.4689, "mean_token_accuracy": 0.8624991178512573, "num_tokens": 874704687.0, "step": 22924 }, { "epoch": 2.9162956366874444, "ewc_loss": 0.0754629597067833, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003884186444338411, "grad_norm": 9.133933067321777, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8650791645050049, "num_tokens": 874740471.0, "step": 22925 }, { "epoch": 2.916422846966035, "ewc_loss": 0.07543250173330307, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003881140728481114, "grad_norm": 9.00196647644043, "learning_rate": 1e-06, "loss": 0.4414, "mean_token_accuracy": 0.8717584609985352, "num_tokens": 874780747.0, "step": 22926 }, { "epoch": 2.9165500572446255, "ewc_loss": 0.07564780116081238, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003902670578099787, "grad_norm": 9.101691246032715, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8712954521179199, "num_tokens": 874814204.0, "step": 22927 }, { "epoch": 2.916677267523216, "ewc_loss": 0.07545357942581177, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038832490099593997, "grad_norm": 9.075117111206055, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.866176187992096, "num_tokens": 874851872.0, "step": 22928 }, { "epoch": 2.9168044778018065, "ewc_loss": 0.07564299553632736, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003902190364897251, "grad_norm": 9.031973838806152, "learning_rate": 1e-06, "loss": 0.4989, "mean_token_accuracy": 0.8512841463088989, "num_tokens": 874884215.0, "step": 22929 }, { "epoch": 2.916931688080397, "ewc_loss": 0.07554997503757477, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003892887616530061, "grad_norm": 9.07558536529541, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8637810349464417, "num_tokens": 874919047.0, "step": 22930 }, { "epoch": 2.917058898358987, "ewc_loss": 0.07548899948596954, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003886790946125984, "grad_norm": 9.091341972351074, "learning_rate": 1e-06, "loss": 0.4759, "mean_token_accuracy": 0.8599230051040649, "num_tokens": 874956469.0, "step": 22931 }, { "epoch": 2.917186108637578, "ewc_loss": 0.07542754709720612, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003880644799210131, "grad_norm": 8.986937522888184, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8770630955696106, "num_tokens": 874994633.0, "step": 22932 }, { "epoch": 2.917313318916168, "ewc_loss": 0.07576169073581696, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039140600711107254, "grad_norm": 9.065033912658691, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8787679672241211, "num_tokens": 875028530.0, "step": 22933 }, { "epoch": 2.917440529194759, "ewc_loss": 0.07537730038166046, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038756203139200807, "grad_norm": 8.963227272033691, "learning_rate": 1e-06, "loss": 0.4888, "mean_token_accuracy": 0.8584847450256348, "num_tokens": 875059785.0, "step": 22934 }, { "epoch": 2.9175677394733492, "ewc_loss": 0.07582832872867584, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039207233930937946, "grad_norm": 8.998919486999512, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8640192747116089, "num_tokens": 875096057.0, "step": 22935 }, { "epoch": 2.91769494975194, "ewc_loss": 0.07565276324748993, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039031673804856837, "grad_norm": 8.92470932006836, "learning_rate": 1e-06, "loss": 0.4417, "mean_token_accuracy": 0.872917115688324, "num_tokens": 875141988.0, "step": 22936 }, { "epoch": 2.9178221600305303, "ewc_loss": 0.0759972557425499, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039376161294057965, "grad_norm": 9.065956115722656, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8711953163146973, "num_tokens": 875176973.0, "step": 22937 }, { "epoch": 2.917949370309121, "ewc_loss": 0.07561446726322174, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003899337607435882, "grad_norm": 8.931814193725586, "learning_rate": 1e-06, "loss": 0.44, "mean_token_accuracy": 0.8757452368736267, "num_tokens": 875209271.0, "step": 22938 }, { "epoch": 2.9180765805877114, "ewc_loss": 0.07606759667396545, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039446508162654936, "grad_norm": 9.120194435119629, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8713279962539673, "num_tokens": 875247460.0, "step": 22939 }, { "epoch": 2.918203790866302, "ewc_loss": 0.07538020610809326, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038759116432629526, "grad_norm": 8.873539924621582, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8642531633377075, "num_tokens": 875286013.0, "step": 22940 }, { "epoch": 2.9183310011448924, "ewc_loss": 0.07631133496761322, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000396902411011979, "grad_norm": 9.1815185546875, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8609163165092468, "num_tokens": 875324547.0, "step": 22941 }, { "epoch": 2.918458211423483, "ewc_loss": 0.07519712299108505, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038576030055992305, "grad_norm": 8.862037658691406, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8577567934989929, "num_tokens": 875366655.0, "step": 22942 }, { "epoch": 2.9185854217020735, "ewc_loss": 0.07636826485395432, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039747171103954315, "grad_norm": 9.097211837768555, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8607097864151001, "num_tokens": 875408571.0, "step": 22943 }, { "epoch": 2.918712631980664, "ewc_loss": 0.07539430260658264, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003877321141771972, "grad_norm": 8.981989860534668, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.8593416213989258, "num_tokens": 875450730.0, "step": 22944 }, { "epoch": 2.9188398422592545, "ewc_loss": 0.07610595971345901, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039484864100813866, "grad_norm": 9.104973793029785, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8688845634460449, "num_tokens": 875486850.0, "step": 22945 }, { "epoch": 2.918967052537845, "ewc_loss": 0.07558601349592209, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003896492125932127, "grad_norm": 8.883223533630371, "learning_rate": 1e-06, "loss": 0.4971, "mean_token_accuracy": 0.8595739603042603, "num_tokens": 875528653.0, "step": 22946 }, { "epoch": 2.9190942628164356, "ewc_loss": 0.07634741067886353, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003972631529904902, "grad_norm": 9.217418670654297, "learning_rate": 1e-06, "loss": 0.4365, "mean_token_accuracy": 0.8768417835235596, "num_tokens": 875564155.0, "step": 22947 }, { "epoch": 2.919221473095026, "ewc_loss": 0.0753336101770401, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038712515379302204, "grad_norm": 8.8823823928833, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8654845952987671, "num_tokens": 875598548.0, "step": 22948 }, { "epoch": 2.9193486833736166, "ewc_loss": 0.07663369178771973, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040012598037719727, "grad_norm": 9.216952323913574, "learning_rate": 1e-06, "loss": 0.4172, "mean_token_accuracy": 0.8823428153991699, "num_tokens": 875634587.0, "step": 22949 }, { "epoch": 2.919475893652207, "ewc_loss": 0.07524481415748596, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000386237254133448, "grad_norm": 8.936576843261719, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8648483753204346, "num_tokens": 875671956.0, "step": 22950 }, { "epoch": 2.9196031039307977, "ewc_loss": 0.07642051577568054, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003979942121077329, "grad_norm": 9.119818687438965, "learning_rate": 1e-06, "loss": 0.4557, "mean_token_accuracy": 0.8689572811126709, "num_tokens": 875706327.0, "step": 22951 }, { "epoch": 2.9197303142093882, "ewc_loss": 0.07539035379886627, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003876925620716065, "grad_norm": 8.953699111938477, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.8731579780578613, "num_tokens": 875749320.0, "step": 22952 }, { "epoch": 2.9198575244879788, "ewc_loss": 0.07612098008394241, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003949988749809563, "grad_norm": 9.170858383178711, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8758097290992737, "num_tokens": 875783871.0, "step": 22953 }, { "epoch": 2.9199847347665693, "ewc_loss": 0.07542264461517334, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038801555638201535, "grad_norm": 8.933362007141113, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8656877279281616, "num_tokens": 875821716.0, "step": 22954 }, { "epoch": 2.92011194504516, "ewc_loss": 0.07614769041538239, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039526596083305776, "grad_norm": 9.10934066772461, "learning_rate": 1e-06, "loss": 0.4424, "mean_token_accuracy": 0.869967520236969, "num_tokens": 875854339.0, "step": 22955 }, { "epoch": 2.92023915532375, "ewc_loss": 0.07535603642463684, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003873494570143521, "grad_norm": 8.969757080078125, "learning_rate": 1e-06, "loss": 0.4603, "mean_token_accuracy": 0.8666898608207703, "num_tokens": 875889456.0, "step": 22956 }, { "epoch": 2.920366365602341, "ewc_loss": 0.0759243294596672, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003930323582608253, "grad_norm": 9.028075218200684, "learning_rate": 1e-06, "loss": 0.4955, "mean_token_accuracy": 0.8577910661697388, "num_tokens": 875926659.0, "step": 22957 }, { "epoch": 2.920493575880931, "ewc_loss": 0.07570093125104904, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039079837733879685, "grad_norm": 8.937252044677734, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8748834133148193, "num_tokens": 875972766.0, "step": 22958 }, { "epoch": 2.920620786159522, "ewc_loss": 0.07576456665992737, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003914346743840724, "grad_norm": 9.028165817260742, "learning_rate": 1e-06, "loss": 0.4866, "mean_token_accuracy": 0.8615962266921997, "num_tokens": 876011824.0, "step": 22959 }, { "epoch": 2.920747996438112, "ewc_loss": 0.0757308304309845, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003910974191967398, "grad_norm": 9.012240409851074, "learning_rate": 1e-06, "loss": 0.5318, "mean_token_accuracy": 0.8475987315177917, "num_tokens": 876050353.0, "step": 22960 }, { "epoch": 2.920875206716703, "ewc_loss": 0.07567936182022095, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039058265974745154, "grad_norm": 8.976922035217285, "learning_rate": 1e-06, "loss": 0.508, "mean_token_accuracy": 0.8522472381591797, "num_tokens": 876089358.0, "step": 22961 }, { "epoch": 2.921002416995293, "ewc_loss": 0.07594912499189377, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003932802937924862, "grad_norm": 9.014775276184082, "learning_rate": 1e-06, "loss": 0.5097, "mean_token_accuracy": 0.8514044284820557, "num_tokens": 876126287.0, "step": 22962 }, { "epoch": 2.9211296272738836, "ewc_loss": 0.0756203755736351, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003899928124155849, "grad_norm": 9.031304359436035, "learning_rate": 1e-06, "loss": 0.4496, "mean_token_accuracy": 0.8759585618972778, "num_tokens": 876165157.0, "step": 22963 }, { "epoch": 2.921256837552474, "ewc_loss": 0.07579362392425537, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003917253343388438, "grad_norm": 9.02473258972168, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8667048215866089, "num_tokens": 876201391.0, "step": 22964 }, { "epoch": 2.9213840478310646, "ewc_loss": 0.07566076517105103, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039039674447849393, "grad_norm": 8.9441499710083, "learning_rate": 1e-06, "loss": 0.4445, "mean_token_accuracy": 0.8728758692741394, "num_tokens": 876243004.0, "step": 22965 }, { "epoch": 2.921511258109655, "ewc_loss": 0.07582525908946991, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003920416347682476, "grad_norm": 9.014476776123047, "learning_rate": 1e-06, "loss": 0.4513, "mean_token_accuracy": 0.8690779805183411, "num_tokens": 876279245.0, "step": 22966 }, { "epoch": 2.9216384683882457, "ewc_loss": 0.07564660906791687, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039025512523949146, "grad_norm": 9.064531326293945, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8650660514831543, "num_tokens": 876312145.0, "step": 22967 }, { "epoch": 2.9217656786668362, "ewc_loss": 0.07569770514965057, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003907660720869899, "grad_norm": 8.997703552246094, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.867957353591919, "num_tokens": 876351354.0, "step": 22968 }, { "epoch": 2.9218928889454268, "ewc_loss": 0.07583881914615631, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000392177258618176, "grad_norm": 9.053274154663086, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8555580973625183, "num_tokens": 876388621.0, "step": 22969 }, { "epoch": 2.9220200992240173, "ewc_loss": 0.07557937502861023, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038958285585977137, "grad_norm": 9.128424644470215, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8598189353942871, "num_tokens": 876418377.0, "step": 22970 }, { "epoch": 2.922147309502608, "ewc_loss": 0.07555577903985977, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003893468528985977, "grad_norm": 9.053041458129883, "learning_rate": 1e-06, "loss": 0.4588, "mean_token_accuracy": 0.8695387840270996, "num_tokens": 876453351.0, "step": 22971 }, { "epoch": 2.9222745197811983, "ewc_loss": 0.07568904012441635, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039067945908755064, "grad_norm": 9.137704849243164, "learning_rate": 1e-06, "loss": 0.441, "mean_token_accuracy": 0.8714752793312073, "num_tokens": 876484830.0, "step": 22972 }, { "epoch": 2.922401730059789, "ewc_loss": 0.07533276081085205, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003871166263706982, "grad_norm": 9.009194374084473, "learning_rate": 1e-06, "loss": 0.4305, "mean_token_accuracy": 0.8772155046463013, "num_tokens": 876524318.0, "step": 22973 }, { "epoch": 2.9225289403383794, "ewc_loss": 0.07561297714710236, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038991885958239436, "grad_norm": 9.069307327270508, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.864240825176239, "num_tokens": 876562421.0, "step": 22974 }, { "epoch": 2.92265615061697, "ewc_loss": 0.07526859641075134, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003864750324282795, "grad_norm": 8.894903182983398, "learning_rate": 1e-06, "loss": 0.4942, "mean_token_accuracy": 0.8606771230697632, "num_tokens": 876605681.0, "step": 22975 }, { "epoch": 2.9227833608955605, "ewc_loss": 0.07575873285531998, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039137640851549804, "grad_norm": 9.066338539123535, "learning_rate": 1e-06, "loss": 0.4975, "mean_token_accuracy": 0.8554967641830444, "num_tokens": 876647371.0, "step": 22976 }, { "epoch": 2.922910571174151, "ewc_loss": 0.07530394196510315, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003868284693453461, "grad_norm": 8.973919868469238, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8710984587669373, "num_tokens": 876685563.0, "step": 22977 }, { "epoch": 2.9230377814527415, "ewc_loss": 0.07580633461475372, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039185245987027884, "grad_norm": 9.126789093017578, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.862136960029602, "num_tokens": 876718404.0, "step": 22978 }, { "epoch": 2.923164991731332, "ewc_loss": 0.07522188127040863, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038600791594944894, "grad_norm": 9.007073402404785, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.868614912033081, "num_tokens": 876751888.0, "step": 22979 }, { "epoch": 2.9232922020099226, "ewc_loss": 0.07569004595279694, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039068952901288867, "grad_norm": 9.059637069702148, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8700824975967407, "num_tokens": 876786813.0, "step": 22980 }, { "epoch": 2.9234194122885127, "ewc_loss": 0.07536273449659348, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038741642492823303, "grad_norm": 8.990286827087402, "learning_rate": 1e-06, "loss": 0.4313, "mean_token_accuracy": 0.8762436509132385, "num_tokens": 876824511.0, "step": 22981 }, { "epoch": 2.9235466225671036, "ewc_loss": 0.07567889988422394, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039057809044606984, "grad_norm": 9.208085060119629, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8686434030532837, "num_tokens": 876856409.0, "step": 22982 }, { "epoch": 2.9236738328456937, "ewc_loss": 0.07513819634914398, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038517103530466557, "grad_norm": 8.918134689331055, "learning_rate": 1e-06, "loss": 0.5069, "mean_token_accuracy": 0.850829005241394, "num_tokens": 876889204.0, "step": 22983 }, { "epoch": 2.9238010431242847, "ewc_loss": 0.075775645673275, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003915455308742821, "grad_norm": 9.083715438842773, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.867264986038208, "num_tokens": 876927185.0, "step": 22984 }, { "epoch": 2.9239282534028748, "ewc_loss": 0.07519456744194031, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003857347765006125, "grad_norm": 9.08177375793457, "learning_rate": 1e-06, "loss": 0.4773, "mean_token_accuracy": 0.8633993864059448, "num_tokens": 876965744.0, "step": 22985 }, { "epoch": 2.9240554636814657, "ewc_loss": 0.07562318444252014, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039002089761197567, "grad_norm": 9.029102325439453, "learning_rate": 1e-06, "loss": 0.4456, "mean_token_accuracy": 0.8732171058654785, "num_tokens": 876997107.0, "step": 22986 }, { "epoch": 2.924182673960056, "ewc_loss": 0.07545217871665955, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003883108147419989, "grad_norm": 9.009212493896484, "learning_rate": 1e-06, "loss": 0.4874, "mean_token_accuracy": 0.8559629917144775, "num_tokens": 877037894.0, "step": 22987 }, { "epoch": 2.9243098842386464, "ewc_loss": 0.07549624145030975, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038875150494277477, "grad_norm": 9.10401725769043, "learning_rate": 1e-06, "loss": 0.5171, "mean_token_accuracy": 0.8536994457244873, "num_tokens": 877078724.0, "step": 22988 }, { "epoch": 2.924437094517237, "ewc_loss": 0.07533953338861465, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038718440919183195, "grad_norm": 8.897551536560059, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8673514127731323, "num_tokens": 877119685.0, "step": 22989 }, { "epoch": 2.9245643047958274, "ewc_loss": 0.07580649852752686, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003918540314771235, "grad_norm": 9.129672050476074, "learning_rate": 1e-06, "loss": 0.4551, "mean_token_accuracy": 0.8692426681518555, "num_tokens": 877157055.0, "step": 22990 }, { "epoch": 2.924691515074418, "ewc_loss": 0.07505317032337189, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038432079600170255, "grad_norm": 8.906149864196777, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8713510036468506, "num_tokens": 877188569.0, "step": 22991 }, { "epoch": 2.9248187253530085, "ewc_loss": 0.07587985694408417, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003925876517314464, "grad_norm": 9.072117805480957, "learning_rate": 1e-06, "loss": 0.4896, "mean_token_accuracy": 0.858517050743103, "num_tokens": 877222745.0, "step": 22992 }, { "epoch": 2.924945935631599, "ewc_loss": 0.07524086534976959, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003861977020278573, "grad_norm": 8.936671257019043, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8504639863967896, "num_tokens": 877262510.0, "step": 22993 }, { "epoch": 2.9250731459101895, "ewc_loss": 0.0758528783917427, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000392317830119282, "grad_norm": 9.008061408996582, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.859609842300415, "num_tokens": 877300164.0, "step": 22994 }, { "epoch": 2.92520035618878, "ewc_loss": 0.07536701112985611, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038745917845517397, "grad_norm": 8.914340019226074, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8756030201911926, "num_tokens": 877337187.0, "step": 22995 }, { "epoch": 2.9253275664673706, "ewc_loss": 0.0757884830236435, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039167393697425723, "grad_norm": 9.080316543579102, "learning_rate": 1e-06, "loss": 0.5254, "mean_token_accuracy": 0.8466677665710449, "num_tokens": 877375902.0, "step": 22996 }, { "epoch": 2.925454776745961, "ewc_loss": 0.07536567747592926, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038744587800465524, "grad_norm": 8.935379028320312, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8657684922218323, "num_tokens": 877414406.0, "step": 22997 }, { "epoch": 2.9255819870245516, "ewc_loss": 0.07580742985010147, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003918633738067001, "grad_norm": 9.006998062133789, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8718077540397644, "num_tokens": 877448977.0, "step": 22998 }, { "epoch": 2.925709197303142, "ewc_loss": 0.0754891186952591, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003886802587658167, "grad_norm": 8.975272178649902, "learning_rate": 1e-06, "loss": 0.4394, "mean_token_accuracy": 0.8738747835159302, "num_tokens": 877486766.0, "step": 22999 }, { "epoch": 2.9258364075817327, "ewc_loss": 0.07569456100463867, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039073466905392706, "grad_norm": 8.996169090270996, "learning_rate": 1e-06, "loss": 0.4218, "mean_token_accuracy": 0.8802957534790039, "num_tokens": 877519320.0, "step": 23000 }, { "epoch": 2.925963617860323, "ewc_loss": 0.07565471529960632, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003903362376149744, "grad_norm": 9.086638450622559, "learning_rate": 1e-06, "loss": 0.5462, "mean_token_accuracy": 0.8416129946708679, "num_tokens": 877557913.0, "step": 23001 }, { "epoch": 2.9260908281389137, "ewc_loss": 0.07545389235019684, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038832801510579884, "grad_norm": 8.941681861877441, "learning_rate": 1e-06, "loss": 0.4629, "mean_token_accuracy": 0.8651240468025208, "num_tokens": 877591197.0, "step": 23002 }, { "epoch": 2.9262180384175043, "ewc_loss": 0.07584745436906815, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003922636096831411, "grad_norm": 9.008232116699219, "learning_rate": 1e-06, "loss": 0.4105, "mean_token_accuracy": 0.8813116550445557, "num_tokens": 877625018.0, "step": 23003 }, { "epoch": 2.926345248696095, "ewc_loss": 0.07549219578504562, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038871102151460946, "grad_norm": 9.024909973144531, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.8703679442405701, "num_tokens": 877662433.0, "step": 23004 }, { "epoch": 2.9264724589746853, "ewc_loss": 0.0756007730960846, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003897967981174588, "grad_norm": 8.964221000671387, "learning_rate": 1e-06, "loss": 0.4391, "mean_token_accuracy": 0.8780972957611084, "num_tokens": 877703576.0, "step": 23005 }, { "epoch": 2.9265996692532754, "ewc_loss": 0.0756169855594635, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003899589355569333, "grad_norm": 9.052349090576172, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8610360622406006, "num_tokens": 877734025.0, "step": 23006 }, { "epoch": 2.9267268795318664, "ewc_loss": 0.07535241544246674, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000387313193641603, "grad_norm": 8.958970069885254, "learning_rate": 1e-06, "loss": 0.4267, "mean_token_accuracy": 0.8770773410797119, "num_tokens": 877771942.0, "step": 23007 }, { "epoch": 2.9268540898104565, "ewc_loss": 0.07557758688926697, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003895649570040405, "grad_norm": 8.956645965576172, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8689814805984497, "num_tokens": 877815662.0, "step": 23008 }, { "epoch": 2.9269813000890474, "ewc_loss": 0.07541605830192566, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038794969441369176, "grad_norm": 8.988601684570312, "learning_rate": 1e-06, "loss": 0.5146, "mean_token_accuracy": 0.8495425581932068, "num_tokens": 877855398.0, "step": 23009 }, { "epoch": 2.9271085103676375, "ewc_loss": 0.07534100115299225, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003871990193147212, "grad_norm": 8.933783531188965, "learning_rate": 1e-06, "loss": 0.4853, "mean_token_accuracy": 0.860276997089386, "num_tokens": 877898544.0, "step": 23010 }, { "epoch": 2.927235720646228, "ewc_loss": 0.07573775947093964, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003911666863132268, "grad_norm": 9.030946731567383, "learning_rate": 1e-06, "loss": 0.4776, "mean_token_accuracy": 0.8648533821105957, "num_tokens": 877937914.0, "step": 23011 }, { "epoch": 2.9273629309248186, "ewc_loss": 0.07528595626354218, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000386648578569293, "grad_norm": 8.91274642944336, "learning_rate": 1e-06, "loss": 0.5228, "mean_token_accuracy": 0.8499850630760193, "num_tokens": 877979266.0, "step": 23012 }, { "epoch": 2.927490141203409, "ewc_loss": 0.075650155544281, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039029057370498776, "grad_norm": 8.977434158325195, "learning_rate": 1e-06, "loss": 0.4585, "mean_token_accuracy": 0.8708236813545227, "num_tokens": 878019762.0, "step": 23013 }, { "epoch": 2.9276173514819996, "ewc_loss": 0.07559620589017868, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003897511342074722, "grad_norm": 8.965364456176758, "learning_rate": 1e-06, "loss": 0.4794, "mean_token_accuracy": 0.8607710599899292, "num_tokens": 878062174.0, "step": 23014 }, { "epoch": 2.92774456176059, "ewc_loss": 0.07566659152507782, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039045498124323785, "grad_norm": 8.955678939819336, "learning_rate": 1e-06, "loss": 0.483, "mean_token_accuracy": 0.859731912612915, "num_tokens": 878098197.0, "step": 23015 }, { "epoch": 2.9278717720391807, "ewc_loss": 0.0755898654460907, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038968774606473744, "grad_norm": 8.972821235656738, "learning_rate": 1e-06, "loss": 0.4471, "mean_token_accuracy": 0.872043251991272, "num_tokens": 878130478.0, "step": 23016 }, { "epoch": 2.9279989823177712, "ewc_loss": 0.07563839852809906, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000390173023333773, "grad_norm": 9.013917922973633, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.848856508731842, "num_tokens": 878175670.0, "step": 23017 }, { "epoch": 2.9281261925963618, "ewc_loss": 0.07546163350343704, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003884054021909833, "grad_norm": 8.921051025390625, "learning_rate": 1e-06, "loss": 0.5014, "mean_token_accuracy": 0.8540818691253662, "num_tokens": 878216415.0, "step": 23018 }, { "epoch": 2.9282534028749523, "ewc_loss": 0.07578781992197037, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039166727219708264, "grad_norm": 9.069068908691406, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8649744391441345, "num_tokens": 878249582.0, "step": 23019 }, { "epoch": 2.928380613153543, "ewc_loss": 0.07543835043907166, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003881726006511599, "grad_norm": 8.902722358703613, "learning_rate": 1e-06, "loss": 0.4271, "mean_token_accuracy": 0.8771190643310547, "num_tokens": 878290859.0, "step": 23020 }, { "epoch": 2.9285078234321333, "ewc_loss": 0.07590320706367493, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003928211808670312, "grad_norm": 9.00732421875, "learning_rate": 1e-06, "loss": 0.417, "mean_token_accuracy": 0.88106369972229, "num_tokens": 878322463.0, "step": 23021 }, { "epoch": 2.928635033710724, "ewc_loss": 0.07553937286138535, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003891827946063131, "grad_norm": 8.94757080078125, "learning_rate": 1e-06, "loss": 0.431, "mean_token_accuracy": 0.8760766983032227, "num_tokens": 878365754.0, "step": 23022 }, { "epoch": 2.9287622439893144, "ewc_loss": 0.07575784623622894, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003913674736395478, "grad_norm": 8.993974685668945, "learning_rate": 1e-06, "loss": 0.4247, "mean_token_accuracy": 0.8789751529693604, "num_tokens": 878410325.0, "step": 23023 }, { "epoch": 2.928889454267905, "ewc_loss": 0.07553620636463165, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038915115874260664, "grad_norm": 8.94548225402832, "learning_rate": 1e-06, "loss": 0.4441, "mean_token_accuracy": 0.871063232421875, "num_tokens": 878451604.0, "step": 23024 }, { "epoch": 2.9290166645464955, "ewc_loss": 0.0758565217256546, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039235432632267475, "grad_norm": 8.99502944946289, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8682209253311157, "num_tokens": 878485174.0, "step": 23025 }, { "epoch": 2.929143874825086, "ewc_loss": 0.07568445801734924, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039063364965841174, "grad_norm": 8.958990097045898, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8583202362060547, "num_tokens": 878525867.0, "step": 23026 }, { "epoch": 2.9292710851036765, "ewc_loss": 0.07596169412136078, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003934060223400593, "grad_norm": 8.971673011779785, "learning_rate": 1e-06, "loss": 0.5053, "mean_token_accuracy": 0.8557787537574768, "num_tokens": 878561327.0, "step": 23027 }, { "epoch": 2.929398295382267, "ewc_loss": 0.07578481733798981, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003916372370440513, "grad_norm": 8.958881378173828, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8718005418777466, "num_tokens": 878603045.0, "step": 23028 }, { "epoch": 2.929525505660857, "ewc_loss": 0.07575812190771103, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039137029671110213, "grad_norm": 8.912203788757324, "learning_rate": 1e-06, "loss": 0.4423, "mean_token_accuracy": 0.8723818063735962, "num_tokens": 878645051.0, "step": 23029 }, { "epoch": 2.929652715939448, "ewc_loss": 0.07588565349578857, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003926455683540553, "grad_norm": 8.984503746032715, "learning_rate": 1e-06, "loss": 0.5282, "mean_token_accuracy": 0.8489761352539062, "num_tokens": 878682003.0, "step": 23030 }, { "epoch": 2.929779926218038, "ewc_loss": 0.07568264752626419, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039061554707586765, "grad_norm": 8.947137832641602, "learning_rate": 1e-06, "loss": 0.4517, "mean_token_accuracy": 0.8703898787498474, "num_tokens": 878720509.0, "step": 23031 }, { "epoch": 2.929907136496629, "ewc_loss": 0.07601909339427948, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039398003718815744, "grad_norm": 8.973873138427734, "learning_rate": 1e-06, "loss": 0.4506, "mean_token_accuracy": 0.8729816675186157, "num_tokens": 878757950.0, "step": 23032 }, { "epoch": 2.9300343467752192, "ewc_loss": 0.07579876482486725, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039177676080726087, "grad_norm": 8.924419403076172, "learning_rate": 1e-06, "loss": 0.4908, "mean_token_accuracy": 0.8594014644622803, "num_tokens": 878801477.0, "step": 23033 }, { "epoch": 2.93016155705381, "ewc_loss": 0.07609102129936218, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003946992219425738, "grad_norm": 9.06141185760498, "learning_rate": 1e-06, "loss": 0.4929, "mean_token_accuracy": 0.8576114177703857, "num_tokens": 878838802.0, "step": 23034 }, { "epoch": 2.9302887673324003, "ewc_loss": 0.07560799270868301, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003898689756169915, "grad_norm": 8.912524223327637, "learning_rate": 1e-06, "loss": 0.4848, "mean_token_accuracy": 0.8601053953170776, "num_tokens": 878875566.0, "step": 23035 }, { "epoch": 2.930415977610991, "ewc_loss": 0.07629656791687012, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000396754767280072, "grad_norm": 9.073127746582031, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8587309122085571, "num_tokens": 878913463.0, "step": 23036 }, { "epoch": 2.9305431878895813, "ewc_loss": 0.07559950649738312, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038978413795121014, "grad_norm": 8.96977710723877, "learning_rate": 1e-06, "loss": 0.503, "mean_token_accuracy": 0.853506863117218, "num_tokens": 878950769.0, "step": 23037 }, { "epoch": 2.930670398168172, "ewc_loss": 0.0761120617389679, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003949097008444369, "grad_norm": 8.994717597961426, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8754709959030151, "num_tokens": 878989041.0, "step": 23038 }, { "epoch": 2.9307976084467624, "ewc_loss": 0.07578998804092407, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039168892544694245, "grad_norm": 8.958518981933594, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8744201064109802, "num_tokens": 879020195.0, "step": 23039 }, { "epoch": 2.930924818725353, "ewc_loss": 0.07602636516094208, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003940527676604688, "grad_norm": 9.12522029876709, "learning_rate": 1e-06, "loss": 0.4627, "mean_token_accuracy": 0.8640228509902954, "num_tokens": 879057818.0, "step": 23040 }, { "epoch": 2.9310520290039435, "ewc_loss": 0.07563742995262146, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003901633317582309, "grad_norm": 8.945744514465332, "learning_rate": 1e-06, "loss": 0.4756, "mean_token_accuracy": 0.8615196347236633, "num_tokens": 879094747.0, "step": 23041 }, { "epoch": 2.931179239282534, "ewc_loss": 0.07617941498756409, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003955832216888666, "grad_norm": 9.04181957244873, "learning_rate": 1e-06, "loss": 0.498, "mean_token_accuracy": 0.8574744462966919, "num_tokens": 879131127.0, "step": 23042 }, { "epoch": 2.9313064495611245, "ewc_loss": 0.07554727792739868, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038926186971366405, "grad_norm": 8.940683364868164, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.865842342376709, "num_tokens": 879168541.0, "step": 23043 }, { "epoch": 2.931433659839715, "ewc_loss": 0.07626672089099884, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003964562783949077, "grad_norm": 9.126336097717285, "learning_rate": 1e-06, "loss": 0.4841, "mean_token_accuracy": 0.861865758895874, "num_tokens": 879199409.0, "step": 23044 }, { "epoch": 2.9315608701183056, "ewc_loss": 0.07541930675506592, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000387982145184651, "grad_norm": 8.904025077819824, "learning_rate": 1e-06, "loss": 0.4223, "mean_token_accuracy": 0.8791951537132263, "num_tokens": 879236447.0, "step": 23045 }, { "epoch": 2.931688080396896, "ewc_loss": 0.07614968717098236, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003952858969569206, "grad_norm": 9.074142456054688, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8669997453689575, "num_tokens": 879275596.0, "step": 23046 }, { "epoch": 2.9318152906754866, "ewc_loss": 0.07562439143657684, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003900329757016152, "grad_norm": 9.012420654296875, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8654267191886902, "num_tokens": 879315620.0, "step": 23047 }, { "epoch": 2.931942500954077, "ewc_loss": 0.07583223283290863, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003921113384421915, "grad_norm": 8.97571086883545, "learning_rate": 1e-06, "loss": 0.4548, "mean_token_accuracy": 0.8676688075065613, "num_tokens": 879356220.0, "step": 23048 }, { "epoch": 2.9320697112326677, "ewc_loss": 0.07579726725816727, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003917617432307452, "grad_norm": 9.014500617980957, "learning_rate": 1e-06, "loss": 0.5202, "mean_token_accuracy": 0.8504048585891724, "num_tokens": 879397544.0, "step": 23049 }, { "epoch": 2.932196921511258, "ewc_loss": 0.07578929513692856, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039168199873529375, "grad_norm": 9.026262283325195, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8764458894729614, "num_tokens": 879438165.0, "step": 23050 }, { "epoch": 2.9323241317898487, "ewc_loss": 0.0757182389497757, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039097139961086214, "grad_norm": 9.045002937316895, "learning_rate": 1e-06, "loss": 0.4552, "mean_token_accuracy": 0.869527280330658, "num_tokens": 879473601.0, "step": 23051 }, { "epoch": 2.9324513420684393, "ewc_loss": 0.07564576715230942, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003902467433363199, "grad_norm": 9.005247116088867, "learning_rate": 1e-06, "loss": 0.4992, "mean_token_accuracy": 0.8593372702598572, "num_tokens": 879510998.0, "step": 23052 }, { "epoch": 2.93257855234703, "ewc_loss": 0.07582257688045502, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039201483014039695, "grad_norm": 9.014076232910156, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8698602914810181, "num_tokens": 879551469.0, "step": 23053 }, { "epoch": 2.93270576262562, "ewc_loss": 0.07556625455617905, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003894515975844115, "grad_norm": 8.995345115661621, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8585463762283325, "num_tokens": 879585620.0, "step": 23054 }, { "epoch": 2.932832972904211, "ewc_loss": 0.07579562813043594, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003917453286703676, "grad_norm": 9.02059555053711, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8757147192955017, "num_tokens": 879621749.0, "step": 23055 }, { "epoch": 2.932960183182801, "ewc_loss": 0.07560566067695618, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003898456343449652, "grad_norm": 8.96999740600586, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8686528205871582, "num_tokens": 879664529.0, "step": 23056 }, { "epoch": 2.933087393461392, "ewc_loss": 0.07567653805017471, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039055445813573897, "grad_norm": 9.091401100158691, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8708642721176147, "num_tokens": 879696293.0, "step": 23057 }, { "epoch": 2.933214603739982, "ewc_loss": 0.07543263584375381, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003881154116243124, "grad_norm": 8.934646606445312, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8712683320045471, "num_tokens": 879733562.0, "step": 23058 }, { "epoch": 2.933341814018573, "ewc_loss": 0.07593929767608643, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003931820683646947, "grad_norm": 9.047636032104492, "learning_rate": 1e-06, "loss": 0.4718, "mean_token_accuracy": 0.863056480884552, "num_tokens": 879771740.0, "step": 23059 }, { "epoch": 2.933469024297163, "ewc_loss": 0.07541271299123764, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038791619590483606, "grad_norm": 8.934439659118652, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8618847131729126, "num_tokens": 879805578.0, "step": 23060 }, { "epoch": 2.9335962345757536, "ewc_loss": 0.07606108486652374, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039439991815015674, "grad_norm": 9.042800903320312, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.864876389503479, "num_tokens": 879845175.0, "step": 23061 }, { "epoch": 2.933723444854344, "ewc_loss": 0.07550472766160965, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038883634260855615, "grad_norm": 8.902788162231445, "learning_rate": 1e-06, "loss": 0.4178, "mean_token_accuracy": 0.877342700958252, "num_tokens": 879878975.0, "step": 23062 }, { "epoch": 2.9338506551329346, "ewc_loss": 0.0760866329073906, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039465539157390594, "grad_norm": 9.01486873626709, "learning_rate": 1e-06, "loss": 0.4938, "mean_token_accuracy": 0.8585692644119263, "num_tokens": 879919373.0, "step": 23063 }, { "epoch": 2.933977865411525, "ewc_loss": 0.07558703422546387, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038965942803770304, "grad_norm": 8.940238952636719, "learning_rate": 1e-06, "loss": 0.5181, "mean_token_accuracy": 0.8463283777236938, "num_tokens": 879953821.0, "step": 23064 }, { "epoch": 2.9341050756901157, "ewc_loss": 0.07606424391269684, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003944315540138632, "grad_norm": 9.00616455078125, "learning_rate": 1e-06, "loss": 0.4913, "mean_token_accuracy": 0.8570328950881958, "num_tokens": 879993505.0, "step": 23065 }, { "epoch": 2.934232285968706, "ewc_loss": 0.07586419582366943, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039243107312358916, "grad_norm": 9.025778770446777, "learning_rate": 1e-06, "loss": 0.4463, "mean_token_accuracy": 0.869989275932312, "num_tokens": 880030124.0, "step": 23066 }, { "epoch": 2.9343594962472968, "ewc_loss": 0.07577431201934814, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039153220131993294, "grad_norm": 9.016860008239746, "learning_rate": 1e-06, "loss": 0.5359, "mean_token_accuracy": 0.8418834805488586, "num_tokens": 880059480.0, "step": 23067 }, { "epoch": 2.9344867065258873, "ewc_loss": 0.07587097585201263, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003924987686332315, "grad_norm": 9.046928405761719, "learning_rate": 1e-06, "loss": 0.4572, "mean_token_accuracy": 0.8663055896759033, "num_tokens": 880093402.0, "step": 23068 }, { "epoch": 2.934613916804478, "ewc_loss": 0.07565340399742126, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003903230535797775, "grad_norm": 8.984076499938965, "learning_rate": 1e-06, "loss": 0.4702, "mean_token_accuracy": 0.8626143932342529, "num_tokens": 880127874.0, "step": 23069 }, { "epoch": 2.9347411270830683, "ewc_loss": 0.07578613609075546, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003916504210792482, "grad_norm": 8.974686622619629, "learning_rate": 1e-06, "loss": 0.4469, "mean_token_accuracy": 0.8753345608711243, "num_tokens": 880165755.0, "step": 23070 }, { "epoch": 2.934868337361659, "ewc_loss": 0.07566076517105103, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039039671537466347, "grad_norm": 9.164118766784668, "learning_rate": 1e-06, "loss": 0.4916, "mean_token_accuracy": 0.8606041669845581, "num_tokens": 880203931.0, "step": 23071 }, { "epoch": 2.9349955476402494, "ewc_loss": 0.0752975344657898, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003867644118145108, "grad_norm": 8.894173622131348, "learning_rate": 1e-06, "loss": 0.4574, "mean_token_accuracy": 0.8683620691299438, "num_tokens": 880242844.0, "step": 23072 }, { "epoch": 2.93512275791884, "ewc_loss": 0.07595787942409515, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003933678090106696, "grad_norm": 9.044262886047363, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8499510288238525, "num_tokens": 880280068.0, "step": 23073 }, { "epoch": 2.9352499681974304, "ewc_loss": 0.07522326707839966, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038602176937274635, "grad_norm": 8.977538108825684, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.8724054098129272, "num_tokens": 880314044.0, "step": 23074 }, { "epoch": 2.935377178476021, "ewc_loss": 0.07584261149168015, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003922151809092611, "grad_norm": 9.005683898925781, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8634688258171082, "num_tokens": 880355077.0, "step": 23075 }, { "epoch": 2.9355043887546115, "ewc_loss": 0.0754387378692627, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038817647146061063, "grad_norm": 8.924731254577637, "learning_rate": 1e-06, "loss": 0.4159, "mean_token_accuracy": 0.8806566596031189, "num_tokens": 880388906.0, "step": 23076 }, { "epoch": 2.935631599033202, "ewc_loss": 0.07567888498306274, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039057794492691755, "grad_norm": 8.968239784240723, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.8772869110107422, "num_tokens": 880426834.0, "step": 23077 }, { "epoch": 2.9357588093117926, "ewc_loss": 0.07583862543106079, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003897339047398418, "grad_norm": 8.987631797790527, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.8640779852867126, "num_tokens": 880463477.0, "step": 23078 }, { "epoch": 2.9358860195903826, "ewc_loss": 0.07545699924230576, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003883590688928962, "grad_norm": 8.92736530303955, "learning_rate": 1e-06, "loss": 0.5149, "mean_token_accuracy": 0.8489273190498352, "num_tokens": 880500884.0, "step": 23079 }, { "epoch": 2.9360132298689736, "ewc_loss": 0.0757642388343811, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003914314729627222, "grad_norm": 9.042909622192383, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8611989617347717, "num_tokens": 880539724.0, "step": 23080 }, { "epoch": 2.9361404401475637, "ewc_loss": 0.07555340230464935, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003893231332767755, "grad_norm": 8.950090408325195, "learning_rate": 1e-06, "loss": 0.4786, "mean_token_accuracy": 0.8593387007713318, "num_tokens": 880577820.0, "step": 23081 }, { "epoch": 2.9362676504261547, "ewc_loss": 0.07568490505218506, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003906381025444716, "grad_norm": 8.97545051574707, "learning_rate": 1e-06, "loss": 0.4535, "mean_token_accuracy": 0.8699357509613037, "num_tokens": 880612055.0, "step": 23082 }, { "epoch": 2.9363948607047448, "ewc_loss": 0.07573139667510986, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039110303623601794, "grad_norm": 9.014898300170898, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8743903636932373, "num_tokens": 880654907.0, "step": 23083 }, { "epoch": 2.9365220709833357, "ewc_loss": 0.07552911341190338, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003890802327077836, "grad_norm": 8.92342758178711, "learning_rate": 1e-06, "loss": 0.4088, "mean_token_accuracy": 0.8808678388595581, "num_tokens": 880692418.0, "step": 23084 }, { "epoch": 2.936649281261926, "ewc_loss": 0.07593448460102081, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00039069255581125617, "grad_norm": 8.988068580627441, "learning_rate": 1e-06, "loss": 0.5055, "mean_token_accuracy": 0.8516746759414673, "num_tokens": 880727742.0, "step": 23085 }, { "epoch": 2.9367764915405163, "ewc_loss": 0.07553467154502869, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038913582102395594, "grad_norm": 8.990023612976074, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8780444860458374, "num_tokens": 880766673.0, "step": 23086 }, { "epoch": 2.936903701819107, "ewc_loss": 0.07552988082170486, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038908785791136324, "grad_norm": 8.894652366638184, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8582298755645752, "num_tokens": 880810390.0, "step": 23087 }, { "epoch": 2.9370309120976974, "ewc_loss": 0.07577534765005112, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003915425331797451, "grad_norm": 8.966302871704102, "learning_rate": 1e-06, "loss": 0.4769, "mean_token_accuracy": 0.8611120581626892, "num_tokens": 880846920.0, "step": 23088 }, { "epoch": 2.937158122376288, "ewc_loss": 0.07543925940990448, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003881816519424319, "grad_norm": 8.961714744567871, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8720625042915344, "num_tokens": 880885867.0, "step": 23089 }, { "epoch": 2.9372853326548785, "ewc_loss": 0.07567322254180908, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003905213379766792, "grad_norm": 8.936636924743652, "learning_rate": 1e-06, "loss": 0.4412, "mean_token_accuracy": 0.8770031929016113, "num_tokens": 880928408.0, "step": 23090 }, { "epoch": 2.937412542933469, "ewc_loss": 0.07568894326686859, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039067844045348465, "grad_norm": 8.935063362121582, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8649380207061768, "num_tokens": 880960681.0, "step": 23091 }, { "epoch": 2.9375397532120595, "ewc_loss": 0.07569950073957443, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003907840873580426, "grad_norm": 9.014677047729492, "learning_rate": 1e-06, "loss": 0.4922, "mean_token_accuracy": 0.8553475737571716, "num_tokens": 881000358.0, "step": 23092 }, { "epoch": 2.93766696349065, "ewc_loss": 0.07554684579372406, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003892574750352651, "grad_norm": 8.893804550170898, "learning_rate": 1e-06, "loss": 0.4637, "mean_token_accuracy": 0.867814838886261, "num_tokens": 881037765.0, "step": 23093 }, { "epoch": 2.9377941737692406, "ewc_loss": 0.07592029869556427, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039299207855947316, "grad_norm": 8.973057746887207, "learning_rate": 1e-06, "loss": 0.4266, "mean_token_accuracy": 0.8802427649497986, "num_tokens": 881076091.0, "step": 23094 }, { "epoch": 2.937921384047831, "ewc_loss": 0.07559274137020111, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003897164424415678, "grad_norm": 8.90027904510498, "learning_rate": 1e-06, "loss": 0.4742, "mean_token_accuracy": 0.8657062649726868, "num_tokens": 881112784.0, "step": 23095 }, { "epoch": 2.9380485943264216, "ewc_loss": 0.07590904086828232, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039287947583943605, "grad_norm": 9.191524505615234, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8723230361938477, "num_tokens": 881149938.0, "step": 23096 }, { "epoch": 2.938175804605012, "ewc_loss": 0.07530113309621811, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038680038414895535, "grad_norm": 8.8370943069458, "learning_rate": 1e-06, "loss": 0.4917, "mean_token_accuracy": 0.8569098114967346, "num_tokens": 881187541.0, "step": 23097 }, { "epoch": 2.9383030148836027, "ewc_loss": 0.07628227770328522, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039661183836869895, "grad_norm": 9.141715049743652, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.867761492729187, "num_tokens": 881221920.0, "step": 23098 }, { "epoch": 2.938430225162193, "ewc_loss": 0.07509550452232361, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003847440530080348, "grad_norm": 8.757309913635254, "learning_rate": 1e-06, "loss": 0.5082, "mean_token_accuracy": 0.850915253162384, "num_tokens": 881264804.0, "step": 23099 }, { "epoch": 2.9385574354407837, "ewc_loss": 0.07677610963582993, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040155017632059753, "grad_norm": 9.210782051086426, "learning_rate": 1e-06, "loss": 0.4836, "mean_token_accuracy": 0.8600698709487915, "num_tokens": 881304482.0, "step": 23100 }, { "epoch": 2.9386846457193743, "ewc_loss": 0.07500980794429779, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038388717803172767, "grad_norm": 8.80400562286377, "learning_rate": 1e-06, "loss": 0.4986, "mean_token_accuracy": 0.8566051721572876, "num_tokens": 881340822.0, "step": 23101 }, { "epoch": 2.938811855997965, "ewc_loss": 0.07676969468593597, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040148606058210135, "grad_norm": 9.17261791229248, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8629968166351318, "num_tokens": 881374352.0, "step": 23102 }, { "epoch": 2.9389390662765553, "ewc_loss": 0.07529795169830322, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038676857366226614, "grad_norm": 8.889297485351562, "learning_rate": 1e-06, "loss": 0.4793, "mean_token_accuracy": 0.8598704934120178, "num_tokens": 881413176.0, "step": 23103 }, { "epoch": 2.9390662765551454, "ewc_loss": 0.07646188884973526, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003984079521615058, "grad_norm": 9.150897026062012, "learning_rate": 1e-06, "loss": 0.4846, "mean_token_accuracy": 0.8608570098876953, "num_tokens": 881454309.0, "step": 23104 }, { "epoch": 2.9391934868337364, "ewc_loss": 0.0753646045923233, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038743510958738625, "grad_norm": 8.88127326965332, "learning_rate": 1e-06, "loss": 0.4615, "mean_token_accuracy": 0.8668364882469177, "num_tokens": 881490702.0, "step": 23105 }, { "epoch": 2.9393206971123265, "ewc_loss": 0.07642720639705658, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039806118002161384, "grad_norm": 9.136363983154297, "learning_rate": 1e-06, "loss": 0.4688, "mean_token_accuracy": 0.8723382949829102, "num_tokens": 881530394.0, "step": 23106 }, { "epoch": 2.9394479073909174, "ewc_loss": 0.07542898505926132, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038807891542091966, "grad_norm": 8.969976425170898, "learning_rate": 1e-06, "loss": 0.418, "mean_token_accuracy": 0.8787637948989868, "num_tokens": 881567286.0, "step": 23107 }, { "epoch": 2.9395751176695075, "ewc_loss": 0.07607031613588333, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003944922355003655, "grad_norm": 9.000602722167969, "learning_rate": 1e-06, "loss": 0.5012, "mean_token_accuracy": 0.8620153665542603, "num_tokens": 881607017.0, "step": 23108 }, { "epoch": 2.939702327948098, "ewc_loss": 0.07568087428808212, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039059779373928905, "grad_norm": 9.042865753173828, "learning_rate": 1e-06, "loss": 0.4577, "mean_token_accuracy": 0.8660073280334473, "num_tokens": 881645793.0, "step": 23109 }, { "epoch": 2.9398295382266886, "ewc_loss": 0.07574683427810669, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003912574320565909, "grad_norm": 8.988259315490723, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8686237335205078, "num_tokens": 881683596.0, "step": 23110 }, { "epoch": 2.939956748505279, "ewc_loss": 0.07586236298084259, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039241270860657096, "grad_norm": 9.032780647277832, "learning_rate": 1e-06, "loss": 0.5191, "mean_token_accuracy": 0.8489105701446533, "num_tokens": 881717607.0, "step": 23111 }, { "epoch": 2.9400839587838696, "ewc_loss": 0.07561469823122025, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038993603084236383, "grad_norm": 9.028524398803711, "learning_rate": 1e-06, "loss": 0.4216, "mean_token_accuracy": 0.8791666030883789, "num_tokens": 881750884.0, "step": 23112 }, { "epoch": 2.94021116906246, "ewc_loss": 0.07562599331140518, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003900490119121969, "grad_norm": 8.978818893432617, "learning_rate": 1e-06, "loss": 0.4837, "mean_token_accuracy": 0.8620871305465698, "num_tokens": 881789345.0, "step": 23113 }, { "epoch": 2.9403383793410507, "ewc_loss": 0.07573646306991577, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003911536477971822, "grad_norm": 8.97885513305664, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8603163957595825, "num_tokens": 881828459.0, "step": 23114 }, { "epoch": 2.940465589619641, "ewc_loss": 0.07564657926559448, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039025480509735644, "grad_norm": 9.018333435058594, "learning_rate": 1e-06, "loss": 0.4928, "mean_token_accuracy": 0.8590658903121948, "num_tokens": 881865294.0, "step": 23115 }, { "epoch": 2.9405927998982317, "ewc_loss": 0.07557695358991623, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003895586123690009, "grad_norm": 8.952611923217773, "learning_rate": 1e-06, "loss": 0.4724, "mean_token_accuracy": 0.8684390783309937, "num_tokens": 881904936.0, "step": 23116 }, { "epoch": 2.9407200101768223, "ewc_loss": 0.07575735449790955, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039136261329986155, "grad_norm": 8.990504264831543, "learning_rate": 1e-06, "loss": 0.4311, "mean_token_accuracy": 0.8769761919975281, "num_tokens": 881936300.0, "step": 23117 }, { "epoch": 2.940847220455413, "ewc_loss": 0.07555519044399261, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038934091571718454, "grad_norm": 8.95289421081543, "learning_rate": 1e-06, "loss": 0.4339, "mean_token_accuracy": 0.8764459490776062, "num_tokens": 881974092.0, "step": 23118 }, { "epoch": 2.9409744307340033, "ewc_loss": 0.07578077167272568, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039159678271971643, "grad_norm": 8.94249153137207, "learning_rate": 1e-06, "loss": 0.4657, "mean_token_accuracy": 0.8667584657669067, "num_tokens": 882019339.0, "step": 23119 }, { "epoch": 2.941101641012594, "ewc_loss": 0.07562167942523956, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039000585093162954, "grad_norm": 8.922197341918945, "learning_rate": 1e-06, "loss": 0.4569, "mean_token_accuracy": 0.8684651255607605, "num_tokens": 882052837.0, "step": 23120 }, { "epoch": 2.9412288512911844, "ewc_loss": 0.075995072722435, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003937397850677371, "grad_norm": 9.024662971496582, "learning_rate": 1e-06, "loss": 0.4436, "mean_token_accuracy": 0.8705652952194214, "num_tokens": 882088449.0, "step": 23121 }, { "epoch": 2.941356061569775, "ewc_loss": 0.07563025504350662, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039009161991998553, "grad_norm": 8.976630210876465, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8580185770988464, "num_tokens": 882124844.0, "step": 23122 }, { "epoch": 2.9414832718483654, "ewc_loss": 0.07591760158538818, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003929650702048093, "grad_norm": 8.974376678466797, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.8699759244918823, "num_tokens": 882162246.0, "step": 23123 }, { "epoch": 2.941610482126956, "ewc_loss": 0.07573507726192474, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039113988168537617, "grad_norm": 8.934141159057617, "learning_rate": 1e-06, "loss": 0.5458, "mean_token_accuracy": 0.8551324009895325, "num_tokens": 882202825.0, "step": 23124 }, { "epoch": 2.9417376924055465, "ewc_loss": 0.07602744549512863, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003940635360777378, "grad_norm": 8.997825622558594, "learning_rate": 1e-06, "loss": 0.5046, "mean_token_accuracy": 0.851870059967041, "num_tokens": 882237951.0, "step": 23125 }, { "epoch": 2.941864902684137, "ewc_loss": 0.0758037269115448, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039182629552669823, "grad_norm": 8.945878028869629, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8634234070777893, "num_tokens": 882279147.0, "step": 23126 }, { "epoch": 2.941992112962727, "ewc_loss": 0.07597997784614563, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000393588823499158, "grad_norm": 8.919206619262695, "learning_rate": 1e-06, "loss": 0.3985, "mean_token_accuracy": 0.8879804611206055, "num_tokens": 882322777.0, "step": 23127 }, { "epoch": 2.942119323241318, "ewc_loss": 0.07590650022029877, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003928540681954473, "grad_norm": 8.951322555541992, "learning_rate": 1e-06, "loss": 0.4482, "mean_token_accuracy": 0.8707047700881958, "num_tokens": 882360681.0, "step": 23128 }, { "epoch": 2.942246533519908, "ewc_loss": 0.07598865032196045, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003936755529139191, "grad_norm": 8.99126148223877, "learning_rate": 1e-06, "loss": 0.5256, "mean_token_accuracy": 0.844624400138855, "num_tokens": 882403823.0, "step": 23129 }, { "epoch": 2.942373743798499, "ewc_loss": 0.07602233439683914, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039401240064762533, "grad_norm": 8.959795951843262, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8740241527557373, "num_tokens": 882443723.0, "step": 23130 }, { "epoch": 2.9425009540770892, "ewc_loss": 0.07597695291042328, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000393558555515483, "grad_norm": 9.067615509033203, "learning_rate": 1e-06, "loss": 0.521, "mean_token_accuracy": 0.8503072261810303, "num_tokens": 882480610.0, "step": 23131 }, { "epoch": 2.94262816435568, "ewc_loss": 0.07584850490093231, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000392274116165936, "grad_norm": 8.9658842086792, "learning_rate": 1e-06, "loss": 0.4607, "mean_token_accuracy": 0.8676630258560181, "num_tokens": 882518789.0, "step": 23132 }, { "epoch": 2.9427553746342703, "ewc_loss": 0.07604707777500153, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003942598414141685, "grad_norm": 9.12639331817627, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8529883623123169, "num_tokens": 882556598.0, "step": 23133 }, { "epoch": 2.942882584912861, "ewc_loss": 0.07549247145652771, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003887137572746724, "grad_norm": 8.903871536254883, "learning_rate": 1e-06, "loss": 0.4827, "mean_token_accuracy": 0.8604413270950317, "num_tokens": 882593788.0, "step": 23134 }, { "epoch": 2.9430097951914513, "ewc_loss": 0.07625791430473328, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003963682393077761, "grad_norm": 9.075069427490234, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8718942403793335, "num_tokens": 882630861.0, "step": 23135 }, { "epoch": 2.943137005470042, "ewc_loss": 0.07534410059452057, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000387230102205649, "grad_norm": 8.902270317077637, "learning_rate": 1e-06, "loss": 0.4507, "mean_token_accuracy": 0.8722463846206665, "num_tokens": 882670950.0, "step": 23136 }, { "epoch": 2.9432642157486324, "ewc_loss": 0.07629600167274475, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039674912113696337, "grad_norm": 9.146458625793457, "learning_rate": 1e-06, "loss": 0.502, "mean_token_accuracy": 0.8556833267211914, "num_tokens": 882705680.0, "step": 23137 }, { "epoch": 2.943391426027223, "ewc_loss": 0.0752507746219635, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000386296771466732, "grad_norm": 8.938749313354492, "learning_rate": 1e-06, "loss": 0.4393, "mean_token_accuracy": 0.875501275062561, "num_tokens": 882737589.0, "step": 23138 }, { "epoch": 2.9435186363058135, "ewc_loss": 0.0762697160243988, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039648619713261724, "grad_norm": 9.111804008483887, "learning_rate": 1e-06, "loss": 0.5161, "mean_token_accuracy": 0.8503177165985107, "num_tokens": 882774502.0, "step": 23139 }, { "epoch": 2.943645846584404, "ewc_loss": 0.07531541585922241, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038694325485266745, "grad_norm": 8.91964340209961, "learning_rate": 1e-06, "loss": 0.4951, "mean_token_accuracy": 0.855373740196228, "num_tokens": 882815184.0, "step": 23140 }, { "epoch": 2.9437730568629945, "ewc_loss": 0.07614988088607788, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039528781780973077, "grad_norm": 9.049479484558105, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8766153454780579, "num_tokens": 882856793.0, "step": 23141 }, { "epoch": 2.943900267141585, "ewc_loss": 0.0754677951335907, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003884670150000602, "grad_norm": 8.934969902038574, "learning_rate": 1e-06, "loss": 0.4359, "mean_token_accuracy": 0.876154899597168, "num_tokens": 882896128.0, "step": 23142 }, { "epoch": 2.9440274774201756, "ewc_loss": 0.07598727941513062, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003936618159059435, "grad_norm": 9.07090950012207, "learning_rate": 1e-06, "loss": 0.4508, "mean_token_accuracy": 0.8730217218399048, "num_tokens": 882931253.0, "step": 23143 }, { "epoch": 2.944154687698766, "ewc_loss": 0.07555995136499405, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003893885586876422, "grad_norm": 8.950758934020996, "learning_rate": 1e-06, "loss": 0.4111, "mean_token_accuracy": 0.8826768398284912, "num_tokens": 882975072.0, "step": 23144 }, { "epoch": 2.9442818979773566, "ewc_loss": 0.07588638365268707, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003926529025193304, "grad_norm": 9.061854362487793, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8789303302764893, "num_tokens": 883013576.0, "step": 23145 }, { "epoch": 2.944409108255947, "ewc_loss": 0.07549378275871277, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003887269413098693, "grad_norm": 9.010974884033203, "learning_rate": 1e-06, "loss": 0.4543, "mean_token_accuracy": 0.8670690059661865, "num_tokens": 883046052.0, "step": 23146 }, { "epoch": 2.9445363185345377, "ewc_loss": 0.07574748992919922, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039126392221078277, "grad_norm": 9.017030715942383, "learning_rate": 1e-06, "loss": 0.5311, "mean_token_accuracy": 0.8453824520111084, "num_tokens": 883080398.0, "step": 23147 }, { "epoch": 2.944663528813128, "ewc_loss": 0.07563238590955734, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039011292392387986, "grad_norm": 8.966146469116211, "learning_rate": 1e-06, "loss": 0.492, "mean_token_accuracy": 0.8571010828018188, "num_tokens": 883120321.0, "step": 23148 }, { "epoch": 2.9447907390917187, "ewc_loss": 0.07581386715173721, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003919277514796704, "grad_norm": 9.047942161560059, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8730272054672241, "num_tokens": 883161285.0, "step": 23149 }, { "epoch": 2.9449179493703093, "ewc_loss": 0.07628776133060455, "ewc_loss_diag": 3.743171691894531e-05, "ewc_loss_parallel": 0.00038934245822019875, "grad_norm": 9.113637924194336, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8651582598686218, "num_tokens": 883199857.0, "step": 23150 }, { "epoch": 2.9450451596489, "ewc_loss": 0.07554006576538086, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003891897213179618, "grad_norm": 8.964394569396973, "learning_rate": 1e-06, "loss": 0.4492, "mean_token_accuracy": 0.8728710412979126, "num_tokens": 883238325.0, "step": 23151 }, { "epoch": 2.94517236992749, "ewc_loss": 0.07589837163686752, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039277278119698167, "grad_norm": 9.025680541992188, "learning_rate": 1e-06, "loss": 0.4894, "mean_token_accuracy": 0.8564907312393188, "num_tokens": 883281814.0, "step": 23152 }, { "epoch": 2.945299580206081, "ewc_loss": 0.0754900574684143, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038868963019922376, "grad_norm": 8.950098037719727, "learning_rate": 1e-06, "loss": 0.4462, "mean_token_accuracy": 0.8695544004440308, "num_tokens": 883317385.0, "step": 23153 }, { "epoch": 2.945426790484671, "ewc_loss": 0.0760565996170044, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000394355010939762, "grad_norm": 9.126469612121582, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.873795747756958, "num_tokens": 883353498.0, "step": 23154 }, { "epoch": 2.945554000763262, "ewc_loss": 0.07535216212272644, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003873106907121837, "grad_norm": 8.932661056518555, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.8734524846076965, "num_tokens": 883391709.0, "step": 23155 }, { "epoch": 2.945681211041852, "ewc_loss": 0.07601138949394226, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000393902970245108, "grad_norm": 9.087274551391602, "learning_rate": 1e-06, "loss": 0.5189, "mean_token_accuracy": 0.8502707481384277, "num_tokens": 883426329.0, "step": 23156 }, { "epoch": 2.945808421320443, "ewc_loss": 0.07540056109428406, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038779465830884874, "grad_norm": 8.944992065429688, "learning_rate": 1e-06, "loss": 0.4706, "mean_token_accuracy": 0.8639808297157288, "num_tokens": 883465797.0, "step": 23157 }, { "epoch": 2.945935631599033, "ewc_loss": 0.07599496096372604, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039373867912217975, "grad_norm": 8.981550216674805, "learning_rate": 1e-06, "loss": 0.4599, "mean_token_accuracy": 0.8681288957595825, "num_tokens": 883501632.0, "step": 23158 }, { "epoch": 2.9460628418776236, "ewc_loss": 0.07556519657373428, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003894410328939557, "grad_norm": 8.936009407043457, "learning_rate": 1e-06, "loss": 0.4346, "mean_token_accuracy": 0.8760714530944824, "num_tokens": 883537622.0, "step": 23159 }, { "epoch": 2.946190052156214, "ewc_loss": 0.07588186115026474, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003926076751668006, "grad_norm": 9.028586387634277, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8665240406990051, "num_tokens": 883576458.0, "step": 23160 }, { "epoch": 2.9463172624348046, "ewc_loss": 0.07565823942422867, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039037145324982703, "grad_norm": 8.928053855895996, "learning_rate": 1e-06, "loss": 0.4845, "mean_token_accuracy": 0.8560565710067749, "num_tokens": 883620929.0, "step": 23161 }, { "epoch": 2.946444472713395, "ewc_loss": 0.07592149078845978, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039300398202612996, "grad_norm": 9.002945899963379, "learning_rate": 1e-06, "loss": 0.5486, "mean_token_accuracy": 0.841504693031311, "num_tokens": 883652589.0, "step": 23162 }, { "epoch": 2.9465716829919857, "ewc_loss": 0.0757235586643219, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039102460141293705, "grad_norm": 8.928359985351562, "learning_rate": 1e-06, "loss": 0.4655, "mean_token_accuracy": 0.8667690753936768, "num_tokens": 883691844.0, "step": 23163 }, { "epoch": 2.946698893270576, "ewc_loss": 0.07606473565101624, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003944364725612104, "grad_norm": 9.022173881530762, "learning_rate": 1e-06, "loss": 0.5322, "mean_token_accuracy": 0.8456178903579712, "num_tokens": 883732838.0, "step": 23164 }, { "epoch": 2.9468261035491667, "ewc_loss": 0.07570238411426544, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003908129292540252, "grad_norm": 8.934310913085938, "learning_rate": 1e-06, "loss": 0.4639, "mean_token_accuracy": 0.8649716377258301, "num_tokens": 883771415.0, "step": 23165 }, { "epoch": 2.9469533138277573, "ewc_loss": 0.0759589672088623, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039337872294709086, "grad_norm": 9.063346862792969, "learning_rate": 1e-06, "loss": 0.5167, "mean_token_accuracy": 0.8511000275611877, "num_tokens": 883808807.0, "step": 23166 }, { "epoch": 2.947080524106348, "ewc_loss": 0.07568417489528656, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039063082658685744, "grad_norm": 8.934521675109863, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8800166249275208, "num_tokens": 883844300.0, "step": 23167 }, { "epoch": 2.9472077343849383, "ewc_loss": 0.07610687613487244, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039485777961090207, "grad_norm": 9.061792373657227, "learning_rate": 1e-06, "loss": 0.4211, "mean_token_accuracy": 0.8806707859039307, "num_tokens": 883879337.0, "step": 23168 }, { "epoch": 2.947334944663529, "ewc_loss": 0.07558300346136093, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038961909012869, "grad_norm": 8.904424667358398, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8557296991348267, "num_tokens": 883925810.0, "step": 23169 }, { "epoch": 2.9474621549421194, "ewc_loss": 0.07610221952199936, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039481124258600175, "grad_norm": 9.048237800598145, "learning_rate": 1e-06, "loss": 0.4475, "mean_token_accuracy": 0.8747572302818298, "num_tokens": 883964753.0, "step": 23170 }, { "epoch": 2.94758936522071, "ewc_loss": 0.0755467414855957, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038925642729736865, "grad_norm": 8.937830924987793, "learning_rate": 1e-06, "loss": 0.4993, "mean_token_accuracy": 0.857062578201294, "num_tokens": 884005737.0, "step": 23171 }, { "epoch": 2.9477165754993004, "ewc_loss": 0.07598242908716202, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039361335802823305, "grad_norm": 9.029532432556152, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8658000230789185, "num_tokens": 884043414.0, "step": 23172 }, { "epoch": 2.947843785777891, "ewc_loss": 0.0756722167134285, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003905112389475107, "grad_norm": 8.974530220031738, "learning_rate": 1e-06, "loss": 0.4509, "mean_token_accuracy": 0.870587170124054, "num_tokens": 884077399.0, "step": 23173 }, { "epoch": 2.9479709960564815, "ewc_loss": 0.0758829265832901, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039261835627257824, "grad_norm": 8.989280700683594, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.864315390586853, "num_tokens": 884116539.0, "step": 23174 }, { "epoch": 2.948098206335072, "ewc_loss": 0.07579243183135986, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003917133726645261, "grad_norm": 9.042959213256836, "learning_rate": 1e-06, "loss": 0.445, "mean_token_accuracy": 0.8722835183143616, "num_tokens": 884149756.0, "step": 23175 }, { "epoch": 2.9482254166136626, "ewc_loss": 0.07572398334741592, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003910288796760142, "grad_norm": 9.014878273010254, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8852390646934509, "num_tokens": 884181960.0, "step": 23176 }, { "epoch": 2.9483526268922526, "ewc_loss": 0.07563535869121552, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003901426971424371, "grad_norm": 8.971901893615723, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8610626459121704, "num_tokens": 884223562.0, "step": 23177 }, { "epoch": 2.9484798371708436, "ewc_loss": 0.075694739818573, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003907364443875849, "grad_norm": 8.990827560424805, "learning_rate": 1e-06, "loss": 0.4413, "mean_token_accuracy": 0.8706183433532715, "num_tokens": 884264541.0, "step": 23178 }, { "epoch": 2.9486070474494337, "ewc_loss": 0.07567259669303894, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039051499334163964, "grad_norm": 8.973456382751465, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8522775769233704, "num_tokens": 884301171.0, "step": 23179 }, { "epoch": 2.9487342577280247, "ewc_loss": 0.07578014582395554, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039159052539616823, "grad_norm": 9.009841918945312, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8647173047065735, "num_tokens": 884332481.0, "step": 23180 }, { "epoch": 2.9488614680066147, "ewc_loss": 0.07563143968582153, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003901034069713205, "grad_norm": 8.943146705627441, "learning_rate": 1e-06, "loss": 0.4026, "mean_token_accuracy": 0.8865889310836792, "num_tokens": 884371616.0, "step": 23181 }, { "epoch": 2.9489886782852053, "ewc_loss": 0.075999915599823, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003937882138416171, "grad_norm": 9.022144317626953, "learning_rate": 1e-06, "loss": 0.4991, "mean_token_accuracy": 0.854670524597168, "num_tokens": 884411245.0, "step": 23182 }, { "epoch": 2.949115888563796, "ewc_loss": 0.07569808512926102, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039076991379261017, "grad_norm": 8.985241889953613, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8623239994049072, "num_tokens": 884445712.0, "step": 23183 }, { "epoch": 2.9492430988423863, "ewc_loss": 0.07590938359498978, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003928828809875995, "grad_norm": 9.005266189575195, "learning_rate": 1e-06, "loss": 0.4497, "mean_token_accuracy": 0.8702535033226013, "num_tokens": 884484935.0, "step": 23184 }, { "epoch": 2.949370309120977, "ewc_loss": 0.07587793469429016, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039256844320334494, "grad_norm": 9.001057624816895, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.866483747959137, "num_tokens": 884518714.0, "step": 23185 }, { "epoch": 2.9494975193995674, "ewc_loss": 0.07591603696346283, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003929494705516845, "grad_norm": 9.025321960449219, "learning_rate": 1e-06, "loss": 0.495, "mean_token_accuracy": 0.858637809753418, "num_tokens": 884560430.0, "step": 23186 }, { "epoch": 2.949624729678158, "ewc_loss": 0.07570694386959076, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039085850585252047, "grad_norm": 8.933049201965332, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8577487468719482, "num_tokens": 884595937.0, "step": 23187 }, { "epoch": 2.9497519399567484, "ewc_loss": 0.07605703175067902, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039435940561816096, "grad_norm": 9.060696601867676, "learning_rate": 1e-06, "loss": 0.4857, "mean_token_accuracy": 0.8604714274406433, "num_tokens": 884633201.0, "step": 23188 }, { "epoch": 2.949879150235339, "ewc_loss": 0.07558972388505936, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003896862908732146, "grad_norm": 9.000641822814941, "learning_rate": 1e-06, "loss": 0.4566, "mean_token_accuracy": 0.8684629797935486, "num_tokens": 884672602.0, "step": 23189 }, { "epoch": 2.9500063605139295, "ewc_loss": 0.07596717029809952, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003934607666451484, "grad_norm": 9.019566535949707, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.8681623339653015, "num_tokens": 884708378.0, "step": 23190 }, { "epoch": 2.95013357079252, "ewc_loss": 0.07554872334003448, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038927633431740105, "grad_norm": 8.954448699951172, "learning_rate": 1e-06, "loss": 0.4719, "mean_token_accuracy": 0.8621529340744019, "num_tokens": 884747527.0, "step": 23191 }, { "epoch": 2.9502607810711106, "ewc_loss": 0.07584726810455322, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039226168883033097, "grad_norm": 8.985737800598145, "learning_rate": 1e-06, "loss": 0.4052, "mean_token_accuracy": 0.8847981691360474, "num_tokens": 884781738.0, "step": 23192 }, { "epoch": 2.950387991349701, "ewc_loss": 0.07575720548629761, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039136112900450826, "grad_norm": 8.958209037780762, "learning_rate": 1e-06, "loss": 0.5028, "mean_token_accuracy": 0.8525516390800476, "num_tokens": 884818884.0, "step": 23193 }, { "epoch": 2.9505152016282916, "ewc_loss": 0.07578927278518677, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003916817659046501, "grad_norm": 9.058465003967285, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8749618530273438, "num_tokens": 884851751.0, "step": 23194 }, { "epoch": 2.950642411906882, "ewc_loss": 0.07559539377689362, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003897429851349443, "grad_norm": 8.900519371032715, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8690803050994873, "num_tokens": 884892935.0, "step": 23195 }, { "epoch": 2.9507696221854727, "ewc_loss": 0.07589542865753174, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039274338632822037, "grad_norm": 8.980935096740723, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8710154294967651, "num_tokens": 884935820.0, "step": 23196 }, { "epoch": 2.950896832464063, "ewc_loss": 0.07570438086986542, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003908328653778881, "grad_norm": 9.01153564453125, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8641587495803833, "num_tokens": 884976042.0, "step": 23197 }, { "epoch": 2.9510240427426537, "ewc_loss": 0.07584099471569061, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003921990573871881, "grad_norm": 9.054312705993652, "learning_rate": 1e-06, "loss": 0.4033, "mean_token_accuracy": 0.8842240571975708, "num_tokens": 885013511.0, "step": 23198 }, { "epoch": 2.9511512530212443, "ewc_loss": 0.07552048563957214, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003889939689543098, "grad_norm": 9.04937744140625, "learning_rate": 1e-06, "loss": 0.5033, "mean_token_accuracy": 0.8527518510818481, "num_tokens": 885047291.0, "step": 23199 }, { "epoch": 2.951278463299835, "ewc_loss": 0.0755307674407959, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003890967054758221, "grad_norm": 8.984299659729004, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8616939783096313, "num_tokens": 885083116.0, "step": 23200 }, { "epoch": 2.9514056735784253, "ewc_loss": 0.07571794092655182, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000390968460123986, "grad_norm": 9.070361137390137, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8626586198806763, "num_tokens": 885123282.0, "step": 23201 }, { "epoch": 2.9515328838570154, "ewc_loss": 0.07533933222293854, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003871823428198695, "grad_norm": 8.924020767211914, "learning_rate": 1e-06, "loss": 0.4503, "mean_token_accuracy": 0.8695580363273621, "num_tokens": 885166277.0, "step": 23202 }, { "epoch": 2.9516600941356064, "ewc_loss": 0.07583381235599518, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039212717092595994, "grad_norm": 9.024824142456055, "learning_rate": 1e-06, "loss": 0.4228, "mean_token_accuracy": 0.8771359920501709, "num_tokens": 885206414.0, "step": 23203 }, { "epoch": 2.9517873044141965, "ewc_loss": 0.07550524920225143, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003888415521942079, "grad_norm": 9.043023109436035, "learning_rate": 1e-06, "loss": 0.4499, "mean_token_accuracy": 0.8740054368972778, "num_tokens": 885240977.0, "step": 23204 }, { "epoch": 2.9519145146927874, "ewc_loss": 0.07557034492492676, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038949251757003367, "grad_norm": 8.939201354980469, "learning_rate": 1e-06, "loss": 0.4147, "mean_token_accuracy": 0.8827316164970398, "num_tokens": 885283190.0, "step": 23205 }, { "epoch": 2.9520417249713775, "ewc_loss": 0.07580000162124634, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000391789129935205, "grad_norm": 9.029014587402344, "learning_rate": 1e-06, "loss": 0.4249, "mean_token_accuracy": 0.8798229694366455, "num_tokens": 885321340.0, "step": 23206 }, { "epoch": 2.952168935249968, "ewc_loss": 0.0754690170288086, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003884792677126825, "grad_norm": 8.931822776794434, "learning_rate": 1e-06, "loss": 0.4371, "mean_token_accuracy": 0.8751777410507202, "num_tokens": 885357524.0, "step": 23207 }, { "epoch": 2.9522961455285586, "ewc_loss": 0.0757618322968483, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039140740409493446, "grad_norm": 9.048041343688965, "learning_rate": 1e-06, "loss": 0.5361, "mean_token_accuracy": 0.841384768486023, "num_tokens": 885389696.0, "step": 23208 }, { "epoch": 2.952423355807149, "ewc_loss": 0.07542521506547928, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038804119685664773, "grad_norm": 8.91018295288086, "learning_rate": 1e-06, "loss": 0.4152, "mean_token_accuracy": 0.8817397952079773, "num_tokens": 885429574.0, "step": 23209 }, { "epoch": 2.9525505660857396, "ewc_loss": 0.07604975998401642, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003942866751458496, "grad_norm": 9.164911270141602, "learning_rate": 1e-06, "loss": 0.4518, "mean_token_accuracy": 0.8693157434463501, "num_tokens": 885459961.0, "step": 23210 }, { "epoch": 2.95267777636433, "ewc_loss": 0.07527339458465576, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003865229955408722, "grad_norm": 8.883784294128418, "learning_rate": 1e-06, "loss": 0.4886, "mean_token_accuracy": 0.8595174551010132, "num_tokens": 885506725.0, "step": 23211 }, { "epoch": 2.9528049866429207, "ewc_loss": 0.07598608732223511, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003936499124392867, "grad_norm": 9.013832092285156, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8701344132423401, "num_tokens": 885549409.0, "step": 23212 }, { "epoch": 2.952932196921511, "ewc_loss": 0.0752987265586853, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003867763443849981, "grad_norm": 8.98189926147461, "learning_rate": 1e-06, "loss": 0.4521, "mean_token_accuracy": 0.8728584051132202, "num_tokens": 885589272.0, "step": 23213 }, { "epoch": 2.9530594072001017, "ewc_loss": 0.07593460381031036, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000393135123886168, "grad_norm": 9.107417106628418, "learning_rate": 1e-06, "loss": 0.5162, "mean_token_accuracy": 0.8494915962219238, "num_tokens": 885625951.0, "step": 23214 }, { "epoch": 2.9531866174786923, "ewc_loss": 0.0754479169845581, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038826826494187117, "grad_norm": 8.898942947387695, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8695615530014038, "num_tokens": 885665844.0, "step": 23215 }, { "epoch": 2.953313827757283, "ewc_loss": 0.07606328278779984, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039442189154215157, "grad_norm": 9.103556632995605, "learning_rate": 1e-06, "loss": 0.4671, "mean_token_accuracy": 0.8648161292076111, "num_tokens": 885705616.0, "step": 23216 }, { "epoch": 2.9534410380358733, "ewc_loss": 0.07545216381549835, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003883106983266771, "grad_norm": 8.985515594482422, "learning_rate": 1e-06, "loss": 0.4378, "mean_token_accuracy": 0.8780285120010376, "num_tokens": 885752362.0, "step": 23217 }, { "epoch": 2.953568248314464, "ewc_loss": 0.0759836882352829, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039362593088299036, "grad_norm": 9.047584533691406, "learning_rate": 1e-06, "loss": 0.4856, "mean_token_accuracy": 0.8614637851715088, "num_tokens": 885792485.0, "step": 23218 }, { "epoch": 2.9536954585930544, "ewc_loss": 0.07557672262191772, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038955631316639483, "grad_norm": 9.022333145141602, "learning_rate": 1e-06, "loss": 0.4562, "mean_token_accuracy": 0.867702841758728, "num_tokens": 885834809.0, "step": 23219 }, { "epoch": 2.953822668871645, "ewc_loss": 0.07574815303087234, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039127058698795736, "grad_norm": 9.025832176208496, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.8706261515617371, "num_tokens": 885870555.0, "step": 23220 }, { "epoch": 2.9539498791502354, "ewc_loss": 0.07565127313137054, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039030180778354406, "grad_norm": 8.967668533325195, "learning_rate": 1e-06, "loss": 0.4556, "mean_token_accuracy": 0.8686181902885437, "num_tokens": 885911911.0, "step": 23221 }, { "epoch": 2.954077089428826, "ewc_loss": 0.07588540017604828, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000392643065424636, "grad_norm": 9.134902954101562, "learning_rate": 1e-06, "loss": 0.4236, "mean_token_accuracy": 0.878546416759491, "num_tokens": 885941094.0, "step": 23222 }, { "epoch": 2.9542042997074165, "ewc_loss": 0.07545165717601776, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003883056342601776, "grad_norm": 8.957226753234863, "learning_rate": 1e-06, "loss": 0.4763, "mean_token_accuracy": 0.86534583568573, "num_tokens": 885978094.0, "step": 23223 }, { "epoch": 2.954331509986007, "ewc_loss": 0.07595061510801315, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039329519495368004, "grad_norm": 9.036359786987305, "learning_rate": 1e-06, "loss": 0.4181, "mean_token_accuracy": 0.8837921023368835, "num_tokens": 886014859.0, "step": 23224 }, { "epoch": 2.954458720264597, "ewc_loss": 0.07554236054420471, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003892126842401922, "grad_norm": 8.994624137878418, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.8740540742874146, "num_tokens": 886048317.0, "step": 23225 }, { "epoch": 2.954585930543188, "ewc_loss": 0.07593472301959991, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039313628803938627, "grad_norm": 9.082948684692383, "learning_rate": 1e-06, "loss": 0.4547, "mean_token_accuracy": 0.8682416081428528, "num_tokens": 886081797.0, "step": 23226 }, { "epoch": 2.954713140821778, "ewc_loss": 0.07549545168876648, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000388743617804721, "grad_norm": 8.943622589111328, "learning_rate": 1e-06, "loss": 0.5068, "mean_token_accuracy": 0.8526355028152466, "num_tokens": 886124111.0, "step": 23227 }, { "epoch": 2.954840351100369, "ewc_loss": 0.0760565847158432, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003943549527321011, "grad_norm": 9.020757675170898, "learning_rate": 1e-06, "loss": 0.4452, "mean_token_accuracy": 0.8776794075965881, "num_tokens": 886162823.0, "step": 23228 }, { "epoch": 2.954967561378959, "ewc_loss": 0.0755128413438797, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038891745498403907, "grad_norm": 8.973523139953613, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8685631155967712, "num_tokens": 886204706.0, "step": 23229 }, { "epoch": 2.95509477165755, "ewc_loss": 0.07587984949350357, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000392587564419955, "grad_norm": 8.99599838256836, "learning_rate": 1e-06, "loss": 0.4404, "mean_token_accuracy": 0.8749091625213623, "num_tokens": 886241461.0, "step": 23230 }, { "epoch": 2.9552219819361403, "ewc_loss": 0.07574566453695297, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039124570321291685, "grad_norm": 8.971778869628906, "learning_rate": 1e-06, "loss": 0.4797, "mean_token_accuracy": 0.8626152276992798, "num_tokens": 886279276.0, "step": 23231 }, { "epoch": 2.955349192214731, "ewc_loss": 0.07591473311185837, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003929364029318094, "grad_norm": 9.004386901855469, "learning_rate": 1e-06, "loss": 0.4561, "mean_token_accuracy": 0.8676004409790039, "num_tokens": 886317703.0, "step": 23232 }, { "epoch": 2.9554764024933213, "ewc_loss": 0.07586008310317993, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039238992030732334, "grad_norm": 9.020447731018066, "learning_rate": 1e-06, "loss": 0.4448, "mean_token_accuracy": 0.8728972673416138, "num_tokens": 886355235.0, "step": 23233 }, { "epoch": 2.955603612771912, "ewc_loss": 0.0757514014840126, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003913030668627471, "grad_norm": 8.94581413269043, "learning_rate": 1e-06, "loss": 0.3862, "mean_token_accuracy": 0.8912054300308228, "num_tokens": 886387354.0, "step": 23234 }, { "epoch": 2.9557308230505024, "ewc_loss": 0.07598644495010376, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039365352131426334, "grad_norm": 9.087125778198242, "learning_rate": 1e-06, "loss": 0.4695, "mean_token_accuracy": 0.8648450970649719, "num_tokens": 886419742.0, "step": 23235 }, { "epoch": 2.955858033329093, "ewc_loss": 0.0754706859588623, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003884958859998733, "grad_norm": 8.87913990020752, "learning_rate": 1e-06, "loss": 0.459, "mean_token_accuracy": 0.868240237236023, "num_tokens": 886455218.0, "step": 23236 }, { "epoch": 2.9559852436076834, "ewc_loss": 0.07628682255744934, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039665729855187237, "grad_norm": 9.064096450805664, "learning_rate": 1e-06, "loss": 0.4632, "mean_token_accuracy": 0.8675710558891296, "num_tokens": 886494454.0, "step": 23237 }, { "epoch": 2.956112453886274, "ewc_loss": 0.07551595568656921, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038894862518645823, "grad_norm": 8.932599067687988, "learning_rate": 1e-06, "loss": 0.514, "mean_token_accuracy": 0.8444498777389526, "num_tokens": 886532854.0, "step": 23238 }, { "epoch": 2.9562396641648645, "ewc_loss": 0.07614555954933167, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000395244627725333, "grad_norm": 9.117555618286133, "learning_rate": 1e-06, "loss": 0.4649, "mean_token_accuracy": 0.8630460500717163, "num_tokens": 886564288.0, "step": 23239 }, { "epoch": 2.956366874443455, "ewc_loss": 0.07550767064094543, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038886573747731745, "grad_norm": 9.072380065917969, "learning_rate": 1e-06, "loss": 0.4476, "mean_token_accuracy": 0.8691803216934204, "num_tokens": 886597155.0, "step": 23240 }, { "epoch": 2.9564940847220456, "ewc_loss": 0.07602834701538086, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039407258736900985, "grad_norm": 9.441852569580078, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.8699512481689453, "num_tokens": 886632015.0, "step": 23241 }, { "epoch": 2.956621295000636, "ewc_loss": 0.07507877051830292, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038457682239823043, "grad_norm": 9.20548152923584, "learning_rate": 1e-06, "loss": 0.4044, "mean_token_accuracy": 0.8826851844787598, "num_tokens": 886665736.0, "step": 23242 }, { "epoch": 2.9567485052792266, "ewc_loss": 0.07551681250333786, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003889571817126125, "grad_norm": 9.030670166015625, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8662657737731934, "num_tokens": 886702652.0, "step": 23243 }, { "epoch": 2.956875715557817, "ewc_loss": 0.07530060410499573, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038679514545947313, "grad_norm": 9.19602108001709, "learning_rate": 1e-06, "loss": 0.4635, "mean_token_accuracy": 0.8640259504318237, "num_tokens": 886743573.0, "step": 23244 }, { "epoch": 2.9570029258364077, "ewc_loss": 0.07506804168224335, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000384469487471506, "grad_norm": 9.107110977172852, "learning_rate": 1e-06, "loss": 0.4479, "mean_token_accuracy": 0.872071385383606, "num_tokens": 886777932.0, "step": 23245 }, { "epoch": 2.957130136114998, "ewc_loss": 0.07530969381332397, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003868859785143286, "grad_norm": 9.049688339233398, "learning_rate": 1e-06, "loss": 0.5056, "mean_token_accuracy": 0.8569661974906921, "num_tokens": 886813211.0, "step": 23246 }, { "epoch": 2.9572573463935887, "ewc_loss": 0.07513538748025894, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038514292100444436, "grad_norm": 8.978004455566406, "learning_rate": 1e-06, "loss": 0.4457, "mean_token_accuracy": 0.8713645339012146, "num_tokens": 886849416.0, "step": 23247 }, { "epoch": 2.9573845566721793, "ewc_loss": 0.07538872957229614, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003876763512380421, "grad_norm": 8.985901832580566, "learning_rate": 1e-06, "loss": 0.5136, "mean_token_accuracy": 0.8521708250045776, "num_tokens": 886887983.0, "step": 23248 }, { "epoch": 2.95751176695077, "ewc_loss": 0.0753476619720459, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038726572529412806, "grad_norm": 8.95284366607666, "learning_rate": 1e-06, "loss": 0.5153, "mean_token_accuracy": 0.8504453301429749, "num_tokens": 886930867.0, "step": 23249 }, { "epoch": 2.95763897722936, "ewc_loss": 0.07549664378166199, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003887555212713778, "grad_norm": 8.973299026489258, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8650541305541992, "num_tokens": 886971862.0, "step": 23250 }, { "epoch": 2.957766187507951, "ewc_loss": 0.07546598464250565, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003884489124175161, "grad_norm": 8.98024845123291, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8735961318016052, "num_tokens": 887005238.0, "step": 23251 }, { "epoch": 2.957893397786541, "ewc_loss": 0.07555723190307617, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038936137570999563, "grad_norm": 8.986655235290527, "learning_rate": 1e-06, "loss": 0.4739, "mean_token_accuracy": 0.8593302965164185, "num_tokens": 887035588.0, "step": 23252 }, { "epoch": 2.958020608065132, "ewc_loss": 0.07553074508905411, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038909653085283935, "grad_norm": 8.944868087768555, "learning_rate": 1e-06, "loss": 0.4605, "mean_token_accuracy": 0.8679987788200378, "num_tokens": 887079962.0, "step": 23253 }, { "epoch": 2.958147818343722, "ewc_loss": 0.07567915320396423, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039058056427165866, "grad_norm": 8.993277549743652, "learning_rate": 1e-06, "loss": 0.3969, "mean_token_accuracy": 0.8868046998977661, "num_tokens": 887114572.0, "step": 23254 }, { "epoch": 2.958275028622313, "ewc_loss": 0.07569987326860428, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003907877835445106, "grad_norm": 8.923287391662598, "learning_rate": 1e-06, "loss": 0.4491, "mean_token_accuracy": 0.8712852001190186, "num_tokens": 887149396.0, "step": 23255 }, { "epoch": 2.958402238900903, "ewc_loss": 0.07583732903003693, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003921622992493212, "grad_norm": 9.024248123168945, "learning_rate": 1e-06, "loss": 0.4732, "mean_token_accuracy": 0.8620143532752991, "num_tokens": 887191916.0, "step": 23256 }, { "epoch": 2.9585294491794936, "ewc_loss": 0.07560847699642181, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038987386506050825, "grad_norm": 8.958484649658203, "learning_rate": 1e-06, "loss": 0.4432, "mean_token_accuracy": 0.875884473323822, "num_tokens": 887228263.0, "step": 23257 }, { "epoch": 2.958656659458084, "ewc_loss": 0.07575127482414246, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039130181539803743, "grad_norm": 8.980998992919922, "learning_rate": 1e-06, "loss": 0.4193, "mean_token_accuracy": 0.8835352063179016, "num_tokens": 887258688.0, "step": 23258 }, { "epoch": 2.9587838697366746, "ewc_loss": 0.0757133960723877, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003909230581484735, "grad_norm": 8.989217758178711, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.8622229099273682, "num_tokens": 887301565.0, "step": 23259 }, { "epoch": 2.958911080015265, "ewc_loss": 0.07577759027481079, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039156494312919676, "grad_norm": 8.971081733703613, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.8639003038406372, "num_tokens": 887337511.0, "step": 23260 }, { "epoch": 2.9590382902938557, "ewc_loss": 0.07575549930334091, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039134404505603015, "grad_norm": 8.96654224395752, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8703505992889404, "num_tokens": 887370694.0, "step": 23261 }, { "epoch": 2.959165500572446, "ewc_loss": 0.075740285217762, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003911919193342328, "grad_norm": 8.952536582946777, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8644611835479736, "num_tokens": 887412131.0, "step": 23262 }, { "epoch": 2.9592927108510367, "ewc_loss": 0.0756978765130043, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003907678183168173, "grad_norm": 8.908342361450195, "learning_rate": 1e-06, "loss": 0.481, "mean_token_accuracy": 0.8623021841049194, "num_tokens": 887447179.0, "step": 23263 }, { "epoch": 2.9594199211296273, "ewc_loss": 0.07581696659326553, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003919587470591068, "grad_norm": 8.993621826171875, "learning_rate": 1e-06, "loss": 0.4815, "mean_token_accuracy": 0.8613626956939697, "num_tokens": 887486079.0, "step": 23264 }, { "epoch": 2.959547131408218, "ewc_loss": 0.07571600377559662, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039094913518056273, "grad_norm": 9.00319766998291, "learning_rate": 1e-06, "loss": 0.4382, "mean_token_accuracy": 0.8751004934310913, "num_tokens": 887519092.0, "step": 23265 }, { "epoch": 2.9596743416868083, "ewc_loss": 0.0758301168680191, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003920902090612799, "grad_norm": 9.012948036193848, "learning_rate": 1e-06, "loss": 0.466, "mean_token_accuracy": 0.8686121702194214, "num_tokens": 887552008.0, "step": 23266 }, { "epoch": 2.959801551965399, "ewc_loss": 0.07570015639066696, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039079063571989536, "grad_norm": 8.99959659576416, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.862614631652832, "num_tokens": 887585827.0, "step": 23267 }, { "epoch": 2.9599287622439894, "ewc_loss": 0.07583394646644592, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003921285388059914, "grad_norm": 8.95022201538086, "learning_rate": 1e-06, "loss": 0.4665, "mean_token_accuracy": 0.8653092980384827, "num_tokens": 887628041.0, "step": 23268 }, { "epoch": 2.96005597252258, "ewc_loss": 0.07585553824901581, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003923444892279804, "grad_norm": 8.936098098754883, "learning_rate": 1e-06, "loss": 0.4437, "mean_token_accuracy": 0.8712809085845947, "num_tokens": 887667210.0, "step": 23269 }, { "epoch": 2.9601831828011704, "ewc_loss": 0.07584431022405624, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039223217754624784, "grad_norm": 8.97485065460205, "learning_rate": 1e-06, "loss": 0.4381, "mean_token_accuracy": 0.8724862337112427, "num_tokens": 887705429.0, "step": 23270 }, { "epoch": 2.960310393079761, "ewc_loss": 0.07581749558448792, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003919640148524195, "grad_norm": 8.97316837310791, "learning_rate": 1e-06, "loss": 0.5296, "mean_token_accuracy": 0.848771333694458, "num_tokens": 887753188.0, "step": 23271 }, { "epoch": 2.9604376033583515, "ewc_loss": 0.07579806447029114, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039176965947262943, "grad_norm": 9.024173736572266, "learning_rate": 1e-06, "loss": 0.5103, "mean_token_accuracy": 0.8502328395843506, "num_tokens": 887795907.0, "step": 23272 }, { "epoch": 2.960564813636942, "ewc_loss": 0.0757647454738617, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039143647882156074, "grad_norm": 9.061065673828125, "learning_rate": 1e-06, "loss": 0.471, "mean_token_accuracy": 0.8668571710586548, "num_tokens": 887834859.0, "step": 23273 }, { "epoch": 2.9606920239155325, "ewc_loss": 0.07556664198637009, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038945546839386225, "grad_norm": 8.914077758789062, "learning_rate": 1e-06, "loss": 0.472, "mean_token_accuracy": 0.8627057075500488, "num_tokens": 887879041.0, "step": 23274 }, { "epoch": 2.9608192341941226, "ewc_loss": 0.07586418837308884, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003924309276044369, "grad_norm": 9.128985404968262, "learning_rate": 1e-06, "loss": 0.4749, "mean_token_accuracy": 0.8615295886993408, "num_tokens": 887913752.0, "step": 23275 }, { "epoch": 2.9609464444727136, "ewc_loss": 0.07531832158565521, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038697224226780236, "grad_norm": 8.91204833984375, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.871796727180481, "num_tokens": 887950946.0, "step": 23276 }, { "epoch": 2.9610736547513037, "ewc_loss": 0.07613255828619003, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003951146500185132, "grad_norm": 9.119293212890625, "learning_rate": 1e-06, "loss": 0.4705, "mean_token_accuracy": 0.8638893365859985, "num_tokens": 887993155.0, "step": 23277 }, { "epoch": 2.9612008650298947, "ewc_loss": 0.07528000324964523, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038658909033983946, "grad_norm": 8.9146146774292, "learning_rate": 1e-06, "loss": 0.4791, "mean_token_accuracy": 0.8662706613540649, "num_tokens": 888027831.0, "step": 23278 }, { "epoch": 2.9613280753084847, "ewc_loss": 0.07612568140029907, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039504587766714394, "grad_norm": 9.105379104614258, "learning_rate": 1e-06, "loss": 0.43, "mean_token_accuracy": 0.8766396045684814, "num_tokens": 888066941.0, "step": 23279 }, { "epoch": 2.9614552855870753, "ewc_loss": 0.0752459317445755, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000386248342692852, "grad_norm": 8.93378734588623, "learning_rate": 1e-06, "loss": 0.4146, "mean_token_accuracy": 0.8802789449691772, "num_tokens": 888099739.0, "step": 23280 }, { "epoch": 2.961582495865666, "ewc_loss": 0.07608562707901001, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039464537985622883, "grad_norm": 9.134734153747559, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8673899173736572, "num_tokens": 888139675.0, "step": 23281 }, { "epoch": 2.9617097061442563, "ewc_loss": 0.0753612369298935, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003874013782478869, "grad_norm": 8.909069061279297, "learning_rate": 1e-06, "loss": 0.4887, "mean_token_accuracy": 0.8545538187026978, "num_tokens": 888179374.0, "step": 23282 }, { "epoch": 2.961836916422847, "ewc_loss": 0.0760156661272049, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039394572377204895, "grad_norm": 9.143403053283691, "learning_rate": 1e-06, "loss": 0.4997, "mean_token_accuracy": 0.8539185523986816, "num_tokens": 888214139.0, "step": 23283 }, { "epoch": 2.9619641267014374, "ewc_loss": 0.07532203942537308, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003870094660669565, "grad_norm": 9.003449440002441, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8651602268218994, "num_tokens": 888246365.0, "step": 23284 }, { "epoch": 2.962091336980028, "ewc_loss": 0.07602076977491379, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003939967427868396, "grad_norm": 9.13613510131836, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.8608804941177368, "num_tokens": 888290903.0, "step": 23285 }, { "epoch": 2.9622185472586184, "ewc_loss": 0.07534289360046387, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038721796590834856, "grad_norm": 8.935445785522461, "learning_rate": 1e-06, "loss": 0.4744, "mean_token_accuracy": 0.8653786778450012, "num_tokens": 888331329.0, "step": 23286 }, { "epoch": 2.962345757537209, "ewc_loss": 0.07590888440608978, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039287793333642185, "grad_norm": 9.188156127929688, "learning_rate": 1e-06, "loss": 0.4822, "mean_token_accuracy": 0.860166072845459, "num_tokens": 888372411.0, "step": 23287 }, { "epoch": 2.9624729678157995, "ewc_loss": 0.0752253532409668, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038604254950769246, "grad_norm": 8.874703407287598, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.8659633994102478, "num_tokens": 888409778.0, "step": 23288 }, { "epoch": 2.96260017809439, "ewc_loss": 0.07635387033224106, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039732776349410415, "grad_norm": 9.215354919433594, "learning_rate": 1e-06, "loss": 0.4455, "mean_token_accuracy": 0.8735980987548828, "num_tokens": 888453593.0, "step": 23289 }, { "epoch": 2.9627273883729806, "ewc_loss": 0.07520442456007004, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038583329296670854, "grad_norm": 8.939160346984863, "learning_rate": 1e-06, "loss": 0.4226, "mean_token_accuracy": 0.8784591555595398, "num_tokens": 888488245.0, "step": 23290 }, { "epoch": 2.962854598651571, "ewc_loss": 0.07628092169761658, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039659833419136703, "grad_norm": 9.153578758239746, "learning_rate": 1e-06, "loss": 0.4609, "mean_token_accuracy": 0.8660362362861633, "num_tokens": 888524306.0, "step": 23291 }, { "epoch": 2.9629818089301616, "ewc_loss": 0.07523594796657562, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038614848745055497, "grad_norm": 8.897215843200684, "learning_rate": 1e-06, "loss": 0.4283, "mean_token_accuracy": 0.8785027265548706, "num_tokens": 888563791.0, "step": 23292 }, { "epoch": 2.963109019208752, "ewc_loss": 0.07652296125888824, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003990186669398099, "grad_norm": 9.162618637084961, "learning_rate": 1e-06, "loss": 0.4519, "mean_token_accuracy": 0.871407151222229, "num_tokens": 888605021.0, "step": 23293 }, { "epoch": 2.9632362294873427, "ewc_loss": 0.07547607272863388, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038854978629387915, "grad_norm": 8.950521469116211, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8669897317886353, "num_tokens": 888647382.0, "step": 23294 }, { "epoch": 2.963363439765933, "ewc_loss": 0.07627622783184052, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003965513897128403, "grad_norm": 9.101945877075195, "learning_rate": 1e-06, "loss": 0.4735, "mean_token_accuracy": 0.8666356205940247, "num_tokens": 888682174.0, "step": 23295 }, { "epoch": 2.9634906500445237, "ewc_loss": 0.07553726434707642, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003891617525368929, "grad_norm": 8.897438049316406, "learning_rate": 1e-06, "loss": 0.444, "mean_token_accuracy": 0.8682723045349121, "num_tokens": 888724401.0, "step": 23296 }, { "epoch": 2.9636178603231143, "ewc_loss": 0.0763235092163086, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003970241523347795, "grad_norm": 9.057355880737305, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8735798597335815, "num_tokens": 888766230.0, "step": 23297 }, { "epoch": 2.963745070601705, "ewc_loss": 0.07569906115531921, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003907796635758132, "grad_norm": 8.954575538635254, "learning_rate": 1e-06, "loss": 0.5302, "mean_token_accuracy": 0.8448607921600342, "num_tokens": 888811865.0, "step": 23298 }, { "epoch": 2.9638722808802953, "ewc_loss": 0.07627572119235992, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039654632564634085, "grad_norm": 9.687454223632812, "learning_rate": 1e-06, "loss": 0.4651, "mean_token_accuracy": 0.8678816556930542, "num_tokens": 888849549.0, "step": 23299 }, { "epoch": 2.9639994911588854, "ewc_loss": 0.07491016387939453, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038289072108455, "grad_norm": 8.841873168945312, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8690877556800842, "num_tokens": 888881305.0, "step": 23300 }, { "epoch": 2.9641267014374764, "ewc_loss": 0.07715881615877151, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004053772136103362, "grad_norm": 9.294805526733398, "learning_rate": 1e-06, "loss": 0.4538, "mean_token_accuracy": 0.8716133832931519, "num_tokens": 888917862.0, "step": 23301 }, { "epoch": 2.9642539117160664, "ewc_loss": 0.07494275271892548, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038321659667417407, "grad_norm": 8.838639259338379, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8676875233650208, "num_tokens": 888953883.0, "step": 23302 }, { "epoch": 2.9643811219946574, "ewc_loss": 0.07704424858093262, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040423154132440686, "grad_norm": 9.275537490844727, "learning_rate": 1e-06, "loss": 0.4124, "mean_token_accuracy": 0.8847185373306274, "num_tokens": 888991052.0, "step": 23303 }, { "epoch": 2.9645083322732475, "ewc_loss": 0.07530677318572998, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003868568455800414, "grad_norm": 8.96575927734375, "learning_rate": 1e-06, "loss": 0.4295, "mean_token_accuracy": 0.8788810968399048, "num_tokens": 889022523.0, "step": 23304 }, { "epoch": 2.964635542551838, "ewc_loss": 0.07654298096895218, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003992188721895218, "grad_norm": 9.14123249053955, "learning_rate": 1e-06, "loss": 0.4876, "mean_token_accuracy": 0.858245849609375, "num_tokens": 889063470.0, "step": 23305 }, { "epoch": 2.9647627528304286, "ewc_loss": 0.07575594633817673, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003913485270459205, "grad_norm": 9.014059066772461, "learning_rate": 1e-06, "loss": 0.4352, "mean_token_accuracy": 0.8752492666244507, "num_tokens": 889101160.0, "step": 23306 }, { "epoch": 2.964889963109019, "ewc_loss": 0.07617459446191788, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003955349966417998, "grad_norm": 9.15057373046875, "learning_rate": 1e-06, "loss": 0.4715, "mean_token_accuracy": 0.8648313283920288, "num_tokens": 889139128.0, "step": 23307 }, { "epoch": 2.9650171733876096, "ewc_loss": 0.07578939944505692, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003916830464731902, "grad_norm": 9.599625587463379, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8430415987968445, "num_tokens": 889175574.0, "step": 23308 }, { "epoch": 2.9651443836662, "ewc_loss": 0.07512696087360382, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003850586945191026, "grad_norm": 8.869278907775879, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8644516468048096, "num_tokens": 889216520.0, "step": 23309 }, { "epoch": 2.9652715939447907, "ewc_loss": 0.07668954133987427, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000400684482883662, "grad_norm": 9.223918914794922, "learning_rate": 1e-06, "loss": 0.4312, "mean_token_accuracy": 0.8781911730766296, "num_tokens": 889256746.0, "step": 23310 }, { "epoch": 2.965398804223381, "ewc_loss": 0.07493963837623596, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038318545557558537, "grad_norm": 8.815023422241211, "learning_rate": 1e-06, "loss": 0.4816, "mean_token_accuracy": 0.8628147840499878, "num_tokens": 889298868.0, "step": 23311 }, { "epoch": 2.9655260145019717, "ewc_loss": 0.07693104445934296, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004030994896311313, "grad_norm": 9.36469554901123, "learning_rate": 1e-06, "loss": 0.4788, "mean_token_accuracy": 0.8617054224014282, "num_tokens": 889337758.0, "step": 23312 }, { "epoch": 2.9656532247805623, "ewc_loss": 0.07489803433418274, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003827694454230368, "grad_norm": 8.752294540405273, "learning_rate": 1e-06, "loss": 0.4529, "mean_token_accuracy": 0.8726038932800293, "num_tokens": 889375308.0, "step": 23313 }, { "epoch": 2.965780435059153, "ewc_loss": 0.07718366384506226, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040562573121860623, "grad_norm": 9.255228042602539, "learning_rate": 1e-06, "loss": 0.4357, "mean_token_accuracy": 0.8733278512954712, "num_tokens": 889413735.0, "step": 23314 }, { "epoch": 2.9659076453377433, "ewc_loss": 0.07530930638313293, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003868820786010474, "grad_norm": 8.899855613708496, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8682389259338379, "num_tokens": 889451585.0, "step": 23315 }, { "epoch": 2.966034855616334, "ewc_loss": 0.07694189995527267, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040320807602256536, "grad_norm": 9.241229057312012, "learning_rate": 1e-06, "loss": 0.5229, "mean_token_accuracy": 0.8513616323471069, "num_tokens": 889487405.0, "step": 23316 }, { "epoch": 2.9661620658949244, "ewc_loss": 0.07552839815616608, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038907304406166077, "grad_norm": 9.00903034210205, "learning_rate": 1e-06, "loss": 0.4918, "mean_token_accuracy": 0.8545940518379211, "num_tokens": 889523051.0, "step": 23317 }, { "epoch": 2.966289276173515, "ewc_loss": 0.07642602920532227, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003980493638664484, "grad_norm": 9.180737495422363, "learning_rate": 1e-06, "loss": 0.5098, "mean_token_accuracy": 0.8490318059921265, "num_tokens": 889559822.0, "step": 23318 }, { "epoch": 2.9664164864521054, "ewc_loss": 0.0756334513425827, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039012354682199657, "grad_norm": 9.036934852600098, "learning_rate": 1e-06, "loss": 0.4606, "mean_token_accuracy": 0.865288257598877, "num_tokens": 889591598.0, "step": 23319 }, { "epoch": 2.966543696730696, "ewc_loss": 0.07626684010028839, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000396457442548126, "grad_norm": 9.201261520385742, "learning_rate": 1e-06, "loss": 0.428, "mean_token_accuracy": 0.8774887323379517, "num_tokens": 889629787.0, "step": 23320 }, { "epoch": 2.9666709070092865, "ewc_loss": 0.07552866637706757, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038907575071789324, "grad_norm": 8.97961139678955, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8737525939941406, "num_tokens": 889668730.0, "step": 23321 }, { "epoch": 2.966798117287877, "ewc_loss": 0.07615499943494797, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003953390405513346, "grad_norm": 9.123393058776855, "learning_rate": 1e-06, "loss": 0.4707, "mean_token_accuracy": 0.8654310703277588, "num_tokens": 889705215.0, "step": 23322 }, { "epoch": 2.966925327566467, "ewc_loss": 0.07565020769834518, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039029112667776644, "grad_norm": 9.040467262268066, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8688015937805176, "num_tokens": 889738528.0, "step": 23323 }, { "epoch": 2.967052537845058, "ewc_loss": 0.07595743238925934, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039336338522844017, "grad_norm": 9.083917617797852, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.869462251663208, "num_tokens": 889771694.0, "step": 23324 }, { "epoch": 2.967179748123648, "ewc_loss": 0.07573899626731873, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003911790263373405, "grad_norm": 9.025734901428223, "learning_rate": 1e-06, "loss": 0.4291, "mean_token_accuracy": 0.8796592950820923, "num_tokens": 889806096.0, "step": 23325 }, { "epoch": 2.967306958402239, "ewc_loss": 0.07596631348133087, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003934522392228246, "grad_norm": 8.972745895385742, "learning_rate": 1e-06, "loss": 0.4747, "mean_token_accuracy": 0.8667863607406616, "num_tokens": 889848514.0, "step": 23326 }, { "epoch": 2.967434168680829, "ewc_loss": 0.0759308934211731, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003930980456061661, "grad_norm": 8.981950759887695, "learning_rate": 1e-06, "loss": 0.4046, "mean_token_accuracy": 0.8876538872718811, "num_tokens": 889889689.0, "step": 23327 }, { "epoch": 2.96756137895942, "ewc_loss": 0.0759945958852768, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039373504114337265, "grad_norm": 9.045401573181152, "learning_rate": 1e-06, "loss": 0.4542, "mean_token_accuracy": 0.87078857421875, "num_tokens": 889925550.0, "step": 23328 }, { "epoch": 2.9676885892380103, "ewc_loss": 0.07604096829891205, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039419872337020934, "grad_norm": 9.055493354797363, "learning_rate": 1e-06, "loss": 0.4999, "mean_token_accuracy": 0.8532094955444336, "num_tokens": 889961797.0, "step": 23329 }, { "epoch": 2.967815799516601, "ewc_loss": 0.07590040564537048, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039279309567064047, "grad_norm": 9.04924201965332, "learning_rate": 1e-06, "loss": 0.4477, "mean_token_accuracy": 0.8734567165374756, "num_tokens": 889997935.0, "step": 23330 }, { "epoch": 2.9679430097951913, "ewc_loss": 0.07588473707437515, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039263642975129187, "grad_norm": 9.051493644714355, "learning_rate": 1e-06, "loss": 0.4232, "mean_token_accuracy": 0.8788337707519531, "num_tokens": 890037900.0, "step": 23331 }, { "epoch": 2.968070220073782, "ewc_loss": 0.07578886300325394, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039167769136838615, "grad_norm": 8.988504409790039, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8741598129272461, "num_tokens": 890074814.0, "step": 23332 }, { "epoch": 2.9681974303523724, "ewc_loss": 0.07597781717777252, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003935672575607896, "grad_norm": 9.076680183410645, "learning_rate": 1e-06, "loss": 0.4302, "mean_token_accuracy": 0.8808099627494812, "num_tokens": 890107091.0, "step": 23333 }, { "epoch": 2.968324640630963, "ewc_loss": 0.07573790103197098, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003911680541932583, "grad_norm": 9.001334190368652, "learning_rate": 1e-06, "loss": 0.4421, "mean_token_accuracy": 0.875079333782196, "num_tokens": 890140558.0, "step": 23334 }, { "epoch": 2.9684518509095534, "ewc_loss": 0.07593465596437454, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039313561865128577, "grad_norm": 9.117812156677246, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8667722940444946, "num_tokens": 890174563.0, "step": 23335 }, { "epoch": 2.968579061188144, "ewc_loss": 0.07569904625415802, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003907795762643218, "grad_norm": 9.000893592834473, "learning_rate": 1e-06, "loss": 0.4634, "mean_token_accuracy": 0.8645769357681274, "num_tokens": 890210584.0, "step": 23336 }, { "epoch": 2.9687062714667345, "ewc_loss": 0.07596513628959656, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003934404521714896, "grad_norm": 9.04144287109375, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8806731104850769, "num_tokens": 890251955.0, "step": 23337 }, { "epoch": 2.968833481745325, "ewc_loss": 0.07563061267137527, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003900951996911317, "grad_norm": 9.022939682006836, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.8620480298995972, "num_tokens": 890289095.0, "step": 23338 }, { "epoch": 2.9689606920239155, "ewc_loss": 0.0759468823671341, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039325791294686496, "grad_norm": 9.027608871459961, "learning_rate": 1e-06, "loss": 0.4625, "mean_token_accuracy": 0.8694500923156738, "num_tokens": 890327864.0, "step": 23339 }, { "epoch": 2.969087902302506, "ewc_loss": 0.07580278813838959, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003918168949894607, "grad_norm": 9.042718887329102, "learning_rate": 1e-06, "loss": 0.4613, "mean_token_accuracy": 0.8673297762870789, "num_tokens": 890364244.0, "step": 23340 }, { "epoch": 2.9692151125810966, "ewc_loss": 0.07575127482414246, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003913018445018679, "grad_norm": 9.09925365447998, "learning_rate": 1e-06, "loss": 0.5157, "mean_token_accuracy": 0.85296630859375, "num_tokens": 890397652.0, "step": 23341 }, { "epoch": 2.969342322859687, "ewc_loss": 0.07584063708782196, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000392195419408381, "grad_norm": 9.057713508605957, "learning_rate": 1e-06, "loss": 0.4464, "mean_token_accuracy": 0.872506320476532, "num_tokens": 890435598.0, "step": 23342 }, { "epoch": 2.9694695331382777, "ewc_loss": 0.07584114372730255, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039220054168254137, "grad_norm": 9.07978630065918, "learning_rate": 1e-06, "loss": 0.4616, "mean_token_accuracy": 0.8708042502403259, "num_tokens": 890473445.0, "step": 23343 }, { "epoch": 2.969596743416868, "ewc_loss": 0.07578177750110626, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000391606823541224, "grad_norm": 9.101768493652344, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8713006973266602, "num_tokens": 890514263.0, "step": 23344 }, { "epoch": 2.9697239536954587, "ewc_loss": 0.0754813477396965, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038860252243466675, "grad_norm": 9.040291786193848, "learning_rate": 1e-06, "loss": 0.4348, "mean_token_accuracy": 0.8721002340316772, "num_tokens": 890554123.0, "step": 23345 }, { "epoch": 2.9698511639740492, "ewc_loss": 0.0758344754576683, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039213383570313454, "grad_norm": 9.102771759033203, "learning_rate": 1e-06, "loss": 0.4722, "mean_token_accuracy": 0.866814374923706, "num_tokens": 890593938.0, "step": 23346 }, { "epoch": 2.9699783742526398, "ewc_loss": 0.07579556107521057, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.00038930331356823444, "grad_norm": 9.084818840026855, "learning_rate": 1e-06, "loss": 0.4927, "mean_token_accuracy": 0.8575790524482727, "num_tokens": 890632260.0, "step": 23347 }, { "epoch": 2.97010558453123, "ewc_loss": 0.0756416767835617, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039020582335069776, "grad_norm": 9.073341369628906, "learning_rate": 1e-06, "loss": 0.4648, "mean_token_accuracy": 0.8642638921737671, "num_tokens": 890666852.0, "step": 23348 }, { "epoch": 2.970232794809821, "ewc_loss": 0.07562024891376495, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003899915318470448, "grad_norm": 8.974813461303711, "learning_rate": 1e-06, "loss": 0.4701, "mean_token_accuracy": 0.8632396459579468, "num_tokens": 890707959.0, "step": 23349 }, { "epoch": 2.970360005088411, "ewc_loss": 0.0758274644613266, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039206372457556427, "grad_norm": 9.106327056884766, "learning_rate": 1e-06, "loss": 0.5432, "mean_token_accuracy": 0.8458478450775146, "num_tokens": 890748729.0, "step": 23350 }, { "epoch": 2.970487215367002, "ewc_loss": 0.07560831308364868, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003898721479345113, "grad_norm": 9.029268264770508, "learning_rate": 1e-06, "loss": 0.42, "mean_token_accuracy": 0.8810431957244873, "num_tokens": 890782116.0, "step": 23351 }, { "epoch": 2.970614425645592, "ewc_loss": 0.07592246681451797, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039301373180933297, "grad_norm": 9.105396270751953, "learning_rate": 1e-06, "loss": 0.5022, "mean_token_accuracy": 0.8551865816116333, "num_tokens": 890827309.0, "step": 23352 }, { "epoch": 2.970741635924183, "ewc_loss": 0.07553455233573914, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038913454045541584, "grad_norm": 9.037176132202148, "learning_rate": 1e-06, "loss": 0.5064, "mean_token_accuracy": 0.8573127388954163, "num_tokens": 890863879.0, "step": 23353 }, { "epoch": 2.970868846202773, "ewc_loss": 0.07600398361682892, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003938289301004261, "grad_norm": 9.070197105407715, "learning_rate": 1e-06, "loss": 0.452, "mean_token_accuracy": 0.8698644638061523, "num_tokens": 890901184.0, "step": 23354 }, { "epoch": 2.9709960564813636, "ewc_loss": 0.07561356574296951, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038992470945231616, "grad_norm": 9.058215141296387, "learning_rate": 1e-06, "loss": 0.4293, "mean_token_accuracy": 0.8765208721160889, "num_tokens": 890937779.0, "step": 23355 }, { "epoch": 2.971123266759954, "ewc_loss": 0.07576771080493927, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003914661647286266, "grad_norm": 8.9937744140625, "learning_rate": 1e-06, "loss": 0.4157, "mean_token_accuracy": 0.8806418180465698, "num_tokens": 890978055.0, "step": 23356 }, { "epoch": 2.9712504770385446, "ewc_loss": 0.07605955004692078, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003943845513276756, "grad_norm": 9.094442367553711, "learning_rate": 1e-06, "loss": 0.5948, "mean_token_accuracy": 0.8352307081222534, "num_tokens": 891010985.0, "step": 23357 }, { "epoch": 2.971377687317135, "ewc_loss": 0.07560956478118896, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038988474989309907, "grad_norm": 9.061783790588379, "learning_rate": 1e-06, "loss": 0.4474, "mean_token_accuracy": 0.872071385383606, "num_tokens": 891045904.0, "step": 23358 }, { "epoch": 2.9715048975957257, "ewc_loss": 0.07592813670635223, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039307045517489314, "grad_norm": 9.004199028015137, "learning_rate": 1e-06, "loss": 0.4344, "mean_token_accuracy": 0.8719736337661743, "num_tokens": 891086240.0, "step": 23359 }, { "epoch": 2.971632107874316, "ewc_loss": 0.0759531632065773, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039332068990916014, "grad_norm": 9.048426628112793, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8635433912277222, "num_tokens": 891126482.0, "step": 23360 }, { "epoch": 2.9717593181529067, "ewc_loss": 0.0757141262292862, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039093030500225723, "grad_norm": 9.016844749450684, "learning_rate": 1e-06, "loss": 0.4925, "mean_token_accuracy": 0.8558975458145142, "num_tokens": 891165982.0, "step": 23361 }, { "epoch": 2.9718865284314973, "ewc_loss": 0.07588277757167816, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003926167846657336, "grad_norm": 9.031086921691895, "learning_rate": 1e-06, "loss": 0.5039, "mean_token_accuracy": 0.852223813533783, "num_tokens": 891199101.0, "step": 23362 }, { "epoch": 2.972013738710088, "ewc_loss": 0.07580144703388214, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003918035072274506, "grad_norm": 9.03708553314209, "learning_rate": 1e-06, "loss": 0.4731, "mean_token_accuracy": 0.8665287494659424, "num_tokens": 891231511.0, "step": 23363 }, { "epoch": 2.9721409489886783, "ewc_loss": 0.07589706033468246, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003927596553694457, "grad_norm": 9.093738555908203, "learning_rate": 1e-06, "loss": 0.5045, "mean_token_accuracy": 0.8551811575889587, "num_tokens": 891262369.0, "step": 23364 }, { "epoch": 2.972268159267269, "ewc_loss": 0.0757661834359169, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003914509143214673, "grad_norm": 8.996405601501465, "learning_rate": 1e-06, "loss": 0.5015, "mean_token_accuracy": 0.8518842458724976, "num_tokens": 891304206.0, "step": 23365 }, { "epoch": 2.9723953695458594, "ewc_loss": 0.0760006308555603, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003937954024877399, "grad_norm": 9.0430908203125, "learning_rate": 1e-06, "loss": 0.5029, "mean_token_accuracy": 0.853904128074646, "num_tokens": 891348006.0, "step": 23366 }, { "epoch": 2.97252257982445, "ewc_loss": 0.07574285566806793, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003912176180165261, "grad_norm": 8.973974227905273, "learning_rate": 1e-06, "loss": 0.4345, "mean_token_accuracy": 0.873921275138855, "num_tokens": 891386851.0, "step": 23367 }, { "epoch": 2.9726497901030404, "ewc_loss": 0.07589365541934967, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039272557478398085, "grad_norm": 8.998614311218262, "learning_rate": 1e-06, "loss": 0.4338, "mean_token_accuracy": 0.8761047124862671, "num_tokens": 891423582.0, "step": 23368 }, { "epoch": 2.972777000381631, "ewc_loss": 0.07569985091686249, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039078755071386695, "grad_norm": 8.918118476867676, "learning_rate": 1e-06, "loss": 0.4037, "mean_token_accuracy": 0.8820148706436157, "num_tokens": 891463773.0, "step": 23369 }, { "epoch": 2.9729042106602215, "ewc_loss": 0.0761224776506424, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003950138052459806, "grad_norm": 9.114931106567383, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8767122626304626, "num_tokens": 891498117.0, "step": 23370 }, { "epoch": 2.973031420938812, "ewc_loss": 0.07566103339195251, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003903993929270655, "grad_norm": 8.989912033081055, "learning_rate": 1e-06, "loss": 0.4337, "mean_token_accuracy": 0.8761717081069946, "num_tokens": 891529055.0, "step": 23371 }, { "epoch": 2.9731586312174025, "ewc_loss": 0.07609818130731583, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039477087557315826, "grad_norm": 9.06063175201416, "learning_rate": 1e-06, "loss": 0.433, "mean_token_accuracy": 0.8765647411346436, "num_tokens": 891567370.0, "step": 23372 }, { "epoch": 2.9732858414959926, "ewc_loss": 0.07577762752771378, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003915653214789927, "grad_norm": 8.99360179901123, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.870633602142334, "num_tokens": 891602737.0, "step": 23373 }, { "epoch": 2.9734130517745836, "ewc_loss": 0.07612372934818268, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003950264072045684, "grad_norm": 9.117573738098145, "learning_rate": 1e-06, "loss": 0.4994, "mean_token_accuracy": 0.8604179620742798, "num_tokens": 891639713.0, "step": 23374 }, { "epoch": 2.9735402620531737, "ewc_loss": 0.07568474113941193, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003906364436261356, "grad_norm": 8.966750144958496, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8791792392730713, "num_tokens": 891673931.0, "step": 23375 }, { "epoch": 2.9736674723317646, "ewc_loss": 0.07623244822025299, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003961135516874492, "grad_norm": 9.159777641296387, "learning_rate": 1e-06, "loss": 0.4733, "mean_token_accuracy": 0.865275502204895, "num_tokens": 891708015.0, "step": 23376 }, { "epoch": 2.9737946826103547, "ewc_loss": 0.07553078234195709, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038909688009880483, "grad_norm": 8.945466995239258, "learning_rate": 1e-06, "loss": 0.4752, "mean_token_accuracy": 0.8637816905975342, "num_tokens": 891746023.0, "step": 23377 }, { "epoch": 2.9739218928889453, "ewc_loss": 0.07629582285881042, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039674725849181414, "grad_norm": 9.104342460632324, "learning_rate": 1e-06, "loss": 0.4819, "mean_token_accuracy": 0.8657793998718262, "num_tokens": 891777767.0, "step": 23378 }, { "epoch": 2.974049103167536, "ewc_loss": 0.07566720247268677, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039046103483997285, "grad_norm": 8.938807487487793, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8690226078033447, "num_tokens": 891810816.0, "step": 23379 }, { "epoch": 2.9741763134461263, "ewc_loss": 0.07629546523094177, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039674367872066796, "grad_norm": 9.08486557006836, "learning_rate": 1e-06, "loss": 0.4781, "mean_token_accuracy": 0.8598781824111938, "num_tokens": 891848810.0, "step": 23380 }, { "epoch": 2.974303523724717, "ewc_loss": 0.07575264573097229, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039131546509452164, "grad_norm": 8.904766082763672, "learning_rate": 1e-06, "loss": 0.4602, "mean_token_accuracy": 0.8707595467567444, "num_tokens": 891892553.0, "step": 23381 }, { "epoch": 2.9744307340033074, "ewc_loss": 0.07628601789474487, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039664929499849677, "grad_norm": 9.155293464660645, "learning_rate": 1e-06, "loss": 0.4261, "mean_token_accuracy": 0.8786535859107971, "num_tokens": 891927539.0, "step": 23382 }, { "epoch": 2.974557944281898, "ewc_loss": 0.07563126087188721, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039010168984532356, "grad_norm": 8.9694185256958, "learning_rate": 1e-06, "loss": 0.4384, "mean_token_accuracy": 0.8709853291511536, "num_tokens": 891959577.0, "step": 23383 }, { "epoch": 2.9746851545604884, "ewc_loss": 0.07643696665763855, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003981587360613048, "grad_norm": 9.16702938079834, "learning_rate": 1e-06, "loss": 0.5513, "mean_token_accuracy": 0.8393498659133911, "num_tokens": 892000799.0, "step": 23384 }, { "epoch": 2.974812364839079, "ewc_loss": 0.0756370946764946, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039016001392155886, "grad_norm": 8.896855354309082, "learning_rate": 1e-06, "loss": 0.4843, "mean_token_accuracy": 0.860099196434021, "num_tokens": 892042200.0, "step": 23385 }, { "epoch": 2.9749395751176695, "ewc_loss": 0.07648473978042603, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003986364172305912, "grad_norm": 9.114666938781738, "learning_rate": 1e-06, "loss": 0.4972, "mean_token_accuracy": 0.856310248374939, "num_tokens": 892084838.0, "step": 23386 }, { "epoch": 2.97506678539626, "ewc_loss": 0.07582629472017288, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039205202483572066, "grad_norm": 9.009861946105957, "learning_rate": 1e-06, "loss": 0.5111, "mean_token_accuracy": 0.8502168655395508, "num_tokens": 892126442.0, "step": 23387 }, { "epoch": 2.9751939956748505, "ewc_loss": 0.07627636939287186, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003965527575928718, "grad_norm": 9.083806991577148, "learning_rate": 1e-06, "loss": 0.4855, "mean_token_accuracy": 0.8542121648788452, "num_tokens": 892165019.0, "step": 23388 }, { "epoch": 2.975321205953441, "ewc_loss": 0.0758502259850502, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003922913165297359, "grad_norm": 8.99077320098877, "learning_rate": 1e-06, "loss": 0.4614, "mean_token_accuracy": 0.8700301647186279, "num_tokens": 892202291.0, "step": 23389 }, { "epoch": 2.9754484162320316, "ewc_loss": 0.07612530887126923, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000395042123273015, "grad_norm": 9.104097366333008, "learning_rate": 1e-06, "loss": 0.499, "mean_token_accuracy": 0.857718825340271, "num_tokens": 892235159.0, "step": 23390 }, { "epoch": 2.975575626510622, "ewc_loss": 0.07582421600818634, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039203118649311364, "grad_norm": 9.048171043395996, "learning_rate": 1e-06, "loss": 0.5063, "mean_token_accuracy": 0.8545552492141724, "num_tokens": 892271967.0, "step": 23391 }, { "epoch": 2.9757028367892127, "ewc_loss": 0.07596561312675476, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039344519609585404, "grad_norm": 9.058395385742188, "learning_rate": 1e-06, "loss": 0.5144, "mean_token_accuracy": 0.8521506190299988, "num_tokens": 892306230.0, "step": 23392 }, { "epoch": 2.975830047067803, "ewc_loss": 0.07583862543106079, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039217533776536584, "grad_norm": 9.04677677154541, "learning_rate": 1e-06, "loss": 0.4505, "mean_token_accuracy": 0.86785888671875, "num_tokens": 892342118.0, "step": 23393 }, { "epoch": 2.9759572573463937, "ewc_loss": 0.07585867494344711, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003923758049495518, "grad_norm": 9.037256240844727, "learning_rate": 1e-06, "loss": 0.4485, "mean_token_accuracy": 0.8721416592597961, "num_tokens": 892380764.0, "step": 23394 }, { "epoch": 2.9760844676249842, "ewc_loss": 0.07590007781982422, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039278989424929023, "grad_norm": 9.05746078491211, "learning_rate": 1e-06, "loss": 0.4779, "mean_token_accuracy": 0.8630931973457336, "num_tokens": 892421113.0, "step": 23395 }, { "epoch": 2.9762116779035748, "ewc_loss": 0.07575982809066772, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003913873224519193, "grad_norm": 9.041479110717773, "learning_rate": 1e-06, "loss": 0.4431, "mean_token_accuracy": 0.8746395111083984, "num_tokens": 892457512.0, "step": 23396 }, { "epoch": 2.9763388881821653, "ewc_loss": 0.07578136026859283, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039160269079729915, "grad_norm": 8.998554229736328, "learning_rate": 1e-06, "loss": 0.4407, "mean_token_accuracy": 0.8730174899101257, "num_tokens": 892493599.0, "step": 23397 }, { "epoch": 2.9764660984607554, "ewc_loss": 0.07592564821243286, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003930455422960222, "grad_norm": 9.116915702819824, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8559644222259521, "num_tokens": 892528474.0, "step": 23398 }, { "epoch": 2.9765933087393464, "ewc_loss": 0.07554623484611511, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00038925145054236054, "grad_norm": 8.944260597229004, "learning_rate": 1e-06, "loss": 0.4361, "mean_token_accuracy": 0.8738076686859131, "num_tokens": 892569591.0, "step": 23399 }, { "epoch": 2.9767205190179364, "ewc_loss": 0.07610473036766052, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039483641739934683, "grad_norm": 9.087552070617676, "learning_rate": 1e-06, "loss": 0.4427, "mean_token_accuracy": 0.8712795972824097, "num_tokens": 892607185.0, "step": 23400 }, { "epoch": 2.9768477292965274, "ewc_loss": 0.07552720606327057, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003890611114911735, "grad_norm": 9.019330978393555, "learning_rate": 1e-06, "loss": 0.464, "mean_token_accuracy": 0.8688209056854248, "num_tokens": 892641453.0, "step": 23401 }, { "epoch": 2.9769749395751175, "ewc_loss": 0.07597620785236359, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039355113403871655, "grad_norm": 9.063003540039062, "learning_rate": 1e-06, "loss": 0.4838, "mean_token_accuracy": 0.8608736991882324, "num_tokens": 892676802.0, "step": 23402 }, { "epoch": 2.977102149853708, "ewc_loss": 0.07585137337446213, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039230281254276633, "grad_norm": 9.015600204467773, "learning_rate": 1e-06, "loss": 0.4376, "mean_token_accuracy": 0.8783873915672302, "num_tokens": 892711694.0, "step": 23403 }, { "epoch": 2.9772293601322986, "ewc_loss": 0.07587367296218872, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003925257478840649, "grad_norm": 9.052201271057129, "learning_rate": 1e-06, "loss": 0.4863, "mean_token_accuracy": 0.8581616878509521, "num_tokens": 892752662.0, "step": 23404 }, { "epoch": 2.977356570410889, "ewc_loss": 0.0757993757724762, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003917827853001654, "grad_norm": 9.029606819152832, "learning_rate": 1e-06, "loss": 0.4638, "mean_token_accuracy": 0.8669735193252563, "num_tokens": 892790170.0, "step": 23405 }, { "epoch": 2.9774837806894796, "ewc_loss": 0.07583415508270264, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039213066338561475, "grad_norm": 9.0053071975708, "learning_rate": 1e-06, "loss": 0.5313, "mean_token_accuracy": 0.8496078252792358, "num_tokens": 892831437.0, "step": 23406 }, { "epoch": 2.97761099096807, "ewc_loss": 0.07589983195066452, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039278739131987095, "grad_norm": 9.021807670593262, "learning_rate": 1e-06, "loss": 0.396, "mean_token_accuracy": 0.8877902030944824, "num_tokens": 892869022.0, "step": 23407 }, { "epoch": 2.9777382012466607, "ewc_loss": 0.0758526623249054, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039231570553965867, "grad_norm": 9.043911933898926, "learning_rate": 1e-06, "loss": 0.4659, "mean_token_accuracy": 0.8673359751701355, "num_tokens": 892910738.0, "step": 23408 }, { "epoch": 2.977865411525251, "ewc_loss": 0.07593849301338196, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003931740066036582, "grad_norm": 9.048968315124512, "learning_rate": 1e-06, "loss": 0.4459, "mean_token_accuracy": 0.8745220899581909, "num_tokens": 892952055.0, "step": 23409 }, { "epoch": 2.9779926218038417, "ewc_loss": 0.07579295337200165, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003917186404578388, "grad_norm": 9.045004844665527, "learning_rate": 1e-06, "loss": 0.4935, "mean_token_accuracy": 0.858833909034729, "num_tokens": 892985843.0, "step": 23410 }, { "epoch": 2.9781198320824323, "ewc_loss": 0.07576625049114227, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039145155460573733, "grad_norm": 8.950860977172852, "learning_rate": 1e-06, "loss": 0.4801, "mean_token_accuracy": 0.8568780422210693, "num_tokens": 893031197.0, "step": 23411 }, { "epoch": 2.978247042361023, "ewc_loss": 0.07620415091514587, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003958305751439184, "grad_norm": 9.121566772460938, "learning_rate": 1e-06, "loss": 0.4134, "mean_token_accuracy": 0.8813018202781677, "num_tokens": 893068113.0, "step": 23412 }, { "epoch": 2.9783742526396133, "ewc_loss": 0.07562750577926636, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039006417500786483, "grad_norm": 8.96665096282959, "learning_rate": 1e-06, "loss": 0.4275, "mean_token_accuracy": 0.8786158561706543, "num_tokens": 893109148.0, "step": 23413 }, { "epoch": 2.978501462918204, "ewc_loss": 0.07620155811309814, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003958046727348119, "grad_norm": 9.080175399780273, "learning_rate": 1e-06, "loss": 0.4458, "mean_token_accuracy": 0.871964693069458, "num_tokens": 893151406.0, "step": 23414 }, { "epoch": 2.9786286731967944, "ewc_loss": 0.07574883103370667, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003912773681804538, "grad_norm": 9.024789810180664, "learning_rate": 1e-06, "loss": 0.446, "mean_token_accuracy": 0.8710340857505798, "num_tokens": 893190209.0, "step": 23415 }, { "epoch": 2.978755883475385, "ewc_loss": 0.07598085701465607, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003935975837521255, "grad_norm": 9.037650108337402, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8667245507240295, "num_tokens": 893232949.0, "step": 23416 }, { "epoch": 2.9788830937539754, "ewc_loss": 0.07597313821315765, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003935204877052456, "grad_norm": 9.046621322631836, "learning_rate": 1e-06, "loss": 0.463, "mean_token_accuracy": 0.8676273822784424, "num_tokens": 893273931.0, "step": 23417 }, { "epoch": 2.979010304032566, "ewc_loss": 0.07596740126609802, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039346309495158494, "grad_norm": 9.08172607421875, "learning_rate": 1e-06, "loss": 0.525, "mean_token_accuracy": 0.8464735746383667, "num_tokens": 893317465.0, "step": 23418 }, { "epoch": 2.9791375143111565, "ewc_loss": 0.07589222490787506, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003927112848032266, "grad_norm": 9.073212623596191, "learning_rate": 1e-06, "loss": 0.5196, "mean_token_accuracy": 0.8503669500350952, "num_tokens": 893354089.0, "step": 23419 }, { "epoch": 2.979264724589747, "ewc_loss": 0.07591921091079712, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003929811646230519, "grad_norm": 9.044410705566406, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8634147644042969, "num_tokens": 893388051.0, "step": 23420 }, { "epoch": 2.979391934868337, "ewc_loss": 0.07608717679977417, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003946608630940318, "grad_norm": 9.106279373168945, "learning_rate": 1e-06, "loss": 0.4862, "mean_token_accuracy": 0.861430823802948, "num_tokens": 893426191.0, "step": 23421 }, { "epoch": 2.979519145146928, "ewc_loss": 0.07587626576423645, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003925517376046628, "grad_norm": 9.029062271118164, "learning_rate": 1e-06, "loss": 0.4854, "mean_token_accuracy": 0.8592987060546875, "num_tokens": 893462134.0, "step": 23422 }, { "epoch": 2.979646355425518, "ewc_loss": 0.07619566470384598, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039574570837430656, "grad_norm": 9.104348182678223, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.8610750436782837, "num_tokens": 893503159.0, "step": 23423 }, { "epoch": 2.979773565704109, "ewc_loss": 0.07570810616016388, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039087008917704225, "grad_norm": 9.043478965759277, "learning_rate": 1e-06, "loss": 0.5342, "mean_token_accuracy": 0.848182737827301, "num_tokens": 893538379.0, "step": 23424 }, { "epoch": 2.979900775982699, "ewc_loss": 0.07609303295612335, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039471936179324985, "grad_norm": 9.058950424194336, "learning_rate": 1e-06, "loss": 0.4983, "mean_token_accuracy": 0.8545523881912231, "num_tokens": 893580431.0, "step": 23425 }, { "epoch": 2.98002798626129, "ewc_loss": 0.07594795525074005, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003932686522603035, "grad_norm": 9.071564674377441, "learning_rate": 1e-06, "loss": 0.4985, "mean_token_accuracy": 0.8604980111122131, "num_tokens": 893622412.0, "step": 23426 }, { "epoch": 2.9801551965398803, "ewc_loss": 0.07596126198768616, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003934016858693212, "grad_norm": 9.064709663391113, "learning_rate": 1e-06, "loss": 0.4676, "mean_token_accuracy": 0.8695926070213318, "num_tokens": 893658575.0, "step": 23427 }, { "epoch": 2.980282406818471, "ewc_loss": 0.07598622143268585, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003936512803193182, "grad_norm": 9.034841537475586, "learning_rate": 1e-06, "loss": 0.4401, "mean_token_accuracy": 0.8742696046829224, "num_tokens": 893702812.0, "step": 23428 }, { "epoch": 2.9804096170970613, "ewc_loss": 0.07600115239620209, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039380055386573076, "grad_norm": 9.112608909606934, "learning_rate": 1e-06, "loss": 0.4881, "mean_token_accuracy": 0.8591611385345459, "num_tokens": 893742785.0, "step": 23429 }, { "epoch": 2.980536827375652, "ewc_loss": 0.07578572630882263, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039164628833532333, "grad_norm": 8.974309921264648, "learning_rate": 1e-06, "loss": 0.4467, "mean_token_accuracy": 0.8730459213256836, "num_tokens": 893782310.0, "step": 23430 }, { "epoch": 2.9806640376542424, "ewc_loss": 0.07626207917928696, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003964098577853292, "grad_norm": 9.0579252243042, "learning_rate": 1e-06, "loss": 0.4048, "mean_token_accuracy": 0.883033275604248, "num_tokens": 893822794.0, "step": 23431 }, { "epoch": 2.980791247932833, "ewc_loss": 0.07579120993614197, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003917011490557343, "grad_norm": 8.935194969177246, "learning_rate": 1e-06, "loss": 0.4581, "mean_token_accuracy": 0.8725543022155762, "num_tokens": 893870462.0, "step": 23432 }, { "epoch": 2.9809184582114234, "ewc_loss": 0.07629282772541046, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039671736885793507, "grad_norm": 9.121552467346191, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8657578825950623, "num_tokens": 893910654.0, "step": 23433 }, { "epoch": 2.981045668490014, "ewc_loss": 0.07589247077703476, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003927137586288154, "grad_norm": 8.984691619873047, "learning_rate": 1e-06, "loss": 0.4643, "mean_token_accuracy": 0.8631418943405151, "num_tokens": 893948819.0, "step": 23434 }, { "epoch": 2.9811728787686045, "ewc_loss": 0.07634630799293518, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039725209353491664, "grad_norm": 9.03547191619873, "learning_rate": 1e-06, "loss": 0.4415, "mean_token_accuracy": 0.8726512789726257, "num_tokens": 893983325.0, "step": 23435 }, { "epoch": 2.981300089047195, "ewc_loss": 0.07629257440567017, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039671483682468534, "grad_norm": 9.04153823852539, "learning_rate": 1e-06, "loss": 0.467, "mean_token_accuracy": 0.865295946598053, "num_tokens": 894023712.0, "step": 23436 }, { "epoch": 2.9814272993257855, "ewc_loss": 0.07609587162733078, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003947477671317756, "grad_norm": 9.015690803527832, "learning_rate": 1e-06, "loss": 0.4301, "mean_token_accuracy": 0.8761563897132874, "num_tokens": 894059719.0, "step": 23437 }, { "epoch": 2.981554509604376, "ewc_loss": 0.07622809708118439, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003960699832532555, "grad_norm": 9.04102897644043, "learning_rate": 1e-06, "loss": 0.462, "mean_token_accuracy": 0.8670160174369812, "num_tokens": 894100150.0, "step": 23438 }, { "epoch": 2.9816817198829666, "ewc_loss": 0.07615679502487183, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039535696851089597, "grad_norm": 9.057361602783203, "learning_rate": 1e-06, "loss": 0.5514, "mean_token_accuracy": 0.8452390432357788, "num_tokens": 894134336.0, "step": 23439 }, { "epoch": 2.981808930161557, "ewc_loss": 0.07611048221588135, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003948938974644989, "grad_norm": 9.079484939575195, "learning_rate": 1e-06, "loss": 0.5113, "mean_token_accuracy": 0.8516918420791626, "num_tokens": 894168972.0, "step": 23440 }, { "epoch": 2.9819361404401477, "ewc_loss": 0.07610723376274109, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039486141758970916, "grad_norm": 9.032155990600586, "learning_rate": 1e-06, "loss": 0.4713, "mean_token_accuracy": 0.8632388114929199, "num_tokens": 894204641.0, "step": 23441 }, { "epoch": 2.982063350718738, "ewc_loss": 0.07618513703346252, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000395640468923375, "grad_norm": 9.083102226257324, "learning_rate": 1e-06, "loss": 0.4691, "mean_token_accuracy": 0.8686175346374512, "num_tokens": 894248403.0, "step": 23442 }, { "epoch": 2.9821905609973287, "ewc_loss": 0.07587873935699463, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039257650496438146, "grad_norm": 9.01213550567627, "learning_rate": 1e-06, "loss": 0.4042, "mean_token_accuracy": 0.8839045166969299, "num_tokens": 894284035.0, "step": 23443 }, { "epoch": 2.9823177712759192, "ewc_loss": 0.0762084573507309, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039587359060533345, "grad_norm": 9.04971981048584, "learning_rate": 1e-06, "loss": 0.4442, "mean_token_accuracy": 0.8712986707687378, "num_tokens": 894321804.0, "step": 23444 }, { "epoch": 2.9824449815545098, "ewc_loss": 0.0760192722082138, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003939818125218153, "grad_norm": 9.020143508911133, "learning_rate": 1e-06, "loss": 0.4367, "mean_token_accuracy": 0.8751916885375977, "num_tokens": 894355648.0, "step": 23445 }, { "epoch": 2.9825721918331, "ewc_loss": 0.0761130228638649, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039491927600465715, "grad_norm": 9.05999755859375, "learning_rate": 1e-06, "loss": 0.4578, "mean_token_accuracy": 0.8669766187667847, "num_tokens": 894392947.0, "step": 23446 }, { "epoch": 2.982699402111691, "ewc_loss": 0.07633371651172638, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003946848155464977, "grad_norm": 11.32187271118164, "learning_rate": 1e-06, "loss": 0.4978, "mean_token_accuracy": 0.8569275140762329, "num_tokens": 894433437.0, "step": 23447 }, { "epoch": 2.982826612390281, "ewc_loss": 0.07600158452987671, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039380486123263836, "grad_norm": 8.756802558898926, "learning_rate": 1e-06, "loss": 0.4494, "mean_token_accuracy": 0.8718885183334351, "num_tokens": 894481428.0, "step": 23448 }, { "epoch": 2.982953822668872, "ewc_loss": 0.07967419922351837, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004305310139898211, "grad_norm": 9.737343788146973, "learning_rate": 1e-06, "loss": 0.4687, "mean_token_accuracy": 0.8669341802597046, "num_tokens": 894518055.0, "step": 23449 }, { "epoch": 2.983081032947462, "ewc_loss": 0.07517144083976746, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003855034592561424, "grad_norm": 8.805896759033203, "learning_rate": 1e-06, "loss": 0.4904, "mean_token_accuracy": 0.8608230352401733, "num_tokens": 894562693.0, "step": 23450 }, { "epoch": 2.983208243226053, "ewc_loss": 0.07991166412830353, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00043290574103593826, "grad_norm": 9.717597007751465, "learning_rate": 1e-06, "loss": 0.4748, "mean_token_accuracy": 0.8649218082427979, "num_tokens": 894604973.0, "step": 23451 }, { "epoch": 2.983335453504643, "ewc_loss": 0.07596422731876373, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003934313717763871, "grad_norm": 8.987732887268066, "learning_rate": 1e-06, "loss": 0.443, "mean_token_accuracy": 0.8732354640960693, "num_tokens": 894645672.0, "step": 23452 }, { "epoch": 2.9834626637832335, "ewc_loss": 0.07881854474544525, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004219744587317109, "grad_norm": 9.530900955200195, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8633574843406677, "num_tokens": 894686702.0, "step": 23453 }, { "epoch": 2.983589874061824, "ewc_loss": 0.07633578777313232, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039714694139547646, "grad_norm": 9.091484069824219, "learning_rate": 1e-06, "loss": 0.4408, "mean_token_accuracy": 0.8744747638702393, "num_tokens": 894722768.0, "step": 23454 }, { "epoch": 2.9837170843404146, "ewc_loss": 0.0777486264705658, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004112753376830369, "grad_norm": 9.443170547485352, "learning_rate": 1e-06, "loss": 0.432, "mean_token_accuracy": 0.8784055709838867, "num_tokens": 894759216.0, "step": 23455 }, { "epoch": 2.983844294619005, "ewc_loss": 0.07632946968078613, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039708372787572443, "grad_norm": 9.133262634277344, "learning_rate": 1e-06, "loss": 0.4454, "mean_token_accuracy": 0.8705122470855713, "num_tokens": 894799691.0, "step": 23456 }, { "epoch": 2.9839715048975957, "ewc_loss": 0.0772159993648529, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000405949103878811, "grad_norm": 9.312190055847168, "learning_rate": 1e-06, "loss": 0.484, "mean_token_accuracy": 0.8624422550201416, "num_tokens": 894835869.0, "step": 23457 }, { "epoch": 2.984098715176186, "ewc_loss": 0.07620047777891159, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000395793846109882, "grad_norm": 9.190572738647461, "learning_rate": 1e-06, "loss": 0.5275, "mean_token_accuracy": 0.8456005454063416, "num_tokens": 894875706.0, "step": 23458 }, { "epoch": 2.9842259254547767, "ewc_loss": 0.0765882283449173, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003996713785454631, "grad_norm": 9.201186180114746, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8723490238189697, "num_tokens": 894920135.0, "step": 23459 }, { "epoch": 2.9843531357333672, "ewc_loss": 0.07628694176673889, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039665846270509064, "grad_norm": 9.169203758239746, "learning_rate": 1e-06, "loss": 0.4555, "mean_token_accuracy": 0.8693944811820984, "num_tokens": 894960825.0, "step": 23460 }, { "epoch": 2.9844803460119578, "ewc_loss": 0.07597146183252335, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003935036947950721, "grad_norm": 9.073919296264648, "learning_rate": 1e-06, "loss": 0.4642, "mean_token_accuracy": 0.8680765628814697, "num_tokens": 895001935.0, "step": 23461 }, { "epoch": 2.9846075562905483, "ewc_loss": 0.07636956870555878, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003974847786594182, "grad_norm": 9.197300910949707, "learning_rate": 1e-06, "loss": 0.4264, "mean_token_accuracy": 0.8754802942276001, "num_tokens": 895033559.0, "step": 23462 }, { "epoch": 2.984734766569139, "ewc_loss": 0.07597847282886505, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039357374771498144, "grad_norm": 9.102225303649902, "learning_rate": 1e-06, "loss": 0.4685, "mean_token_accuracy": 0.8662364482879639, "num_tokens": 895070958.0, "step": 23463 }, { "epoch": 2.9848619768477294, "ewc_loss": 0.07619555294513702, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003957445442210883, "grad_norm": 9.130640029907227, "learning_rate": 1e-06, "loss": 0.4693, "mean_token_accuracy": 0.865161657333374, "num_tokens": 895111533.0, "step": 23464 }, { "epoch": 2.98498918712632, "ewc_loss": 0.07598079741001129, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003935970307793468, "grad_norm": 9.07664966583252, "learning_rate": 1e-06, "loss": 0.4728, "mean_token_accuracy": 0.862703800201416, "num_tokens": 895149861.0, "step": 23465 }, { "epoch": 2.9851163974049104, "ewc_loss": 0.07627609372138977, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003965500509366393, "grad_norm": 9.0906343460083, "learning_rate": 1e-06, "loss": 0.4429, "mean_token_accuracy": 0.8738340139389038, "num_tokens": 895192958.0, "step": 23466 }, { "epoch": 2.985243607683501, "ewc_loss": 0.07605724781751633, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003943615301977843, "grad_norm": 9.09231185913086, "learning_rate": 1e-06, "loss": 0.4334, "mean_token_accuracy": 0.8774967193603516, "num_tokens": 895226710.0, "step": 23467 }, { "epoch": 2.9853708179620915, "ewc_loss": 0.07618781924247742, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003956672444473952, "grad_norm": 9.124279022216797, "learning_rate": 1e-06, "loss": 0.4553, "mean_token_accuracy": 0.8700772523880005, "num_tokens": 895267695.0, "step": 23468 }, { "epoch": 2.985498028240682, "ewc_loss": 0.07607381045818329, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003945271600969136, "grad_norm": 9.106691360473633, "learning_rate": 1e-06, "loss": 0.4205, "mean_token_accuracy": 0.8823463916778564, "num_tokens": 895303101.0, "step": 23469 }, { "epoch": 2.9856252385192725, "ewc_loss": 0.07620926201343536, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003958817105740309, "grad_norm": 9.104268074035645, "learning_rate": 1e-06, "loss": 0.4422, "mean_token_accuracy": 0.8734695315361023, "num_tokens": 895340437.0, "step": 23470 }, { "epoch": 2.9857524487978626, "ewc_loss": 0.07609594613313675, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003947485238313675, "grad_norm": 9.102060317993164, "learning_rate": 1e-06, "loss": 0.4215, "mean_token_accuracy": 0.8824801445007324, "num_tokens": 895377695.0, "step": 23471 }, { "epoch": 2.9858796590764536, "ewc_loss": 0.07610391080379486, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003948281519114971, "grad_norm": 9.125471115112305, "learning_rate": 1e-06, "loss": 0.4933, "mean_token_accuracy": 0.8581412434577942, "num_tokens": 895418599.0, "step": 23472 }, { "epoch": 2.9860068693550437, "ewc_loss": 0.07603462040424347, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039413527701981366, "grad_norm": 9.112654685974121, "learning_rate": 1e-06, "loss": 0.4628, "mean_token_accuracy": 0.8674370050430298, "num_tokens": 895454832.0, "step": 23473 }, { "epoch": 2.9861340796336346, "ewc_loss": 0.0761774480342865, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039556348929181695, "grad_norm": 9.08530330657959, "learning_rate": 1e-06, "loss": 0.5289, "mean_token_accuracy": 0.8489995002746582, "num_tokens": 895490607.0, "step": 23474 }, { "epoch": 2.9862612899122247, "ewc_loss": 0.07613985240459442, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003951875551138073, "grad_norm": 9.087136268615723, "learning_rate": 1e-06, "loss": 0.4761, "mean_token_accuracy": 0.861377477645874, "num_tokens": 895531340.0, "step": 23475 }, { "epoch": 2.9863885001908153, "ewc_loss": 0.07607167959213257, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003945058851968497, "grad_norm": 9.0953950881958, "learning_rate": 1e-06, "loss": 0.4656, "mean_token_accuracy": 0.8676109313964844, "num_tokens": 895568688.0, "step": 23476 }, { "epoch": 2.986515710469406, "ewc_loss": 0.07617509365081787, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039554000250063837, "grad_norm": 9.08204460144043, "learning_rate": 1e-06, "loss": 0.496, "mean_token_accuracy": 0.8566256165504456, "num_tokens": 895607316.0, "step": 23477 }, { "epoch": 2.9866429207479963, "ewc_loss": 0.07600098103284836, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039379886584356427, "grad_norm": 9.102067947387695, "learning_rate": 1e-06, "loss": 0.4109, "mean_token_accuracy": 0.8845423460006714, "num_tokens": 895648694.0, "step": 23478 }, { "epoch": 2.986770131026587, "ewc_loss": 0.0760788694024086, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039457777165807784, "grad_norm": 9.044921875, "learning_rate": 1e-06, "loss": 0.4804, "mean_token_accuracy": 0.8622785806655884, "num_tokens": 895687996.0, "step": 23479 }, { "epoch": 2.9868973413051774, "ewc_loss": 0.07626114785671234, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003964005154557526, "grad_norm": 9.085410118103027, "learning_rate": 1e-06, "loss": 0.4489, "mean_token_accuracy": 0.8716344237327576, "num_tokens": 895726727.0, "step": 23480 }, { "epoch": 2.987024551583768, "ewc_loss": 0.07586390525102615, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039242810453288257, "grad_norm": 9.02298641204834, "learning_rate": 1e-06, "loss": 0.4364, "mean_token_accuracy": 0.871353030204773, "num_tokens": 895760688.0, "step": 23481 }, { "epoch": 2.9871517618623584, "ewc_loss": 0.076212078332901, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003959097957704216, "grad_norm": 9.097047805786133, "learning_rate": 1e-06, "loss": 0.5243, "mean_token_accuracy": 0.8465467691421509, "num_tokens": 895797303.0, "step": 23482 }, { "epoch": 2.987278972140949, "ewc_loss": 0.07601919770240784, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003939810849260539, "grad_norm": 9.041692733764648, "learning_rate": 1e-06, "loss": 0.4444, "mean_token_accuracy": 0.8724067807197571, "num_tokens": 895834078.0, "step": 23483 }, { "epoch": 2.9874061824195395, "ewc_loss": 0.07611843198537827, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003949733800254762, "grad_norm": 9.083415985107422, "learning_rate": 1e-06, "loss": 0.5025, "mean_token_accuracy": 0.8562139868736267, "num_tokens": 895872640.0, "step": 23484 }, { "epoch": 2.98753339269813, "ewc_loss": 0.07616711407899857, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000395460199797526, "grad_norm": 9.09004020690918, "learning_rate": 1e-06, "loss": 0.4253, "mean_token_accuracy": 0.8815096616744995, "num_tokens": 895907646.0, "step": 23485 }, { "epoch": 2.9876606029767205, "ewc_loss": 0.07616390287876129, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003954281273763627, "grad_norm": 9.053474426269531, "learning_rate": 1e-06, "loss": 0.436, "mean_token_accuracy": 0.8769677877426147, "num_tokens": 895944183.0, "step": 23486 }, { "epoch": 2.987787813255311, "ewc_loss": 0.07632383704185486, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003970273828599602, "grad_norm": 9.116241455078125, "learning_rate": 1e-06, "loss": 0.4534, "mean_token_accuracy": 0.8687763214111328, "num_tokens": 895983987.0, "step": 23487 }, { "epoch": 2.9879150235339016, "ewc_loss": 0.07599526643753052, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039374176412820816, "grad_norm": 9.094474792480469, "learning_rate": 1e-06, "loss": 0.4958, "mean_token_accuracy": 0.8553555011749268, "num_tokens": 896020928.0, "step": 23488 }, { "epoch": 2.988042233812492, "ewc_loss": 0.0762978345155716, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003967673983424902, "grad_norm": 9.097543716430664, "learning_rate": 1e-06, "loss": 0.4342, "mean_token_accuracy": 0.8734787702560425, "num_tokens": 896057459.0, "step": 23489 }, { "epoch": 2.9881694440910826, "ewc_loss": 0.0761413499712944, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039520254358649254, "grad_norm": 9.094730377197266, "learning_rate": 1e-06, "loss": 0.4725, "mean_token_accuracy": 0.863300621509552, "num_tokens": 896095049.0, "step": 23490 }, { "epoch": 2.988296654369673, "ewc_loss": 0.07654672861099243, "ewc_loss_diag": 3.695487976074219e-05, "ewc_loss_parallel": 0.0003968148957937956, "grad_norm": 9.192841529846191, "learning_rate": 1e-06, "loss": 0.4317, "mean_token_accuracy": 0.8775272369384766, "num_tokens": 896127433.0, "step": 23491 }, { "epoch": 2.9884238646482637, "ewc_loss": 0.07601892948150635, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039397834916599095, "grad_norm": 9.080879211425781, "learning_rate": 1e-06, "loss": 0.4472, "mean_token_accuracy": 0.87281334400177, "num_tokens": 896165662.0, "step": 23492 }, { "epoch": 2.9885510749268542, "ewc_loss": 0.07625582069158554, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003963472554460168, "grad_norm": 9.170937538146973, "learning_rate": 1e-06, "loss": 0.4711, "mean_token_accuracy": 0.8636864423751831, "num_tokens": 896207753.0, "step": 23493 }, { "epoch": 2.9886782852054448, "ewc_loss": 0.07667281478643417, "ewc_loss_diag": 3.743171691894531e-05, "ewc_loss_parallel": 0.000393192982301116, "grad_norm": 36.39855194091797, "learning_rate": 1e-06, "loss": 0.4666, "mean_token_accuracy": 0.863925576210022, "num_tokens": 896249105.0, "step": 23494 }, { "epoch": 2.9888054954840353, "ewc_loss": 0.11535728722810745, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0007873619324527681, "grad_norm": 13.531754493713379, "learning_rate": 1e-06, "loss": 0.5004, "mean_token_accuracy": 0.8670017719268799, "num_tokens": 896291728.0, "step": 23495 }, { "epoch": 2.9889327057626254, "ewc_loss": 0.07062588632106781, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003400478744879365, "grad_norm": 7.297457695007324, "learning_rate": 1e-06, "loss": 0.489, "mean_token_accuracy": 0.8567154407501221, "num_tokens": 896331729.0, "step": 23496 }, { "epoch": 2.9890599160412163, "ewc_loss": 0.10093248635530472, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0006431139190681279, "grad_norm": 12.872779846191406, "learning_rate": 1e-06, "loss": 0.5185, "mean_token_accuracy": 0.8575254678726196, "num_tokens": 896371804.0, "step": 23497 }, { "epoch": 2.9891871263198064, "ewc_loss": 0.09947052597999573, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0006284943083301187, "grad_norm": 11.561334609985352, "learning_rate": 1e-06, "loss": 0.4873, "mean_token_accuracy": 0.8635683059692383, "num_tokens": 896408398.0, "step": 23498 }, { "epoch": 2.9893143365983974, "ewc_loss": 0.08457574993371964, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00047954655019566417, "grad_norm": 9.833449363708496, "learning_rate": 1e-06, "loss": 0.4362, "mean_token_accuracy": 0.881148099899292, "num_tokens": 896440496.0, "step": 23499 }, { "epoch": 2.9894415468769875, "ewc_loss": 0.08682398498058319, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0005020289099775255, "grad_norm": 10.847395896911621, "learning_rate": 1e-06, "loss": 0.4755, "mean_token_accuracy": 0.8688125610351562, "num_tokens": 896475212.0, "step": 23500 }, { "epoch": 2.989568757155578, "ewc_loss": 0.0856848880648613, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004906379617750645, "grad_norm": 9.880489349365234, "learning_rate": 1e-06, "loss": 0.4825, "mean_token_accuracy": 0.863051176071167, "num_tokens": 896517122.0, "step": 23501 }, { "epoch": 2.9896959674341685, "ewc_loss": 0.0826699286699295, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004604883724823594, "grad_norm": 10.12276840209961, "learning_rate": 1e-06, "loss": 0.4481, "mean_token_accuracy": 0.8730994462966919, "num_tokens": 896557259.0, "step": 23502 }, { "epoch": 2.989823177712759, "ewc_loss": 0.08224383741617203, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00045622745528817177, "grad_norm": 9.837104797363281, "learning_rate": 1e-06, "loss": 0.4973, "mean_token_accuracy": 0.8578135967254639, "num_tokens": 896600237.0, "step": 23503 }, { "epoch": 2.9899503879913496, "ewc_loss": 0.08049052953720093, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004386944056022912, "grad_norm": 9.53690242767334, "learning_rate": 1e-06, "loss": 0.429, "mean_token_accuracy": 0.8784632682800293, "num_tokens": 896640564.0, "step": 23504 }, { "epoch": 2.99007759826994, "ewc_loss": 0.08047928661108017, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00043858191929757595, "grad_norm": 9.750272750854492, "learning_rate": 1e-06, "loss": 0.4245, "mean_token_accuracy": 0.8783490061759949, "num_tokens": 896674164.0, "step": 23505 }, { "epoch": 2.9902048085485307, "ewc_loss": 0.07929332554340363, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00042672231211327016, "grad_norm": 9.46534252166748, "learning_rate": 1e-06, "loss": 0.4934, "mean_token_accuracy": 0.8612173795700073, "num_tokens": 896716613.0, "step": 23506 }, { "epoch": 2.990332018827121, "ewc_loss": 0.07918325066566467, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004256215179339051, "grad_norm": 9.557673454284668, "learning_rate": 1e-06, "loss": 0.4981, "mean_token_accuracy": 0.857233464717865, "num_tokens": 896755586.0, "step": 23507 }, { "epoch": 2.9904592291057117, "ewc_loss": 0.07813054323196411, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00041509445873089135, "grad_norm": 9.280252456665039, "learning_rate": 1e-06, "loss": 0.4677, "mean_token_accuracy": 0.8645827174186707, "num_tokens": 896797641.0, "step": 23508 }, { "epoch": 2.9905864393843022, "ewc_loss": 0.07846017181873322, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004183908167760819, "grad_norm": 9.554010391235352, "learning_rate": 1e-06, "loss": 0.4765, "mean_token_accuracy": 0.8644424676895142, "num_tokens": 896835113.0, "step": 23509 }, { "epoch": 2.9907136496628928, "ewc_loss": 0.07736821472644806, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004074712051078677, "grad_norm": 9.221192359924316, "learning_rate": 1e-06, "loss": 0.4567, "mean_token_accuracy": 0.866690456867218, "num_tokens": 896866496.0, "step": 23510 }, { "epoch": 2.9908408599414833, "ewc_loss": 0.07807276397943497, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004145167185924947, "grad_norm": 9.470507621765137, "learning_rate": 1e-06, "loss": 0.4772, "mean_token_accuracy": 0.864342987537384, "num_tokens": 896907535.0, "step": 23511 }, { "epoch": 2.990968070220074, "ewc_loss": 0.0768207535147667, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040199659997597337, "grad_norm": 9.153875350952148, "learning_rate": 1e-06, "loss": 0.4071, "mean_token_accuracy": 0.8839576840400696, "num_tokens": 896948574.0, "step": 23512 }, { "epoch": 2.9910952804986644, "ewc_loss": 0.07767005264759064, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00041048956336453557, "grad_norm": 9.473891258239746, "learning_rate": 1e-06, "loss": 0.486, "mean_token_accuracy": 0.8611868619918823, "num_tokens": 896992377.0, "step": 23513 }, { "epoch": 2.991222490777255, "ewc_loss": 0.07640983164310455, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039788734284229577, "grad_norm": 9.116379737854004, "learning_rate": 1e-06, "loss": 0.4783, "mean_token_accuracy": 0.8620550632476807, "num_tokens": 897035411.0, "step": 23514 }, { "epoch": 2.9913497010558454, "ewc_loss": 0.07762016355991364, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040999072371050715, "grad_norm": 9.441506385803223, "learning_rate": 1e-06, "loss": 0.4674, "mean_token_accuracy": 0.8700536489486694, "num_tokens": 897074661.0, "step": 23515 }, { "epoch": 2.991476911334436, "ewc_loss": 0.07616545259952545, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003954435815103352, "grad_norm": 9.057369232177734, "learning_rate": 1e-06, "loss": 0.4644, "mean_token_accuracy": 0.8695311546325684, "num_tokens": 897115550.0, "step": 23516 }, { "epoch": 2.9916041216130265, "ewc_loss": 0.0775938332080841, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040972739225253463, "grad_norm": 9.395414352416992, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8603648543357849, "num_tokens": 897155091.0, "step": 23517 }, { "epoch": 2.991731331891617, "ewc_loss": 0.0761260837316513, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003950499230995774, "grad_norm": 9.067252159118652, "learning_rate": 1e-06, "loss": 0.475, "mean_token_accuracy": 0.8624969720840454, "num_tokens": 897196551.0, "step": 23518 }, { "epoch": 2.991858542170207, "ewc_loss": 0.07723163068294525, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004061053623445332, "grad_norm": 9.256487846374512, "learning_rate": 1e-06, "loss": 0.4256, "mean_token_accuracy": 0.8767126798629761, "num_tokens": 897231885.0, "step": 23519 }, { "epoch": 2.991985752448798, "ewc_loss": 0.0764489397406578, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003982784692198038, "grad_norm": 9.18596076965332, "learning_rate": 1e-06, "loss": 0.4658, "mean_token_accuracy": 0.8666789531707764, "num_tokens": 897267944.0, "step": 23520 }, { "epoch": 2.992112962727388, "ewc_loss": 0.0767485648393631, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000401274737669155, "grad_norm": 9.209748268127441, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8702380061149597, "num_tokens": 897307330.0, "step": 23521 }, { "epoch": 2.992240173005979, "ewc_loss": 0.07645987719297409, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003983878414146602, "grad_norm": 9.124394416809082, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8677061796188354, "num_tokens": 897344028.0, "step": 23522 }, { "epoch": 2.992367383284569, "ewc_loss": 0.07658042758703232, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003995933511760086, "grad_norm": 9.25598430633545, "learning_rate": 1e-06, "loss": 0.4885, "mean_token_accuracy": 0.8581406474113464, "num_tokens": 897391213.0, "step": 23523 }, { "epoch": 2.99249459356316, "ewc_loss": 0.07627878338098526, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003965768846683204, "grad_norm": 9.082390785217285, "learning_rate": 1e-06, "loss": 0.4859, "mean_token_accuracy": 0.8606597185134888, "num_tokens": 897428222.0, "step": 23524 }, { "epoch": 2.9926218038417502, "ewc_loss": 0.07677516341209412, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004015407175756991, "grad_norm": 9.253042221069336, "learning_rate": 1e-06, "loss": 0.5132, "mean_token_accuracy": 0.8550466895103455, "num_tokens": 897457737.0, "step": 23525 }, { "epoch": 2.992749014120341, "ewc_loss": 0.07622361183166504, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003960251924581826, "grad_norm": 9.070472717285156, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8754823207855225, "num_tokens": 897498925.0, "step": 23526 }, { "epoch": 2.9928762243989313, "ewc_loss": 0.07686363160610199, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004024253867100924, "grad_norm": 9.279427528381348, "learning_rate": 1e-06, "loss": 0.4812, "mean_token_accuracy": 0.8615201115608215, "num_tokens": 897533157.0, "step": 23527 }, { "epoch": 2.993003434677522, "ewc_loss": 0.07602182030677795, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003940072492696345, "grad_norm": 9.019037246704102, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8692418336868286, "num_tokens": 897574159.0, "step": 23528 }, { "epoch": 2.9931306449561124, "ewc_loss": 0.07704762369394302, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004042653017677367, "grad_norm": 9.358965873718262, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8574857711791992, "num_tokens": 897606970.0, "step": 23529 }, { "epoch": 2.993257855234703, "ewc_loss": 0.07590177655220032, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039280677447095513, "grad_norm": 9.000297546386719, "learning_rate": 1e-06, "loss": 0.5413, "mean_token_accuracy": 0.8420686721801758, "num_tokens": 897647725.0, "step": 23530 }, { "epoch": 2.9933850655132934, "ewc_loss": 0.0771443247795105, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040523233474232256, "grad_norm": 9.316962242126465, "learning_rate": 1e-06, "loss": 0.4546, "mean_token_accuracy": 0.8692522644996643, "num_tokens": 897687109.0, "step": 23531 }, { "epoch": 2.993512275791884, "ewc_loss": 0.07587644457817078, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039255351293832064, "grad_norm": 9.020242691040039, "learning_rate": 1e-06, "loss": 0.4363, "mean_token_accuracy": 0.8784635663032532, "num_tokens": 897720994.0, "step": 23532 }, { "epoch": 2.9936394860704745, "ewc_loss": 0.07716673612594604, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004054563760291785, "grad_norm": 9.295504570007324, "learning_rate": 1e-06, "loss": 0.4141, "mean_token_accuracy": 0.882219135761261, "num_tokens": 897749855.0, "step": 23533 }, { "epoch": 2.993766696349065, "ewc_loss": 0.07597610354423523, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000393550144508481, "grad_norm": 9.064497947692871, "learning_rate": 1e-06, "loss": 0.458, "mean_token_accuracy": 0.8633745908737183, "num_tokens": 897792846.0, "step": 23534 }, { "epoch": 2.9938939066276555, "ewc_loss": 0.07694031298160553, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040319221443496644, "grad_norm": 9.329448699951172, "learning_rate": 1e-06, "loss": 0.4558, "mean_token_accuracy": 0.8690376281738281, "num_tokens": 897836765.0, "step": 23535 }, { "epoch": 2.994021116906246, "ewc_loss": 0.07600413262844086, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039383044349960983, "grad_norm": 9.111618041992188, "learning_rate": 1e-06, "loss": 0.4597, "mean_token_accuracy": 0.8676791787147522, "num_tokens": 897880918.0, "step": 23536 }, { "epoch": 2.9941483271848366, "ewc_loss": 0.07660280168056488, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003998171305283904, "grad_norm": 9.293294906616211, "learning_rate": 1e-06, "loss": 0.4667, "mean_token_accuracy": 0.8629579544067383, "num_tokens": 897916676.0, "step": 23537 }, { "epoch": 2.994275537463427, "ewc_loss": 0.07611748576164246, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003949639212805778, "grad_norm": 9.23457145690918, "learning_rate": 1e-06, "loss": 0.4439, "mean_token_accuracy": 0.8722139000892639, "num_tokens": 897953968.0, "step": 23538 }, { "epoch": 2.9944027477420176, "ewc_loss": 0.07619565725326538, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003957456210628152, "grad_norm": 9.141474723815918, "learning_rate": 1e-06, "loss": 0.48, "mean_token_accuracy": 0.8671978116035461, "num_tokens": 897993055.0, "step": 23539 }, { "epoch": 2.994529958020608, "ewc_loss": 0.07631136476993561, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039690276025794446, "grad_norm": 9.126330375671387, "learning_rate": 1e-06, "loss": 0.4319, "mean_token_accuracy": 0.8750030994415283, "num_tokens": 898030202.0, "step": 23540 }, { "epoch": 2.9946571682991987, "ewc_loss": 0.07629869878292084, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039677604218013585, "grad_norm": 9.26009750366211, "learning_rate": 1e-06, "loss": 0.4694, "mean_token_accuracy": 0.8617500066757202, "num_tokens": 898067234.0, "step": 23541 }, { "epoch": 2.9947843785777892, "ewc_loss": 0.07597330212593079, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003935221175197512, "grad_norm": 9.045796394348145, "learning_rate": 1e-06, "loss": 0.4447, "mean_token_accuracy": 0.8703393340110779, "num_tokens": 898105423.0, "step": 23542 }, { "epoch": 2.9949115888563798, "ewc_loss": 0.07655386626720428, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003993276914115995, "grad_norm": 9.21138858795166, "learning_rate": 1e-06, "loss": 0.3965, "mean_token_accuracy": 0.88664311170578, "num_tokens": 898144550.0, "step": 23543 }, { "epoch": 2.99503879913497, "ewc_loss": 0.07581610232591629, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003919501032214612, "grad_norm": 9.201281547546387, "learning_rate": 1e-06, "loss": 0.4568, "mean_token_accuracy": 0.8699249625205994, "num_tokens": 898179277.0, "step": 23544 }, { "epoch": 2.995166009413561, "ewc_loss": 0.07623083889484406, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039609745726920664, "grad_norm": 9.104483604431152, "learning_rate": 1e-06, "loss": 0.457, "mean_token_accuracy": 0.8657022714614868, "num_tokens": 898218518.0, "step": 23545 }, { "epoch": 2.995293219692151, "ewc_loss": 0.07616035640239716, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003953926498070359, "grad_norm": 9.158697128295898, "learning_rate": 1e-06, "loss": 0.4965, "mean_token_accuracy": 0.8556683659553528, "num_tokens": 898259266.0, "step": 23546 }, { "epoch": 2.995420429970742, "ewc_loss": 0.07603923976421356, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003941814647987485, "grad_norm": 9.14164924621582, "learning_rate": 1e-06, "loss": 0.4947, "mean_token_accuracy": 0.8647539615631104, "num_tokens": 898292648.0, "step": 23547 }, { "epoch": 2.995547640249332, "ewc_loss": 0.07625710964202881, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003963601484429091, "grad_norm": 9.070230484008789, "learning_rate": 1e-06, "loss": 0.4604, "mean_token_accuracy": 0.8695064783096313, "num_tokens": 898328413.0, "step": 23548 }, { "epoch": 2.995674850527923, "ewc_loss": 0.07628759741783142, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039666498196311295, "grad_norm": 9.097071647644043, "learning_rate": 1e-06, "loss": 0.4621, "mean_token_accuracy": 0.8687090873718262, "num_tokens": 898376963.0, "step": 23549 }, { "epoch": 2.995802060806513, "ewc_loss": 0.0762723982334137, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003965130599681288, "grad_norm": 9.117927551269531, "learning_rate": 1e-06, "loss": 0.4239, "mean_token_accuracy": 0.8782703280448914, "num_tokens": 898407256.0, "step": 23550 }, { "epoch": 2.9959292710851035, "ewc_loss": 0.07615827023983002, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.000395371753256768, "grad_norm": 9.123322486877441, "learning_rate": 1e-06, "loss": 0.4636, "mean_token_accuracy": 0.8695245385169983, "num_tokens": 898448800.0, "step": 23551 }, { "epoch": 2.996056481363694, "ewc_loss": 0.07638904452323914, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039767951238900423, "grad_norm": 9.124290466308594, "learning_rate": 1e-06, "loss": 0.4789, "mean_token_accuracy": 0.8628116846084595, "num_tokens": 898488912.0, "step": 23552 }, { "epoch": 2.9961836916422846, "ewc_loss": 0.07607091963291168, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039449825999327004, "grad_norm": 9.045809745788574, "learning_rate": 1e-06, "loss": 0.4832, "mean_token_accuracy": 0.8612271547317505, "num_tokens": 898530434.0, "step": 23553 }, { "epoch": 2.996310901920875, "ewc_loss": 0.07650403678417206, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003988293756265193, "grad_norm": 9.274649620056152, "learning_rate": 1e-06, "loss": 0.4593, "mean_token_accuracy": 0.8677685856819153, "num_tokens": 898565044.0, "step": 23554 }, { "epoch": 2.9964381121994657, "ewc_loss": 0.0758807361125946, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003925963828805834, "grad_norm": 9.071331977844238, "learning_rate": 1e-06, "loss": 0.4354, "mean_token_accuracy": 0.8771873116493225, "num_tokens": 898601798.0, "step": 23555 }, { "epoch": 2.996565322478056, "ewc_loss": 0.07651770859956741, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039896616362966597, "grad_norm": 9.122841835021973, "learning_rate": 1e-06, "loss": 0.4909, "mean_token_accuracy": 0.8569480180740356, "num_tokens": 898644789.0, "step": 23556 }, { "epoch": 2.9966925327566467, "ewc_loss": 0.07599586248397827, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039374770130962133, "grad_norm": 9.119650840759277, "learning_rate": 1e-06, "loss": 0.4652, "mean_token_accuracy": 0.8664231300354004, "num_tokens": 898683928.0, "step": 23557 }, { "epoch": 2.9968197430352372, "ewc_loss": 0.076396644115448, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003977554733864963, "grad_norm": 9.083917617797852, "learning_rate": 1e-06, "loss": 0.4766, "mean_token_accuracy": 0.8621479272842407, "num_tokens": 898726217.0, "step": 23558 }, { "epoch": 2.9969469533138278, "ewc_loss": 0.07636460661888123, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003974351566284895, "grad_norm": 9.205435752868652, "learning_rate": 1e-06, "loss": 0.4299, "mean_token_accuracy": 0.8763796091079712, "num_tokens": 898760575.0, "step": 23559 }, { "epoch": 2.9970741635924183, "ewc_loss": 0.07617194950580597, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039550854125991464, "grad_norm": 9.053702354431152, "learning_rate": 1e-06, "loss": 0.4692, "mean_token_accuracy": 0.863824188709259, "num_tokens": 898801232.0, "step": 23560 }, { "epoch": 2.997201373871009, "ewc_loss": 0.07671275734901428, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004009166150353849, "grad_norm": 9.13421630859375, "learning_rate": 1e-06, "loss": 0.4368, "mean_token_accuracy": 0.873040497303009, "num_tokens": 898837174.0, "step": 23561 }, { "epoch": 2.9973285841495994, "ewc_loss": 0.0762372612953186, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039616168942302465, "grad_norm": 9.117040634155273, "learning_rate": 1e-06, "loss": 0.5017, "mean_token_accuracy": 0.856709361076355, "num_tokens": 898875849.0, "step": 23562 }, { "epoch": 2.99745579442819, "ewc_loss": 0.07665373384952545, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00040032644756138325, "grad_norm": 9.12391471862793, "learning_rate": 1e-06, "loss": 0.4385, "mean_token_accuracy": 0.8764945268630981, "num_tokens": 898909975.0, "step": 23563 }, { "epoch": 2.9975830047067804, "ewc_loss": 0.07627736032009125, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003965626528952271, "grad_norm": 9.042024612426758, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.858979344367981, "num_tokens": 898948250.0, "step": 23564 }, { "epoch": 2.997710214985371, "ewc_loss": 0.07658030092716217, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039959204150363803, "grad_norm": 9.10599136352539, "learning_rate": 1e-06, "loss": 0.4244, "mean_token_accuracy": 0.8835641741752625, "num_tokens": 898989122.0, "step": 23565 }, { "epoch": 2.9978374252639615, "ewc_loss": 0.07640478014945984, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003978368768002838, "grad_norm": 9.088171005249023, "learning_rate": 1e-06, "loss": 0.4757, "mean_token_accuracy": 0.8626098036766052, "num_tokens": 899028715.0, "step": 23566 }, { "epoch": 2.997964635542552, "ewc_loss": 0.07673577964305878, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004011468554381281, "grad_norm": 9.167327880859375, "learning_rate": 1e-06, "loss": 0.4488, "mean_token_accuracy": 0.8701948523521423, "num_tokens": 899064101.0, "step": 23567 }, { "epoch": 2.9980918458211425, "ewc_loss": 0.07649457454681396, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003987347590737045, "grad_norm": 9.084360122680664, "learning_rate": 1e-06, "loss": 0.4579, "mean_token_accuracy": 0.8710727691650391, "num_tokens": 899101759.0, "step": 23568 }, { "epoch": 2.9982190560997326, "ewc_loss": 0.0765693262219429, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039948232006281614, "grad_norm": 9.163607597351074, "learning_rate": 1e-06, "loss": 0.4842, "mean_token_accuracy": 0.8594176769256592, "num_tokens": 899142273.0, "step": 23569 }, { "epoch": 2.9983462663783236, "ewc_loss": 0.07630680501461029, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039685709634795785, "grad_norm": 9.091142654418945, "learning_rate": 1e-06, "loss": 0.4573, "mean_token_accuracy": 0.8678615093231201, "num_tokens": 899175878.0, "step": 23570 }, { "epoch": 2.9984734766569137, "ewc_loss": 0.07661254703998566, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003999144828412682, "grad_norm": 9.131534576416016, "learning_rate": 1e-06, "loss": 0.4188, "mean_token_accuracy": 0.878238320350647, "num_tokens": 899208937.0, "step": 23571 }, { "epoch": 2.9986006869355046, "ewc_loss": 0.07632046937942505, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039699379703961313, "grad_norm": 9.093135833740234, "learning_rate": 1e-06, "loss": 0.4641, "mean_token_accuracy": 0.867257297039032, "num_tokens": 899244088.0, "step": 23572 }, { "epoch": 2.9987278972140947, "ewc_loss": 0.07664601504802704, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004002491768915206, "grad_norm": 9.139715194702148, "learning_rate": 1e-06, "loss": 0.4869, "mean_token_accuracy": 0.863070011138916, "num_tokens": 899280111.0, "step": 23573 }, { "epoch": 2.9988551074926852, "ewc_loss": 0.0762801393866539, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039659044705331326, "grad_norm": 9.150917053222656, "learning_rate": 1e-06, "loss": 0.4675, "mean_token_accuracy": 0.8671830892562866, "num_tokens": 899310882.0, "step": 23574 }, { "epoch": 2.9989823177712758, "ewc_loss": 0.07637722790241241, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039756137994118035, "grad_norm": 9.108192443847656, "learning_rate": 1e-06, "loss": 0.5186, "mean_token_accuracy": 0.8539177179336548, "num_tokens": 899346820.0, "step": 23575 }, { "epoch": 2.9991095280498663, "ewc_loss": 0.07646133005619049, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039840233512222767, "grad_norm": 9.103349685668945, "learning_rate": 1e-06, "loss": 0.4182, "mean_token_accuracy": 0.88106369972229, "num_tokens": 899382044.0, "step": 23576 }, { "epoch": 2.999236738328457, "ewc_loss": 0.07633519917726517, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003971410624217242, "grad_norm": 9.0805082321167, "learning_rate": 1e-06, "loss": 0.4515, "mean_token_accuracy": 0.8673604130744934, "num_tokens": 899418389.0, "step": 23577 }, { "epoch": 2.9993639486070474, "ewc_loss": 0.07655169069766998, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003993059217464179, "grad_norm": 9.188098907470703, "learning_rate": 1e-06, "loss": 0.4782, "mean_token_accuracy": 0.8640859127044678, "num_tokens": 899458486.0, "step": 23578 }, { "epoch": 2.999491158885638, "ewc_loss": 0.07620492577552795, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039583834586665034, "grad_norm": 9.021611213684082, "learning_rate": 1e-06, "loss": 0.4714, "mean_token_accuracy": 0.865607738494873, "num_tokens": 899498168.0, "step": 23579 }, { "epoch": 2.9996183691642284, "ewc_loss": 0.07664531469345093, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004002422501798719, "grad_norm": 9.17064380645752, "learning_rate": 1e-06, "loss": 0.4133, "mean_token_accuracy": 0.8825998902320862, "num_tokens": 899538655.0, "step": 23580 }, { "epoch": 2.999745579442819, "ewc_loss": 0.07612679153680801, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.00039505696622654796, "grad_norm": 9.02966594696045, "learning_rate": 1e-06, "loss": 0.4982, "mean_token_accuracy": 0.8551254868507385, "num_tokens": 899581693.0, "step": 23581 }, { "epoch": 2.9998727897214095, "ewc_loss": 0.07663042098283768, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0004000932676717639, "grad_norm": 9.1331148147583, "learning_rate": 1e-06, "loss": 0.4762, "mean_token_accuracy": 0.8639349937438965, "num_tokens": 899623817.0, "step": 23582 }, { "epoch": 3.0, "ewc_loss": 0.07620732486248016, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003958623274229467, "grad_norm": 9.060023307800293, "learning_rate": 1e-06, "loss": 0.456, "mean_token_accuracy": 0.8705911636352539, "num_tokens": 899664226.0, "step": 23583 }, { "epoch": 3.0, "ewc_loss": 0.07620732486248016, "ewc_loss_diag": 3.6716461181640625e-05, "ewc_loss_parallel": 0.0003958623274229467, "step": 23583, "total_flos": 5.62815163329864e+19, "train_loss": 0.46694955879437966, "train_runtime": 50208.5497, "train_samples_per_second": 7.515, "train_steps_per_second": 0.47 } ], "logging_steps": 1, "max_steps": 23583, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 11792, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.62815163329864e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }